import logging
import os
import pathlib
import re
from shutil import copyfile

from bs4 import BeautifulSoup, NavigableString

from livecarta_config import LawCartaConfig, BookLogger, BookApiWrapper


class HTMLPreprocessor:
    """Normalizes HTML produced by a docx->html conversion (LibreOffice) so it
    satisfies LawCarta formatting: cleans junk tags, converts <font> to <span>,
    processes paragraphs, tables, block quotes, links, footnotes, images,
    headings and lists, and extracts top-level chapter information.
    """

    def __init__(self, html_soup, logger_object, book_api_wrapper=None):
        self.body_tag = html_soup.body
        self.html_soup = html_soup
        self.logger_object: BookLogger = logger_object
        self.book_api_wrapper: BookApiWrapper = book_api_wrapper
        self.top_level_headers = None
        self.content = list()
        # Safe defaults so the attributes exist even if process_html() was not
        # run (previously they were only created inside the processing steps).
        self.footnotes = []
        self.images = []
        self.tables_amount = 0

    def _clean_tag(self, tag, attr_name, attr_value):
        """Unwrap tags matched by name and a single attribute value.

        Only tags whose *sole* attribute is the matched one are unwrapped, so
        tags carrying extra information are left intact.

        :param tag: Tag name to clean.
        :param attr_name: Attribute name.
        :param attr_value: Attribute value (string or compiled pattern).
        """
        matched = self.body_tag.find_all(tag, {attr_name: attr_value})
        for found in matched:
            if len(found.attrs) == 1:
                found.unwrap()

    def _clean_underline_links(self):
        """Remove meaningless <u> wrappers around / inside links."""
        # <u><a>...</a></u>  ->  <a>...</a>
        underlines = self.body_tag.find_all("u")
        for u in underlines:
            if u.find_all('a'):
                u.unwrap()
        # <a><u>...</u></a>  ->  <a>...</a> (only when there is exactly one <u>)
        links = self.body_tag.find_all('a')
        for link in links:
            u = link.find_all('u')
            if u and len(u) == 1:
                u[0].unwrap()

    @classmethod
    def convert_pt_to_px(cls, value):
        """Map the Word default font size (pt) to the LawCarta default (px).

        Any other value is returned unchanged (as a float) — only the default
        size is translated.
        """
        value = float(value)
        if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
            return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
        else:
            return value

    @classmethod
    def convert_font_pt_to_px(cls, style):
        """Convert a ``font-size: Npt`` declaration in a style string to px.

        :param style: Str with style to process.
        :return: Converted style string; an empty string when the size equals
            the LawCarta default (the declaration then carries no information).
        """
        size = re.search(r"font-size: (\d{1,3})pt", style)
        if size is None:
            return style
        size = size.group(1)
        new_size = cls.convert_pt_to_px(size)
        if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ""
        return re.sub(size + "pt", str(new_size) + "px", style)

    def _font_to_span(self):
        """Convert every <font> tag to <span>.

        If the resulting span would carry no attributes (default style, default
        face, no mapped color) the tag is unwrapped entirely.
        """
        fonts = self.body_tag.find_all("font")
        for font in fonts:
            face = font.get("face")
            style = font.get("style")
            color = font.get("color")
            font.attrs = {}
            font.name = "span"
            if style:
                style = self.convert_font_pt_to_px(style)
                if style != "":
                    if color and color in LawCartaConfig.COLORS_MAP:
                        style += f'; color: {color};'
                    font.attrs["style"] = style
                elif color and color in LawCartaConfig.COLORS_MAP:
                    font.attrs["style"] = f'color: {color};'
            if face is not None:
                # Drop fallback fonts listed after the first comma.
                face = re.sub(r",[\w,\- ]*$", "", face)
                if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
                    font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
                else:
                    font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
            if len(font.attrs) == 0:
                font.unwrap()
        # On this step there should be no more <font> tags.
        assert len(self.body_tag.find_all("font")) == 0

    def delete_content_before_toc(self):
        """Drop everything in ``self.content`` before (and including) the <TOC>
        placeholder inserted by clean_trash().

        NOTE: this only trims the ``content`` list; ``body_tag`` itself is NOT
        updated. bs4 Tag equality compares name/attrs/contents, so a freshly
        created empty <TOC> matches the empty placeholder.
        """
        toc_tag = self.html_soup.new_tag('TOC')
        if toc_tag in self.content:
            ind = self.content.index(toc_tag) + 1
            self.content = self.content[ind:]

    def clean_trash(self):
        """Remove styles and tags the output does not need."""
        self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
        self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))  # todo: check for another languages
        self._clean_tag('span', 'style', re.compile(r'^letter-spacing: -?[\d\.]+pt$'))
        self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
        self._clean_tag("a", "name", "_GoBack")
        self._clean_underline_links()
        self._font_to_span()
        # Replace the table-of-contents div with an empty <TOC> placeholder.
        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
        for table in tables:
            table.wrap(self.html_soup.new_tag("TOC"))
            table.decompose()

    def _process_paragraph(self):
        """Process <p> tags: normalize text-align and text-indent styling."""
        paragraphs = self.body_tag.find_all('p')
        for p in paragraphs:
            # LibreOffice converts some \n into a <p> with two <br> tags;
            # remove the one unnecessary <br>.
            brs = p.find_all('br')
            text = p.text
            if brs and text == '\n\n' and len(brs) == 2:
                brs[0].decompose()
            # A leading tab (possibly after a newline) means the paragraph was
            # visually indented in the source document.
            indent_should_be_added = False
            if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')):
                indent_should_be_added = True
            align = p.get('align')
            style = p.get('style')
            if style:
                indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
                margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
                margin_right = re.search(r'margin-right: ([\d\.]{1,4})in', style)
                margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
                margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style)
            else:
                indent = None
                margin_left = None
                margin_right = None
                margin_top = None
                margin_bottom = None
            # These exact margins are the signature of a block quote —
            # presumably produced by the docx converter; TODO confirm.
            if margin_left and margin_right and margin_top and margin_bottom and \
                    margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \
                    margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11':
                p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
            # Rebuild the paragraph's attributes from scratch.
            p.attrs = {}
            style = ''
            if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
                style += f'text-align: {align};'
            if indent is not None or indent_should_be_added:
                style += f'text-indent: {LawCartaConfig.INDENT};'
            if style:
                p.attrs['style'] = style

    def _process_two_columns(self):
        """Process paragraphs that use a two-column layout: mark each child
        <p> with the ``columns2`` class and unwrap the container div."""
        two_columns = self.body_tag.find_all("div", style="column-count: 2")
        for div in two_columns:
            for child in div.children:
                if child.name == "p":
                    child["class"] = "columns2"
            div.unwrap()

    def _process_tables(self):
        """Process tables: strip cell styling (keeping only ``width``) and set
        the table's ``border`` attribute to the average cell border size."""
        tables = self.body_tag.find_all("table")
        for table in tables:
            tds = table.find_all("td")
            sizes = []
            for td in tds:
                style = td.get('style')
                if style:
                    match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
                    if match:
                        size = match.group(1)
                        units = match.group(2)
                        if units == "pt":
                            size = self.convert_pt_to_px(size)
                        sizes.append(float(size))
                width = td.get('width')
                td.attrs = {}
                if width:
                    td.attrs['width'] = width
            if sizes:
                border_size = sum(sizes) / len(sizes)
                # NOTE(review): ':.2' formats to 2 *significant digits*, not 2
                # decimals — looks like '.2f' may have been intended; confirm.
                table.attrs['border'] = f'{border_size:.2}'
        self.tables_amount = len(tables)

    def _process_quotes(self):
        """Process block quotes.

        After docx->html conversion a block quote is stored as a borderless
        single-cell table of width 600 whose paragraphs contain <i> or <br>
        tags. Such tables are replaced with <blockquote> tags.
        """
        tables = self.body_tag.find_all("table")
        for table in tables:
            trs = table.find_all("tr")
            tds = table.find_all("td")
            if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600':
                td = tds[0]
                # Fix: td.get('style') may be None for an unstyled cell;
                # the original raised TypeError on the 'in' check.
                is_zero_border = 'border: none;' in (td.get('style') or '')
                paragraphs = td.find_all("p")
                has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
                has_i_tag_or_br = [x[0] is not None or x[1] is not None for x in has_i_tag_or_br]
                if all(has_i_tag_or_br) and is_zero_border:
                    new_div = BeautifulSoup(features='lxml').new_tag('blockquote')
                    for p in paragraphs:
                        new_div.append(p)
                    table.replace_with(new_div)

    def _process_hrefs(self):
        """Strip zero-width characters from link text and hrefs."""
        a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^.*http.+')})
        # remove char=end of file for some editors
        for tag in a_tags_with_href:
            tag.string = tag.text.replace('\u200c', '')
            tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
        a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')})
        for tag in a_tags_with_href:
            tag.string = tag.text.replace('\u200c', '')
            tag.string = tag.text.replace('\u200b', '')  # zero-width-space
            tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')

    @staticmethod
    def _clean_footnote_content(content):
        """Strip surrounding whitespace from extracted footnote HTML."""
        return content.strip()

    def _process_footnotes(self):
        """Collect footnotes into ``self.footnotes`` and delete them from the
        soup, replacing each in-text anchor with a <sup class="footnote-element">
        marker."""
        footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
        footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
        footnote_amt = len(footnote_anchors)
        # Fix: corrected 'Some ting' typo in the assertion message.
        assert footnote_amt == len(footnote_content), \
            'Something went wrong with footnotes after libra conversion'
        footnotes = []
        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
            true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0]
            if true_a_tag.attrs.get('href') is None:
                cont_tag.a.decompose()
                continue
            assert anc_tag['name'] == true_a_tag['href'][1:], \
                'Something went wrong with footnotes after libra conversion'
            new_tag = BeautifulSoup(features='lxml').new_tag('sup')
            new_tag['class'] = 'footnote-element'
            new_tag['data-id'] = i + 1
            new_tag['id'] = f'footnote-{i + 1}'
            new_tag.string = '*'
            anc_tag.replace_with(new_tag)
            # Extra digits in footnotes from documents downloaded from livecarta.
            a_text = true_a_tag.text
            if len(cont_tag.find_all('p')):
                sup = cont_tag.find_all('p')[0].find('sup')
                if sup and sup.text == a_text:
                    sup.decompose()
            for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
                tag_a.decompose()
            # Remove font-size from the footnote body spans.
            for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
                style = span.get('style')
                style = re.sub(r"font-size: \d+px", "", style)
                if style == '':
                    del span.attrs['style']
                else:
                    span.attrs['style'] = style
            # Serialize the footnote content: blockquotes keep their wrapper,
            # other children contribute only their inner HTML.
            unicode_string = ''
            for child in cont_tag.children:
                if type(child) is NavigableString:
                    continue
                if child.name == 'blockquote':
                    unicode_string += str(child)
                else:
                    unicode_string += child.decode_contents()
            content = self._clean_footnote_content(unicode_string)
            cont_tag.decompose()
            footnotes.append(content)
        self.footnotes = footnotes

    def _process_images(self, access, html_path, book_id):
        """Process <img> tags.

        With an ``access`` object images are sent to remote storage (Amazon S3)
        and src is rewritten to the returned link; otherwise they are copied
        into a local ``json/img_<book_id>/`` folder.

        :param access: Uploader with a ``send_image(path, book_id)`` method,
            or None for local copying.
        :param html_path: pathlib.Path of the source html (images are resolved
            relative to its parent).
        :param book_id: Book identifier used for the destination folder/bucket.
        """
        img_tags = self.body_tag.find_all('img')
        if len(img_tags):
            if access is None:
                folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
                new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
                new_path.mkdir(exist_ok=True)
            for img in img_tags:
                img_name = img.attrs.get('src')
                # Quick fix for bad relative links.
                if (len(img_name) >= 3) and img_name[:3] == '../':
                    img_name = img_name[3:]
                img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
                if access is not None:
                    link = access.send_image(img_path, book_id)
                    img.attrs['src'] = link
                    self.logger_object.log(f'{img_name} successfully uploaded.')
                else:
                    img_size = os.path.getsize(img_path)
                    self.logger_object.log(f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
                    new_img_path = new_path / img_name
                    copyfile(img_path, new_img_path)
                    img.attrs["src"] = str(new_img_path)
        self.images = img_tags

    def _process_footer(self):
        """Delete all <div title="footer"> tags (and their content)."""
        divs = self.body_tag.find_all('div', {'title': 'footer'})
        for div in divs:
            div.decompose()

    def _process_div(self):
        """Unwrap all remaining <div> tags — the divs are removed but their
        content stays in place."""
        divs = self.body_tag.find_all("div")
        for div in divs:
            div.unwrap()

    def _check_parent_link_exist_in_toc(self, tag_with_link):
        """Return True if any TOC anchor inside ``tag_with_link`` is targeted
        by a link elsewhere in the body (i.e. the TOC references it)."""
        toc_links = []
        for a_tag in tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')}):
            link_name = a_tag.attrs['name']
            toc_item = self.body_tag.find("a", {'href': '#' + link_name})
            if toc_item:
                toc_links.append(toc_item)
        return len(toc_links) > 0

    def _process_toc_links(self):
        """Extract nodes containing TOC anchors, remove the anchors, and
        promote <p> tags referenced from the TOC to headings."""
        toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
        headers = [link.parent for link in toc_links]
        outline_level = "1"  # all the unknown outlines will be predicted as <h1>
        for tag in headers:
            if re.search(r"^h\d$", tag.name):
                tag.a.unwrap()
                # outline_level = tag.name[-1]  # TODO: add prediction of the outline level
            elif tag.name == "p":
                exist_in_toc = self._check_parent_link_exist_in_toc(tag)
                if tag in self.body_tag.find_all("p") and exist_in_toc:
                    new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
                    text = tag.text
                    tag.replace_with(new_tag)
                    new_tag.string = text
            else:
                # Rethink document structure when you have toc_links — other cases?
                self.logger_object.log(f'Something went wrong in processing toc_links.'
                                       f' Check the structure of the file. '
                                       f'Tag name: {tag.name}')

    @staticmethod
    def clean_title_from_numbering(title: str):
        """Remove leading whitespace and chapter numbering (digits or letter
        enumerations like 'A. ') from a header title."""
        title = re.sub(r'^(\s+)+', '', title)
        title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
        # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title)  # delete chapter numbering from the title
        title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
        return title

    @staticmethod
    def clean_tag_from_tabs(tag: NavigableString):
        """Collapse whitespace runs in a NavigableString to single spaces,
        replacing the node in the tree."""
        cleaned = re.sub(r'(\s+)+', ' ', tag)
        this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
        tag.replace_with(this)

    def clean_tag_from_numbering(self, tag):
        """Strip chapter numbering from a NavigableString, replacing the node
        in the tree."""
        cleaned = self.clean_title_from_numbering(tag)
        this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
        tag.replace_with(this)

    def apply_func_to_last_child(self, tag, func=None):
        """Recurse down the *first* child at each level until a NavigableString
        is found and apply ``func`` to it.

        Works only with constructions where the deepest first child is a
        NavigableString. NOTE(review): despite the name, this follows
        ``children[0]``, not the last child — confirm that is intended.
        """
        if type(tag) is NavigableString:
            func(tag)
        else:
            children = list(tag.children)
            if children:
                self.apply_func_to_last_child(children[0], func)

    def _preprocessing_headings(self):
        """Demote all heading levels below the supported ones to <p> tags."""
        pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
        header_tags = self.body_tag.find_all(re.compile(pattern))
        for tag in header_tags:
            tag.name = 'p'

    def _get_top_level_headers(self):
        """Gather info about top-level chapters.

        Assume: headers with the smallest outline (digit in h<N>) are top-level
        chapters. [Consistent with the recursive algorithm for saving content
        to the resulting json structure in header_to_json().]

        :return: list of dicts with 'title', 'is_numbered', 'is_introduction'.
        """
        headers_info = []
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
        headers_outline = [int(re.sub(r"^h", "", tag.name)) for tag in header_tags]
        if headers_outline:
            top_level_outline = min(headers_outline)
            top_level_headers = [tag for tag in header_tags
                                 if int(re.sub(r"^h", "", tag.name)) == top_level_outline]
            for tag in top_level_headers:
                # Headers must not stay nested inside list markup.
                if tag.parent.name == "li":
                    tag.parent.unwrap()
                while tag.parent.name == "ol":
                    tag.parent.unwrap()
                title = tag.text
                title = re.sub(r'\s+', ' ', title).strip()
                number = re.match(r'^(?:\.?\d+\.? ?)+', title)
                is_numbered = number is not None
                cleaned_title = self.clean_title_from_numbering(tag.text)
                is_introduction = cleaned_title.lower() == 'introduction'
                headers_info.append({
                    'title': cleaned_title,
                    'is_numbered': is_numbered,
                    'is_introduction': is_introduction})
        return headers_info

    def _mark_introduction_headers(self):
        """Decide which headers should be numbered.

        A header is treated as an introduction chapter (not numbered) when:
        1. the first header is unnumbered and is followed by a numbered one
           (or by nothing), or
        2. it is the first top-level header and equals 'introduction'.

        Result: each top-level header gets a ``should_be_numbered`` flag.
        """
        is_numbered_header = [header['is_numbered'] for header in self.top_level_headers]
        is_title = [header['is_introduction'] for header in self.top_level_headers]
        first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
        second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
        first_header_is_introduction = is_title and is_title[0]
        if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
            self.top_level_headers[0]['should_be_numbered'] = False
            for i in range(1, len(self.top_level_headers)):
                self.top_level_headers[i]['should_be_numbered'] = True
        else:
            for i in range(0, len(self.top_level_headers)):
                self.top_level_headers[i]['should_be_numbered'] = True

    def _process_headings(self):
        """Process heading tags: strip inline formatting, unwrap them from
        list markup, and clean their text from numbering and whitespace."""
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
        # 1. Remove <b> and <span> wrappers and all attributes.
        for tag in header_tags:
            b_tags = tag.find_all("b")
            [b.unwrap() for b in b_tags]
            spans = tag.find_all("span")
            if spans:
                for span in spans:
                    span.unwrap()
            tag.attrs = {}
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
        # 2. Clean header text from numbering and \n.
        for tag in header_tags:
            if tag.parent.name == "li":
                tag.parent.unwrap()
            while tag.parent.name == "ol":
                tag.parent.unwrap()
            title = tag.text
            title = self.clean_title_from_numbering(title)
            if title == "":
                tag.unwrap()
            else:
                assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
                    f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
                content = list(tag.children)
                # Do not take into account rubbish empty tags like <span></span>, but don't remove them.
                content = [item for item in content
                           if (type(item) is not NavigableString and item.text != '')
                           or (type(item) is NavigableString)]
                for i, item in enumerate(content):
                    if type(content[i]) is NavigableString:
                        cleaned = re.sub(r'(\s+)+', ' ', content[i])
                        this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
                        content[i].replace_with(this)
                        content[i] = this
                    else:
                        self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs)
                content[0] = '' if content[0] == ' ' else content[0]
                content = [item for item in content if item != '']
                # Defensive: every remaining item may have been filtered out.
                if not content:
                    continue
                if type(content[0]) is NavigableString:
                    cleaned = self.clean_title_from_numbering(content[0])
                    this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
                    content[0].replace_with(this)
                    content[0] = this
                else:
                    self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering)

    def _process_lists(self):
        """Process <li> tags: merge the inner <p>'s attributes into the <li>
        and unwrap the <p>."""
        li_tags = self.body_tag.find_all("li")
        for il_tag in li_tags:
            # Fix: an <li> without a <p> child previously raised AttributeError.
            if il_tag.p is None:
                continue
            il_tag.attrs.update(il_tag.p.attrs)
            il_tag.p.unwrap()

    def process_html(self, access, html_path, book_id):
        """Process html code to satisfy LawCarta formatting.

        :param access: Image uploader or None (see _process_images).
        :param html_path: pathlib.Path of the source html file.
        :param book_id: Book identifier.
        :return: tuple (content, footnotes, top_level_headers).
        :raises Exception: re-raises any processing error after logging and
            notifying the book API wrapper.
        """
        try:
            self.logger_object.log('Processing TOC and headers.')
            self._process_toc_links()
            self.clean_trash()
            # Process main elements of the .html doc.
            self.logger_object.log('Processing main elements of html.')
            self._preprocessing_headings()
            self._process_paragraph()
            self._process_two_columns()
            self.logger_object.log('Block quotes processing.')
            self._process_quotes()
            self.logger_object.log('Tables processing.')
            self._process_tables()
            self.logger_object.log(f'{self.tables_amount} tables have been processed.')
            self.logger_object.log('Hrefs processing.')
            self._process_hrefs()
            self.logger_object.log('Footnotes processing.')
            self._process_footnotes()
            self.logger_object.log(f'{len(self.footnotes)} footnotes have been processed.')
            self.logger_object.log('Image processing.')
            self._process_images(access=access, html_path=html_path, book_id=book_id)
            self.logger_object.log(f'{len(self.images)} images have been processed.')
            self._process_footer()
            self._process_div()
            self.content = self.body_tag.find_all(recursive=False)
            self.top_level_headers = self._get_top_level_headers()
            self._mark_introduction_headers()
            self._process_headings()
            self.content = self.body_tag.find_all(recursive=False)
            self._process_lists()
            # Delete text before the table of contents if it exists.
            self.delete_content_before_toc()
        except Exception as exc:
            self.logger_object.log('Error has occurred while processing html.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_wrapper:
                self.book_api_wrapper.set_error_status()
            raise exc
        self.logger_object.log('End of processing .html file.')
        return self.content, self.footnotes, self.top_level_headers