tags. - --
- - """ - tables = self.body_tag.find_all("table") - for table in tables: - trs = table.find_all("tr") - tds = table.find_all("td") - if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600": - td = tds[0] - is_zero_border = "border: none;" in td.get("style") - paragraphs = td.find_all("p") - has_i_tag_or_br = [(p.i, p.br) for p in paragraphs] - has_i_tag_or_br = [x[0] is not None or x[1] is not None - for x in has_i_tag_or_br] - - if all(has_i_tag_or_br) and is_zero_border: - new_div = BeautifulSoup( - features="lxml").new_tag("blockquote") - for p in paragraphs: - new_div.append(p) - - table.replaceWith(new_div) - - def _process_tables(self): - """Function to process tables. Set "border" attribute.""" - tables = self.body_tag.find_all("table") - for table in tables: - tds = table.find_all("td") - sizes = [] - for td in tds: - style = td.get("style") - if style: - match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) - if match: - size = match.group(1) - units = match.group(2) - if units == "pt": - value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\ - if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\ - else float(size) - size = value - sizes.append(float(size)) - width = td.get("width") - td.attrs = {} - if width: - td.attrs["width"] = width - if sizes: - border_size = sum(sizes) / len(sizes) - table.attrs["border"] = f"{border_size:.2}" - self.tables_amount = len(tables) - - def _process_hrefs(self): - a_tags_with_href = self.body_tag.find_all( - "a", {"href": re.compile("^.*http.+")}) - - # remove char=end of file for some editors - for tag in a_tags_with_href: - tag.string = tag.text.replace("\u200c", "") - tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") - - a_tags_with_href = self.body_tag.find_all( - "a", {"href": re.compile("^(?!#sdfootnote)")}) - for tag in a_tags_with_href: - tag.string = tag.text.replace("\u200c", "") - tag.string = tag.text.replace("\u200b", "") # zero-width-space - tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") - def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]: """ Function for gathering info about top-level chapters. Assume: _ - Headers with the smallest outline(or digit in- - -- -aaaaa
--
) are top level chapters. - [ It is consistent with a recursive algorithm + [It is consistent with a recursive algorithm for saving content to a resulted json structure, which happens in header_to_json()] @@ -172,7 +111,8 @@ class HtmlDocxProcessor: "is_introduction": is_introduction}) return headers_info - def _mark_introduction_headers(self): + @staticmethod + def _mark_introduction_headers(top_level_headers: List[Dict[str, Union[str, bool]]]): """ Function to find out: what header shouldn't be numbered and can be treated as introduction chapter @@ -187,21 +127,21 @@ class HtmlDocxProcessor: """ is_numbered_header = [header["is_numbered"] - for header in self.top_level_headers] + for header in top_level_headers] is_title = [header["is_introduction"] - for header in self.top_level_headers] + for header in top_level_headers] first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 second_is_numbered_or_not_exist = all(is_numbered_header[1:2]) first_header_is_introduction = is_title and is_title[0] if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction: - self.top_level_headers[0]["should_be_numbered"] = False - for i in range(1, len(self.top_level_headers)): - self.top_level_headers[i]["should_be_numbered"] = True + top_level_headers[0]["should_be_numbered"] = False + for i in range(1, len(top_level_headers)): + top_level_headers[i]["should_be_numbered"] = True else: - for i in range(0, len(self.top_level_headers)): - self.top_level_headers[i]["should_be_numbered"] = True + for i in range(0, len(top_level_headers)): + top_level_headers[i]["should_be_numbered"] = True @staticmethod def clean_title_from_tabs(tag: NavigableString): @@ -217,10 +157,8 @@ class HtmlDocxProcessor: """ if type(tag) is NavigableString: func(tag) - else: - children = list(tag.children) - if children: - self.apply_func_to_last_child(children[0], func) + elif list(tag.children): + self.apply_func_to_last_child(list(tag.children)[0], func) def _process_headings(self): """ @@ -233,25 +171,20 @@ class HtmlDocxProcessor: processed tags """ - header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) - + header_tags = self.body_tag.find_all(re.compile("^h[1-5]$")) # clean header from attrs and text in header from numbering and \n for h_tag in header_tags: h_tag.attrs = {} + for tag in h_tag.find_all(): + tag.attrs = {} if h_tag.parent.name == "li": h_tag.parent.unwrap() while h_tag.parent.name == "ol": h_tag.parent.unwrap() cleaned_title = re.sub(r"[\s\xa0]", " ", h_tag.text) - if cleaned_title == "": - h_tag.unwrap() - else: - assert h_tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ - f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings." - + if cleaned_title != "": content = list(h_tag.children) - # do not take into account rubbish empty tags like , but don"t remove them content = [item for item in content if (type(item) is not NavigableString and item.text != "") @@ -270,11 +203,13 @@ class HtmlDocxProcessor: else: self.apply_func_to_last_child( content[i], self.clean_title_from_tabs) + else: + h_tag.unwrap() + def delete_content_before_toc(self): # remove all tag upper the only in content !!! body tag is not updated toc_tag = self.html_soup.new_tag("TOC") - self.content: List[Tag] = self.body_tag.find_all(recursive=False) if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 self.content = self.content[ind:] @@ -297,54 +232,35 @@ class HtmlDocxProcessor: modify_html_soup_with_css_styles(self.body_tag) self.logger.log("Image processing.") - self.images = process_images(access, path_to_html=html_path, - book_id=book_id, body_tag=self.body_tag) + images = process_images(access, path_to_html=html_path, + book_id=book_id, body_tag=self.body_tag) self.logger.log( - f"{len(self.images)} images have been processed.") + f"{len(images)} images have been processed.") self.logger.log("Footnotes processing.") - self.footnotes: List[str] = process_footnotes(self.body_tag) + footnotes: List[str] = process_footnotes(self.body_tag) self.logger.log( - f"{len(self.footnotes)} footnotes have been processed.") - - self.logger.log(f"Processing TOC and headers.") - self._process_toc_links() - - self.logger.log(f"Preprocess Html using presets.") - _preprocess_html(html_preprocessor=self.html_preprocessor, - html_soup=self.html_soup) - - # CSS after html processing cause of that aren't supported by html - self.logger.log("CSS inline style preprocessing.") - self.style_preprocessor.process_inline_styles_in_html_soup( - self.body_tag) - - self.logger.log("CSS inline style processing.") - modify_html_soup_with_css_styles(self.body_tag) - - # process main elements of the .html doc - self.logger.log(f"Processing main elements of html.") - - self.logger.log("Block quotes processing.") - self._process_quotes() - - self.logger.log("Tables processing.") - self._process_tables() - self.logger.log( - f"{self.tables_amount} tables have been processed.") + f"{len(footnotes)} footnotes have been processed.") self.logger.log("Hrefs processing.") self._process_hrefs() - self.top_level_headers: List[Dict[str, Union[str, bool]]]\ + self.logger.log(f"TOC processing.") + self._process_toc_links() + + top_level_headers: List[Dict[str, Union[str, bool]]]\ = self._get_top_level_headers() - self._mark_introduction_headers() + self._mark_introduction_headers(top_level_headers) self._process_headings() + self.logger.log(f".html using presets processing.") + _process_presets(html_preprocessor=self.html_preprocessor, + html_soup=self.html_soup) + + self.content = self.body_tag.find_all(recursive=False) # delete text before table of content if exists self.delete_content_before_toc() self.logger.log("End of processing .html file.") - - return self.content, self.footnotes, self.top_level_headers + return self.content, footnotes, top_level_headers