import re
import json
import pathlib
from typing import List, Tuple, Dict, Union

from bs4 import BeautifulSoup, Tag, NavigableString

from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.docx_converter.image_processing import process_images
from src.docx_converter.footnotes_processing import process_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles


class HTMLDocxProcessor:
    """Post-process HTML produced by a docx->html conversion so that it
    satisfies LiveCarta formatting conventions.

    The processor mutates ``html_soup`` in place, driven by a JSON preset of
    tag-transformation rules plus a sequence of hard-coded clean-up passes
    (TOC links, paragraphs, block quotes, tables, hrefs, headings, ...).
    """

    def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
                 style_processor,
                 preset_path: str = "presets/docx_presets.json"):
        """
        Parameters
        ----------
        html_soup: BeautifulSoup
            Parsed HTML document to process (mutated in place).
        logger: BookLogger
            Progress/diagnostics logger.
        style_processor
            Object exposing ``process_inline_styles_in_html_soup(soup)``.
        preset_path: str
            Path to the JSON file with tag-processing rules.
        """
        self.body_tag = html_soup.body
        self.html_soup = html_soup
        self.logger = logger
        # Fix: close the preset file instead of leaking the handle
        # (was `json.load(open(preset_path))`).
        with open(preset_path) as preset_file:
            self.preset = json.load(preset_file)
        self.style_processor = style_processor
        # Dispatch table: preset name -> action applied to each matched tag.
        self.name2action = {
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag
        }

    def _process_toc_links(self):
        """Function to extract nodes which contain TOC links, remove links
        from file and detect headers."""

        def _check_parent_link_exist_in_toc(tag_with_link: Tag) -> bool:
            # True if any anchor inside `tag_with_link` is referenced by a
            # matching `href="#_TocNNN"` link elsewhere in the body.
            toc_links = []
            for a_tag in tag_with_link.find_all(
                    "a", {"name": re.compile(r"^_Toc\d+")}):
                link_name = a_tag.attrs["name"]
                toc_item = self.body_tag.find("a", {"href": "#" + link_name})
                if toc_item:
                    toc_links.append(toc_item)
            return len(toc_links) > 0

        toc_links = self.body_tag.find_all(
            "a", {"name": re.compile(r"^_Toc\d+")})
        headers = [link.parent for link in toc_links]
        # All the unknown outlines will be predicted as level-1 headings.
        outline_level = "1"
        for tag in headers:
            if re.search(r"^h\d$", tag.name):
                # Already a real heading: just drop the anchor wrapper.
                tag.a.unwrap()
            elif tag.name == "p":
                exist_in_toc = _check_parent_link_exist_in_toc(tag)
                if tag in self.body_tag.find_all("p") and exist_in_toc:
                    # Promote the paragraph to a heading, keeping its text.
                    new_tag = BeautifulSoup(
                        features="lxml").new_tag("h" + outline_level)
                    text = tag.text
                    tag.replaceWith(new_tag)
                    new_tag.string = text
            else:
                # rethink document structure when you have toc_links, other cases?
                self.logger.log(f"Something went wrong in processing toc_links."
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")

    @staticmethod
    def _decompose_tag(**kwargs):
        """Remove the tag and all its contents."""
        kwargs["tag"].decompose()

    @staticmethod
    def _replace_tag(**kwargs):
        """Rename the tag to the rule's ``tag_to_replace`` value."""
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        """Rename an attribute and/or replace its value per the rule.

        ``attr`` identifies the existing attribute; ``attr_to_replace`` gives
        the new attribute name (optional) and/or the new value (optional).
        """
        attr, attr_value = \
            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
        attr_to_replace, attr_value_to_replace = \
            kwargs["rule"]["attr_to_replace"]["name"], \
            kwargs["rule"]["attr_to_replace"]["value"]
        if attr_to_replace:
            # Move the old attribute's value under the new name ...
            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
            if attr_value_to_replace:
                # ... optionally overriding the value as well.
                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
            del kwargs["tag"][attr]
        elif attr_value_to_replace:
            # Keep the attribute name, change only its value.
            kwargs["tag"].attrs[attr] = attr_value_to_replace

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Remove the tag but keep its children in place."""
        kwargs["tag"].unwrap()

    @staticmethod
    def _process_tags(body_tag: Tag,
                      rules: List[Dict[str, Union[List[str], str,
                                                  Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function do action with tags

        Parameters
        ----------
        body_tag: Tag
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag

        Returns
        -------
        NoReturn
            Body Tag with processed certain tags
        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get("tags") \
                else rule["condition"]["tags"]
            if rule["condition"]:
                # Only conditions with truthy values participate.
                for condition_on_tag in ((k, v) for k, v in
                                         rule["condition"].items() if v):
                    if condition_on_tag[0] == "parent_tags":
                        # CSS selector "parent > tag"; regex anchors (^, $)
                        # are stripped because select() takes plain names.
                        for tag in body_tag.select(', '.join(
                                [condition_on_tag[1] + " > "
                                 + re.sub(r"[\^$]", "", tag) for tag in tags])):
                            # Hoist the child's attrs onto the parent first.
                            tag.parent.attrs.update(tag.attrs)
                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "child_tags":
                        for tag in body_tag.select(', '.join(
                                [re.sub(r"[\^$]", "", tag)
                                 + condition_on_tag[1] for tag in tags])):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in body_tag.find_all(
                                    [re.compile(tag) for tag in tags],
                                    {attr["name"]: re.compile(fr"{attr['value']}")}):
                                action(body_tag=body_tag, tag=tag, rule=rule)
                    # attr replacer
                    elif condition_on_tag[0] == "tags":
                        attr = rule["attr"]
                        for tag in body_tag.find_all(
                                [re.compile(tag) for tag in tags],
                                {attr['name']: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
            else:
                # No condition: apply the action to every matching tag.
                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
                    action(body_tag=body_tag, tag=tag, rule=rule)

    def _process_paragraph(self):
        """Function to process <p> tags (text-align and text-indent value)."""
        # todo debug and remove if inline is enough
        paragraphs = self.body_tag.find_all("p")
        for p in paragraphs:
            # libre converts some \n into a <p> holding 2 <br> tags;
            # there we remove 1 unnecessary <br>
            brs = p.find_all("br")
            text = p.text
            if brs and text == "\n\n" and len(brs) == 2:
                brs[0].decompose()
            indent_should_be_added = False
            # A leading tab (possibly after a newline) marks an indented paragraph.
            if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")):
                indent_should_be_added = True
            align = p.get("align")
            style = p.get("style")
            if style:
                indent = re.search(r"text-indent: ([\d.]{1,4})in", style)
                margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style)
                margin_right = re.search(
                    r"margin-right: ([\d.]{1,4})in", style)
                margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style)
                margin_bottom = re.search(
                    r"margin-bottom: ([\d.]{1,4})in", style)
            else:
                indent = margin_left = margin_right = \
                    margin_top = margin_bottom = None
            # This exact margin combination is how the converter renders a
            # block quote; wrap such paragraphs in <blockquote>.
            if margin_left and margin_right and margin_top and margin_bottom and \
                    margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \
                    margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11":
                p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
            # Rebuild the paragraph's style from scratch.
            p.attrs = {}
            style = ""
            if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
                style += f"text-align: {align};"
            if indent is not None or indent_should_be_added:
                # indent = indent.group(1)
                style += f"text-indent: {LiveCartaConfig.INDENT};"
            if style:
                p.attrs["style"] = style

    def _process_quotes(self):
        """
        Function to process block quotes.

        After docx to html conversion block quotes are stored inside a table
        with 1 cell. All text is wrapped in an <i> tag. Such tables will be
        replaced with <blockquote> tags, e.g.:
            <blockquote><p><i>aaaaa</i></p></blockquote>
        """
        tables = self.body_tag.find_all("table")
        for table in tables:
            trs = table.find_all("tr")
            tds = table.find_all("td")
            # A quote-table is a single 600px-wide cell.
            if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
                td = tds[0]
                # Fix: td may have no style attribute at all; the original
                # `in td.get("style")` raised TypeError on None.
                is_zero_border = "border: none;" in (td.get("style") or "")
                paragraphs = td.find_all("p")
                has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
                has_i_tag_or_br = [x[0] is not None or x[1] is not None
                                   for x in has_i_tag_or_br]
                if all(has_i_tag_or_br) and is_zero_border:
                    new_div = BeautifulSoup(
                        features="lxml").new_tag("blockquote")
                    for p in paragraphs:
                        new_div.append(p)
                    table.replaceWith(new_div)

    @staticmethod
    def convert_pt_to_px(value: float) -> float:
        """Map Word's default font size (pt) to LiveCarta's default (px);
        any other value is returned unchanged (no unit conversion is done)."""
        value = float(value)
        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
            return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
        else:
            return value

    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
        for table in tables:
            tds = table.find_all("td")
            sizes = []
            for td in tds:
                style = td.get("style")
                if style:
                    match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
                    if match:
                        size = match.group(1)
                        units = match.group(2)
                        if units == "pt":
                            size = self.convert_pt_to_px(size)
                        sizes.append(float(size))
                width = td.get("width")
                # Drop all cell attributes except width.
                td.attrs = {}
                if width:
                    td.attrs["width"] = width
            if sizes:
                border_size = sum(sizes) / len(sizes)
                # Fix: `:.2` means 2 *significant digits* and renders values
                # >= 10 as scientific notation ('1e+01'); `:.2f` was intended.
                table.attrs["border"] = f"{border_size:.2f}"
        self.tables_amount = len(tables)

    def _process_hrefs(self):
        """Strip zero-width characters from link text and hrefs."""
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^.*http.+")})
        # remove char=end of file for some editors
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
        # Second pass: every non-footnote link.
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^(?!#sdfootnote)")})
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag.string = tag.text.replace("\u200b", "")  # zero-width-space
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")

    def _process_div(self):
        # todo unwrapper
        """Function to process <div> tags. All the <div> tags will be deleted
        from file, all content of the tags will stay."""
        divs = self.body_tag.find_all("div")
        for div in divs:
            div.unwrap()

    def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]:
        """
        Function for gathering info about top-level chapters.

        Assume:
            - Headers with the smallest outline (the digit in <hN>) are top
              level chapters. [It is consistent with a recursive algorithm for
              saving content to a resulted json structure, which happens in
              header_to_json()]
        """
        headers_info = []
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
        headers_outline = [int(re.sub(r"^h", "", tag.name))
                           for tag in header_tags]
        if headers_outline:
            top_level_outline = min(headers_outline)
            top_level_headers = [tag for tag in header_tags
                                 if int(re.sub(r"^h", "", tag.name)) == top_level_outline]
            for tag in top_level_headers:
                # Pull headings out of list wrappers produced by conversion.
                if tag.parent.name == "li":
                    tag.parent.unwrap()
                while tag.parent.name == "ol":
                    tag.parent.unwrap()
                title = tag.text
                title = re.sub(r"\s+", " ", title).strip()
                # Leading "1.", "1.2." etc. marks a numbered chapter.
                number = re.match(r"^(?:\.?\d+\.? ?)+", title)
                is_numbered = number is not None
                cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
                is_introduction = cleaned_title.lower() == "introduction"
                headers_info.append({
                    "title": cleaned_title,
                    "is_numbered": is_numbered,
                    "is_introduction": is_introduction})
        return headers_info

    def _mark_introduction_headers(self):
        """
        Function to find out: what header shouldn't be numbered and can be
        treated as introduction chapter

        Assume header(s) to be introduction if:
            1. one header not numbered, before 1 numbered header
            2. it is first header from the top level list, and it equals to
               "introduction"

        Returns
        -------
        None
            mark each top-level header with flag should_be_numbered = true/false
        """
        is_numbered_header = [header["is_numbered"]
                              for header in self.top_level_headers]
        is_title = [header["is_introduction"]
                    for header in self.top_level_headers]
        first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
        # all() over a 0/1-element slice: True if second header is numbered
        # or there is no second header.
        second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
        first_header_is_introduction = is_title and is_title[0]
        if (first_not_numbered and second_is_numbered_or_not_exist) \
                or first_header_is_introduction:
            self.top_level_headers[0]["should_be_numbered"] = False
            for i in range(1, len(self.top_level_headers)):
                self.top_level_headers[i]["should_be_numbered"] = True
        else:
            for i in range(0, len(self.top_level_headers)):
                self.top_level_headers[i]["should_be_numbered"] = True

    @staticmethod
    def clean_title_from_tabs(tag: NavigableString):
        """Replace every whitespace/nbsp character with a plain space,
        swapping the NavigableString in place."""
        cleaned = re.sub(r"[\s\xa0]", " ", tag)
        this = BeautifulSoup.new_string(BeautifulSoup(
            features="lxml"), cleaned, NavigableString)
        tag.replace_with(this)

    def apply_func_to_last_child(self, tag: Union[NavigableString, Tag],
                                 func=None):
        """
        works only with constructions like (((child to work with)))
        where child is object of NavigableString
        """
        if type(tag) is NavigableString:
            func(tag)
        else:
            children = list(tag.children)
            if children:
                # Descend through the first child until a NavigableString.
                self.apply_func_to_last_child(children[0], func)

    def _process_headings(self):
        """
        Function to process heading tags <h1>-<h9>.
        Clean header from attrs and text in header from numbering and \\n

        Returns
        -------
        None
            processed tags
        """
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
        # clean header from attrs and text in header from numbering and \n
        for h_tag in header_tags:
            h_tag.attrs = {}
            if h_tag.parent.name == "li":
                h_tag.parent.unwrap()
            while h_tag.parent.name == "ol":
                h_tag.parent.unwrap()
            cleaned_title = re.sub(r"[\s\xa0]", " ", h_tag.text)
            if cleaned_title == "":
                # Empty heading: drop the wrapper, keep nothing to clean.
                h_tag.unwrap()
            else:
                assert h_tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
                    f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
                content = list(h_tag.children)
                # do not take into account rubbish empty tags, but don"t remove them
                content = [item for item in content
                           if (type(item) is not NavigableString and item.text != "")
                           or (type(item) is NavigableString)]
                content[0] = "" if content[0] == " " else content[0]
                content = [item for item in content if item != ""]
                for i, item in enumerate(content):
                    if type(content[i]) is NavigableString:
                        # Collapse runs of whitespace to one space.
                        cleaned = re.sub(r"(\s+)+", " ", content[i])
                        this = BeautifulSoup.new_string(BeautifulSoup(
                            features="lxml"), cleaned, NavigableString)
                        content[i].replace_with(this)
                        content[i] = this
                    else:
                        self.apply_func_to_last_child(
                            content[i], self.clean_title_from_tabs)

    def delete_content_before_toc(self):
        # remove all tags above the only <TOC> tag in content
        # !!! body tag is not updated
        toc_tag = self.html_soup.new_tag("TOC")
        self.content: List[Tag] = self.body_tag.find_all(recursive=False)
        if toc_tag in self.content:
            ind = self.content.index(toc_tag) + 1
            self.content = self.content[ind:]

    def process_html(self, access=None, html_path: pathlib.Path = "",
                     book_id: int = 0) \
            -> Tuple[List[Tag], List[str], List[Dict[str, Union[str, bool]]]]:
        """Process html code to satisfy LiveCarta formatting."""
        self.logger.log("Beginning of processing .html file.")
        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()
        # Apply every preset rule via its dispatched action.
        for rule in self.preset:
            self.logger.log(rule["preset_name"] + " process.")
            action = self.name2action[rule["preset_name"]]
            self._process_tags(self.body_tag, rule["rules"], action)
        self.logger.log("CSS inline style preprocessing.")
        self.style_processor.process_inline_styles_in_html_soup(self.html_soup)
        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.html_soup)
        # process main elements of the .html doc
        self.logger.log(f"Processing main elements of html.")
        self._process_paragraph()
        self.logger.log("Block quotes processing.")
        self._process_quotes()
        self.logger.log("Tables processing.")
        self._process_tables()
        self.logger.log(
            f"{self.tables_amount} tables have been processed.")
        self.logger.log("Hrefs processing.")
        self._process_hrefs()
        self.logger.log("Image processing.")
        self.images = process_images(access, path_to_html=html_path,
                                     book_id=book_id, body_tag=self.body_tag)
        self.logger.log(
            f"{len(self.images)} images have been processed.")
        self.logger.log("Footnotes processing.")
        self.footnotes: List[str] = process_footnotes(self.body_tag)
        self.logger.log(
            f"{len(self.footnotes)} footnotes have been processed.")
        self._process_div()
        self.top_level_headers: List[Dict[str, Union[str, bool]]] \
            = self._get_top_level_headers()
        self._mark_introduction_headers()
        self._process_headings()
        # delete text before table of content if exists
        self.delete_content_before_toc()
        self.logger.log("End of processing .html file.")
        return self.content, self.footnotes, self.top_level_headers