import re
import json
from typing import List, Dict, Iterator, Optional, Union

from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment

from src.util.helpers import BookLogger


class HtmlEpubPreprocessor:
    """Clean chapter HTML taken from an EPUB by applying preset-driven rules."""

    def __init__(self, preset_path: str = "../../presets/presets.json",
                 logger: Optional[BookLogger] = None):
        """
        Load the preset file and register the rule handlers.

        Parameters
        ----------
        preset_path: str
            path to the JSON file with presets
        logger: Optional[BookLogger]
            logger stored on the instance

        """
        # Context manager closes the file handle (the bare open() leaked it).
        with open(preset_path, encoding="utf-8") as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        # Maps the "preset_name" of a preset entry to the method handling it.
        self.name2function = {
            "table_wrapper": self._wrap_tags_with_table,
            "replacer": self._tags_to_correspond_livecarta_tag,
            "attr_replacer": self._replace_attrs_in_tags,
            "unwrapper": self._unwrap_tags,
            "inserter": self._insert_tags_into_correspond_tags
        }

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag that is about to be unwrapped/extracted
        chapter_tag: BeautifulSoup
            soup used to create the new span

        Returns
        -------
        NoReturn
            updated body tag

        """
        tag_id = tag_to_be_removed.attrs.get("id")
        if not tag_id:
            # nothing can link to a tag without an id
            return
        span: Tag = chapter_tag.new_tag("span")
        span.attrs["id"] = tag_id
        span.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        # the span carries a single non-breaking space as its content
        span.string = "\xa0"
        tag_to_be_removed.insert_before(span)

    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
        """
        Function finalise processing/cleaning title
        Parameters
        ----------
        title_of_chapter: str
            raw title, may contain markup

        Returns
        -------
        title: str
            cleaned title

        """
        soup = BeautifulSoup(title_of_chapter, features="lxml")
        title = soup.string
        if title is None:
            # .string is None when the markup is empty or has several children;
            # fall back to the concatenated text instead of failing in re.sub
            title = soup.get_text()
        # replace every whitespace character ([\r\n\t\f\v ] and nbsp) with a space
        title = re.sub(r"[\s\xa0]", " ", title).strip()
        return title

    @staticmethod
    def _remove_comments(chapter_tag: BeautifulSoup):
        """
        Function remove comments
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag without comments

        """
        # Search from the root so that comments which are direct children of
        # chapter_tag are removed as well (a per-descendant search skips them).
        for comment in chapter_tag.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

    @staticmethod
    def _wrap_strings_with_p(chapter_tag: BeautifulSoup):
        """
        Function wraps top-level NavigableStrings of the chapter with <p>
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag with wrapped NavigableStrings

        """
        # iterate over a snapshot: nodes are replaced while walking
        for node in list(chapter_tag.contents):
            if not isinstance(node, NavigableString):
                continue
            content = re.sub(r"([\s\xa0])", " ", str(node)).strip()
            if content:
                p_tag = chapter_tag.new_tag("p")
                # the original (unstripped) text is kept inside the paragraph
                p_tag.append(str(node))
                node.replace_with(p_tag)

    def _wrap_tags_with_table(self,
                              chapter_tag: BeautifulSoup,
                              rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
        """
        Function wraps matching tags with <table> and then unwraps the tag itself
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, List[str, Dict[str, str]]]]
            list of conditions when fire function

        Returns
        -------
        NoReturn
            Chapter Tag with wrapped certain tags with <table>

        """

        def _wrap_tag_with_table(tag_to_wrap: Tag, width: str = "100",
                                 border: str = "", bg_color: str = None) -> Tag:
            """Wrap tag_to_wrap into table > tbody > tr > td and add <br> after the table"""
            table = chapter_tag.new_tag("table")
            table.attrs["border"] = border
            table.attrs["align"] = "center"
            table.attrs["style"] = f"width:{width}%;"
            tbody = chapter_tag.new_tag("tbody")
            tr = chapter_tag.new_tag("tr")
            td = chapter_tag.new_tag("td")
            td.attrs["bgcolor"] = bg_color
            tag_to_wrap.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table

        def process_tag_using_table(tag_to_wrap: Tag):
            """Build the table from the tag's own attrs, keep its id, drop the tag"""
            # NOTE(review): a missing border/bgcolor is passed on as None and
            # stored as an attribute value - confirm this is intended.
            _wrap_tag_with_table(
                tag_to_wrap,
                width=tag_to_wrap.attrs.get("width") or "100",
                border=tag_to_wrap.attrs.get("border") or None,
                bg_color=tag_to_wrap.attrs.get("bgcolor") or None)
            self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
            tag_to_wrap.unwrap()

        for rule in rules:
            name_patterns = [re.compile(tag) for tag in rule["tags"]]
            for attr in rule["attrs"]:
                attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                for found_tag in chapter_tag.find_all(name_patterns, attr_filter):
                    process_tag_using_table(found_tag)

    @staticmethod
    def _iter_tags_matching_rule(chapter_tag: BeautifulSoup,
                                 tags: List[str],
                                 condition: Optional[Dict[str, Union[str, List[Dict[str, str]]]]]
                                 ) -> Iterator[Tag]:
        """
        Function yields tags that match names from a rule and its condition
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        tags: List[str]
            regular expressions for tag names
        condition: Optional[Dict[str, Union[str, List[Dict[str, str]]]]]
            "parent_tags"/"child_tags" (CSS selectors) and/or "attrs"
            (list of {"name", "value"}); empty/None means "no condition"

        Returns
        -------
        Iterator[Tag]
            matching tags; every non-empty condition entry is evaluated
            independently, in dict order, so a tag may be yielded more than once

        """
        name_patterns = [re.compile(tag) for tag in tags]
        if not condition:
            yield from chapter_tag.find_all(name_patterns)
            return
        for condition_name, condition_value in condition.items():
            if not condition_value:
                continue
            if condition_name == 'parent_tags':
                for tag in chapter_tag.find_all(name_patterns):
                    if tag.parent.select(condition_value):
                        yield tag
            elif condition_name == 'child_tags':
                # "(", ")", ":" and "not" are removed from the selector and the
                # tag matches when the remaining selector finds nothing
                child_selector = re.sub('[():]|not', '', condition_value)
                for tag in chapter_tag.find_all(name_patterns):
                    if not tag.select(child_selector):
                        yield tag
            elif condition_name == "attrs":
                for attr in condition_value:
                    attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                    yield from chapter_tag.find_all(name_patterns, attr_filter)

    @staticmethod
    def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
                                          rules: List[Dict[str,
                                                           Union[List[str], str, Dict[str,
                                                                                      Union[str, List[Dict[str, str]]]]]]]):
        """
        Function to replace all tags to correspond LiveCarta tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
            list of conditions when fire function

        Returns
        -------
        NoReturn
            Chapter Tag with all tags replaced with LiveCarta tags

        """
        for rule in rules:
            tags: List[str] = rule["tags"]
            tag_to_replace: str = rule["tag_to_replace"]
            matching_tags = HtmlEpubPreprocessor._iter_tags_matching_rule(
                chapter_tag, tags, rule.get("condition"))
            for tag in matching_tags:
                # todo (carried over, original note was garbled): renaming
                #  without a condition can cause appearance of "\n" around the
                #  renamed tag, e.g. <p>...</p> -> <section>...</section>
                tag.name = tag_to_replace

    @staticmethod
    def _replace_attrs_in_tags(chapter_tag: BeautifulSoup,
                               rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
        """
        Function to rename attributes of certain tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
            list of conditions when fire function

        Returns
        -------
        NoReturn
            Chapter Tag with attributes renamed

        """
        for rule in rules:
            attr = rule["attr"]
            tags: List[str] = rule["condition"]["tags"]
            attr_to_replace = rule["attr_to_replace"]
            name_patterns = [re.compile(tag) for tag in tags]
            # r".*" selects every tag that has the attribute at all
            for tag in chapter_tag.find_all(name_patterns, {attr: re.compile(r".*")}):
                tag[attr_to_replace] = tag[attr]
                del tag[attr]

    def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
        """
        Function unwrap tags and moves id to span
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: Dict[str, List[str]]
            dict of tags to unwrap

        Returns
        -------
        NoReturn
            Chapter Tag with unwrapped certain tags

        """
        for selector in rules["tags"]:
            is_subtag_selector = ">" in selector
            for tag in chapter_tag.select(selector):
                if is_subtag_selector:
                    # the parent takes over attributes of the removed subtag
                    tag.parent.attrs.update(tag.attrs)
                self._add_span_to_save_ids_for_links(tag, chapter_tag)
                tag.unwrap()

    @staticmethod
    def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
                                          rules: List[Dict[str,
                                                           Union[List[str], str, Dict[str,
                                                                                      Union[str, List[Dict[str, str]]]]]]]):
        """
        Function inserts tags into correspond tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
            list of conditions when fire function

        Returns
        -------
        NoReturn
            Chapter Tag with inserted tags

        """

        def insert(tag: Tag, tag_name_to_insert: str):
            """Move everything from tag into a new subtag and put the subtag into tag"""
            tag_to_insert = chapter_tag.new_tag(tag_name_to_insert)
            # insert all items that was in tag to subtag and remove from tag
            for content in reversed(tag.contents):
                tag_to_insert.insert(0, content.extract())
            # wrap subtag with items
            tag.append(tag_to_insert)

        for rule in rules:
            tags: List[str] = rule["tags"]
            matching_tags = HtmlEpubPreprocessor._iter_tags_matching_rule(
                chapter_tag, tags, rule.get("condition"))
            for tag in matching_tags:
                insert(tag, rule["tag_to_insert"])

    def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
        """
        Function
        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
        - adds span with id in order to keep the id of the removed heading
        Parameters
        ----------
        chapter_tag: Union[BeautifulSoup, PageElement]
            Tag of the page
        title_of_chapter: str
            Chapter title

        Returns
        -------
        NoReturn
            clean/remove headings & add span with id

        """
        title_of_chapter = title_of_chapter.lower()
        for tag in chapter_tag.contents:
            tag: PageElement
            text: str = tag if isinstance(tag, NavigableString) else tag.text
            # whitespace-only nodes are skipped
            if re.sub(r"[\s\xa0]", "", text):
                text = re.sub(r"[\s\xa0]", " ", text).lower()
                text = text.strip()  # delete extra spaces
                if not isinstance(tag, NavigableString):
                    if title_of_chapter == text or \
                            (title_of_chapter in text and
                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
                        self._add_span_to_save_ids_for_links(tag, chapter_tag)
                        tag.extract()
                        return
                    # The recursive call never returns a value, so this
                    # condition is always true: the scan stops after looking
                    # into the first non-blank element.
                    elif not self._remove_headings_content(tag, title_of_chapter):
                        break
                else:
                    # NOTE(review): the indentation of this branch was lost in
                    # the source; it is reconstructed as "first non-blank node
                    # is a plain string -> remove it" - confirm.
                    tag.extract()
                    return

    @staticmethod
    def _process_tables(chapter_tag: BeautifulSoup):
        """
        Function preprocesses tables and tags(td|th|tr)
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag with processed tables

        """
        for table in chapter_tag.find_all("table"):
            # Exact names: a compiled regex is applied with search() and
            # "td|th|tr" would also hit e.g. <thead> or <strong>.
            for t_tag in table.find_all(["td", "th", "tr"]):
                width = ""
                if t_tag.get("style"):
                    # The lookbehind skips "min-width"/"border-width" etc. but,
                    # unlike "[^-]width", also matches at the start of the style.
                    width_match = re.search(
                        r"(?<![\w-])width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
                    if width_match:
                        # the number is kept as is, the unit is always written as px
                        width = width_match.group(1) + "px"
                # an explicit width attribute wins over the one from style
                t_tag.attrs["width"] = t_tag.get("width") or width
                if t_tag.attrs.get("style"):
                    t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                        "border:0;", "")
                    if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
                        del t_tag.attrs["style"]
            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
                table.attrs["border"] = "1"

    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):
        """
        Function removes classes that aren't created by converter
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag without original classes of the book

        """
        converter_classes = ("link-anchor", "footnote-element")
        for tag in chapter_tag.find_all(recursive=True):
            class_value = tag.attrs.get("class")
            if not class_value:
                continue
            # Parsed markup stores "class" as a list, assigned values may be a
            # plain string; normalise so that both forms are compared by name.
            if isinstance(class_value, str):
                class_names = class_value.split()
            else:
                class_names = list(class_value)
            keep = bool(class_names) and all(
                name in converter_classes for name in class_names)
            if not keep:
                del tag.attrs["class"]

    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
        """
        Function finalise processing/cleaning content
        Parameters
        ----------
        title_str: str
            chapter title
        content_tag: Tag, soup object
            chapter content, modified in place
        remove_title_from_chapter: bool
            whether the heading duplicating the title is removed

        Steps
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
        3-6. preset rules, in the order they appear in the preset file:
            wrap tags with <table>
            replace tags with correspond LiveCarta tags
            replace attributes
            unwrap tags
            insert tags into correspond tags
        7. heading removal
        8. process_tables
        9. class removal

        Returns
        -------
        content_tag: Tag
            prepared content

        """
        # 1. remove comments
        self._remove_comments(content_tag)
        # 2. wrap top-level strings with <p>
        self._wrap_strings_with_p(content_tag)
        # 3-6. apply every preset with the handler registered for its name
        for rule in self.preset:
            func = self.name2function[rule["preset_name"]]
            func(content_tag, rule['rules'])
        # 7. remove the heading that duplicates the chapter title
        if remove_title_from_chapter:
            self._remove_headings_content(content_tag, title_str)
        # 8. normalise tables
        self._process_tables(content_tag)
        # 9. remove classes that weren't created by converter
        self._class_removing(content_tag)
        return content_tag