From d71ef441787536f9a899fa9bca0f36d036876ee9 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 2 Sep 2022 14:41:59 +0300 Subject: [PATCH] Merge all preset functions in 1 [Epub] --- src/epub_converter/html_epub_processor.py | 269 +++++++--------------- 1 file changed, 82 insertions(+), 187 deletions(-) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index 914b683..2947e9d 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -7,16 +7,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString, Comment from src.util.helpers import BookLogger -class HtmlEpubPreprocessor: - def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None): +class HtmlEpubProcessor: + def __init__(self, preset_path: str = "presets/presets.json", logger: BookLogger = None): self.preset = json.load(open(preset_path)) self.logger = logger - self.name2function = { - "table_wrapper": self._wrap_tags_with_table, - "replacer": self._tags_to_correspond_livecarta_tag, - "attr_replacer": self._replace_attrs_in_tags, - "unwrapper": self._unwrap_tags, - "inserter": self._insert_tags_into_correspond_tags + self.name2action = { + "table_wrapper": self._process_tag_using_table, + "replacer": self._replace_tag, + "attr_replacer": self._replace_attr, + "unwrapper": self._unwrap_tag, + "inserter": self._insert_tag } @staticmethod @@ -116,208 +116,103 @@ class HtmlEpubPreprocessor: p_tag.append(str(node)) node.replace_with(p_tag) - def _wrap_tags_with_table(self, - chapter_tag: BeautifulSoup, - rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]): - """ - Function wraps with - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - rules: List[Dict[str, List[str, Dict[str, str]]]] - list of conditions when fire function - - Returns - ------- - NoReturn - Chapter Tag with wrapped certain tags with
- - """ - + def _process_tag_using_table(self, **kwargs): def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag: - table = chapter_tag.new_tag("table") + table = kwargs["chapter_tag"].new_tag("table") table.attrs["border"], table.attrs["align"], table.attrs["style"] \ = border, "center", f"width:{width}%;" tbody, tr, td = \ - chapter_tag.new_tag("tbody"), chapter_tag.new_tag( - "tr"), chapter_tag.new_tag("td") + kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag( + "tr"), kwargs["chapter_tag"].new_tag("td") td.attrs["bgcolor"] = bg_color - tag_to_wrap.wrap(td) + kwargs["tag"].wrap(td) td.wrap(tr) tr.wrap(tbody) tbody.wrap(table) table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) return table - - def process_tag_using_table(): - _wrap_tag_with_table( - width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get( - "width") else "100", - border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get( - "border") else None, - bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) - self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) - tag_to_wrap.unwrap() - - for rule in rules: - tags = rule["tags"] - for attr in rule["attrs"]: - for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags], - {attr["name"]: re.compile(fr"{attr['value']}")}): - process_tag_using_table() + _wrap_tag_with_table( + width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get( + "width") else "100", + border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get( + "border") else None, + bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None) + self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"]) + kwargs["tag"].unwrap() @staticmethod - def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, - rules: List[Dict[str, - Union[List[str], str, Dict[str, - Union[str, List[Dict[str, str]]]]]]]): + def _replace_tag(**kwargs): + tag_to_replace: str = kwargs["rule"]["tag_to_replace"] + kwargs["tag"].name = tag_to_replace + + @staticmethod + def _replace_attr(**kwargs): + attr = kwargs["rule"]["attr"] + attr_to_replace = kwargs["rule"]["attr_to_replace"] + kwargs["tag"][attr_to_replace] = kwargs["tag"][attr] + del kwargs["tag"][attr] + + @staticmethod + def _unwrap_tag(**kwargs): + kwargs["tag"].unwrap() + + @staticmethod + def _insert_tag(**kwargs): + tag_to_insert = \ + kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"]) + # insert all items that was in tag to subtag and remove from tag + for content in reversed(kwargs["tag"].contents): + tag_to_insert.insert(0, content.extract()) + # wrap subtag with items + kwargs["tag"].append(tag_to_insert) + + def _process_tags(self, + chapter_tag: BeautifulSoup, + rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], + action): """ - Function to replace all tags to correspond LiveCarta tags + Function do action with tags Parameters ---------- chapter_tag: BeautifulSoup Tag & contents of the chapter tag - rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]] + rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] list of conditions when fire function - + action: function + action what to do with tag Returns ------- NoReturn - Chapter Tag with all tags replaced with LiveCarta tags + Body Tag with processed certain tags """ for rule in rules: - tags: List[str] = rule["tags"] - tag_to_replace: str = rule["tag_to_replace"] + tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"] if rule["condition"]: for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): if condition_on_tag[0] == "parent_tags": - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - if tag.parent.select(condition_on_tag[1]): - tag.name = tag_to_replace + for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag) + for tag in tags])): + tag.parent.attrs.update(tag.attrs) + action(chapter_tag=chapter_tag, tag=tag, rule=rule) elif condition_on_tag[0] == "child_tags": - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - if "not" in condition_on_tag[1]: - if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])): - tag.name = tag_to_replace - else: - if tag.select(condition_on_tag[1]): - tag.name = tag_to_replace + for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1] + for tag in tags])): + action(chapter_tag=chapter_tag, tag=tag, rule=rule) elif condition_on_tag[0] == "attrs": for attr in rule["condition"]["attrs"]: for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], - {attr["name"]: re.compile(fr"{attr['value']}")}): - tag.name = tag_to_replace + {attr["name"]: re.compile(fr"{attr['value']}")}): + action(chapter_tag=chapter_tag, tag=tag, rule=rule) + # attr replacer + elif condition_on_tag[0] == "tags": + attr = rule["attr"] + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr: re.compile(r".*")}): + action(chapter_tag=chapter_tag, tag=tag, rule=rule) else: for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - # todo can cause appearance of \n

...

->

\n

...

\n

(section) - tag.name = tag_to_replace - - @staticmethod - def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]): - """ - Function to replace all tags to correspond LiveCarta tags - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - rules: List[Dict[str, Union[str, Dict[str, List[str]]]]] - list of conditions when fire function - - Returns - ------- - NoReturn - Chapter Tag with all tags replaced with LiveCarta tags - - """ - for rule in rules: - attr = rule["attr"] - tags: List[str] = rule["condition"]["tags"] - attr_to_replace = rule["attr_to_replace"] - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], - {attr: re.compile(r".*")}): - tag[attr_to_replace] = tag[attr] - del tag[attr] - - def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: List[Dict[str, List[str]]]): - """ - Function unwrap tags and moves id to span - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - rules: List[Dict[str, List[str]]] - list of conditions when fire function - - Returns - ------- - NoReturn - Chapter Tag with unwrapped certain tags - - """ - for rule in rules: - for tag_name in rule["tags"]: - for tag in chapter_tag.select(tag_name): - # if tag is a subtag - if ">" in tag_name: - tag.parent.attrs.update(tag.attrs) - self._add_span_to_save_ids_for_links(tag, chapter_tag) - tag.unwrap() - - @staticmethod - def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, - rules: List[Dict[str, - Union[List[str], str, Dict[str, - Union[str, List[Dict[str, str]]]]]]]): - """ - Function inserts tags into correspond tags - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]] - list of conditions when fire function - - Returns - ------- - NoReturn - Chapter Tag with inserted tags - - """ - def insert(tag: Tag): - tag_to_insert = \ - chapter_tag.new_tag(rule["tag_to_insert"]) - # insert all items that was in tag to subtag and remove from tag - for content in reversed(tag.contents): - tag_to_insert.insert(0, content.extract()) - # wrap subtag with items - tag.append(tag_to_insert) - - for rule in rules: - tags: List[str] = rule["tags"] - if rule["condition"]: - for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): - if condition_on_tag[0] == "parent_tags": - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - if tag.parent.select(condition_on_tag[1]): - insert(tag) - elif condition_on_tag[0] == "child_tags": - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - if "not" in condition_on_tag[1]: - if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])): - tag.unwrap() - else: - if tag.select(condition_on_tag[1]): - tag.unwrap() - elif condition_on_tag[0] == "attrs": - for attr in rule["condition"]["attrs"]: - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], - {attr["name"]: re.compile(fr"{attr['value']}")}): - insert(tag) - else: - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - insert(tag) + action(chapter_tag=chapter_tag, tag=tag, rule=rule) def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str): """ @@ -414,14 +309,14 @@ class HtmlEpubPreprocessor: and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): del tag.attrs["class"] - def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: + def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: """ Function finalise processing/cleaning content Parameters ---------- title_str: str - content_tag: Tag, soup object + chapter_tag: Tag, soup object remove_title_from_chapter: bool @@ -444,18 +339,18 @@ class HtmlEpubPreprocessor: """ # 1. remove comments - self._remove_comments(content_tag) + self._remove_comments(chapter_tag) # 2. - self._wrap_strings_with_p(content_tag) + self._wrap_strings_with_p(chapter_tag) # 3-6. for rule in self.preset: - func = self.name2function[rule["preset_name"]] - func(content_tag, rule["rules"]) + action = self.name2action[rule["preset_name"]] + self._process_tags(chapter_tag, rule["rules"], action) # 7. if remove_title_from_chapter: - self._remove_headings_content(content_tag, title_str) + self._remove_headings_content(chapter_tag, title_str) # 8. - self._process_tables(content_tag) + self._process_tables(chapter_tag) # 9. remove classes that weren't created by converter - self._class_removing(content_tag) - return content_tag + self._class_removing(chapter_tag) + return chapter_tag