Merge tag processing [Docx, Epub]

2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -13,7 +13,7 @@ from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.style_preprocessor import StylePreprocessor
-from src.epub_converter.html_epub_processor import HtmlEpubProcessor
+from src.epub_converter.html_epub_processor import HTMLEpubProcessor
 from src.epub_converter.image_processing import update_images_src_links
 from src.epub_converter.footnotes_processing import preprocess_footnotes
 from src.tag_inline_style_processor import modify_html_soup_with_css_styles
@@ -21,7 +21,7 @@ from src.tag_inline_style_processor import modify_html_soup_with_css_styles

 class EpubConverter:
    def __init__(self, book_path, access=None, logger: BookLogger = None,
-                 style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
+                 style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None):
        self.book_path = book_path
        self.access = access
        self.logger: BookLogger = logger
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -30,13 +30,16 @@ class EpubBook(BookSolver):
            json for LiveCarta platform

        """
-        style_processor = StylePreprocessor()
-        html_processor = HtmlEpubProcessor(
-            logger=self.logger_object)
+        html_preprocessor = HtmlPreprocessor(
+            logger=self.logger_object, preset_path="presets/epub_presets.json")
+        style_preprocessor = StylePreprocessor()
+        html_processor = HTMLEpubProcessor(logger=self.logger_object,
+                                           html_preprocessor=html_preprocessor)
        json_converter = EpubConverter(
            self.book_path, access=self.access, logger=self.logger_object,
-            style_processor=style_processor, html_processor=html_processor)
+            style_processor=style_preprocessor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
+
        return content_dict


--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,58 +1,16 @@
 import re
-import json
-from typing import List, Dict, Union
+from typing import Union
 from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment

 from src.util.helpers import BookLogger
+from src.html_preprocessor import _preprocess_html


-class HtmlEpubProcessor:
-    def __init__(self, preset_path: str = "presets/epub_presets.json", logger: BookLogger = None):
-        self.preset = json.load(open(preset_path))
+class HTMLEpubProcessor:
+    def __init__(self, logger: BookLogger = None, html_preprocessor=None):
        self.logger = logger
-        self.name2action = {
-            "table_wrapper": self._process_tag_using_table,
-            "replacer": self._replace_tag,
-            "attr_replacer": self._replace_attr,
-            "unwrapper": self._unwrap_tag,
-            "inserter": self._insert_tag
-        }
-
-    @staticmethod
-    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
-                                        chapter_tag: BeautifulSoup):
-        """
-        Function adds span with id from tag_to_be_removed
-        because this tag will be removed(unwrapped/extract)
-        Parameters
-        ----------
-        tag_to_be_removed: Union[PageElement, BeautifulSoup]
-
-        chapter_tag: BeautifulSoup
-
-        Returns
-        -------
-        NoReturn
-            updated body tag
-
-        """
-        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
-                                               tag_to_be_removed: Tag,
-                                               id_: str,
-                                               class_: Union[List[str], str]):
-            """Function inserts span before tag aren't supported by LiveCarta"""
-            new_tag: Tag = chapter_tag.new_tag("span")
-            new_tag.attrs["id"] = id_ or ""
-            new_tag.attrs["class"] = class_ or ""
-            new_tag.string = "\xa0"
-            tag_to_be_removed.insert_before(new_tag)
-
-        if tag_to_be_removed.attrs.get("id"):
-            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
-                                               tag_to_be_removed=tag_to_be_removed,
-                                               id_=tag_to_be_removed.attrs["id"],
-                                               class_=tag_to_be_removed.attrs.get("class"))
+        self.html_preprocessor = html_preprocessor

    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
@@ -116,111 +74,6 @@ class HtmlEpubProcessor:
                    p_tag.append(str(node))
                    node.replace_with(p_tag)

-    def _process_tag_using_table(self, **kwargs):
-        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
-            table = kwargs["chapter_tag"].new_tag("table")
-            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
-                = border, "center", f"width:{width}%;"
-            tbody, tr, td = \
-                kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
-                    "tr"), kwargs["chapter_tag"].new_tag("td")
-            td.attrs["bgcolor"] = bg_color
-            kwargs["tag"].wrap(td)
-            td.wrap(tr)
-            tr.wrap(tbody)
-            tbody.wrap(table)
-            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
-            return table
-        _wrap_tag_with_table(
-            width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
-                "width") else "100",
-            border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
-                "border") else None,
-            bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
-        self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
-        kwargs["tag"].unwrap()
-
-    @staticmethod
-    def _replace_tag(**kwargs):
-        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
-        kwargs["tag"].name = tag_to_replace
-
-    @staticmethod
-    def _replace_attr(**kwargs):
-        attr, attr_value =\
-            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
-        attr_to_replace, attr_value_to_replace =\
-            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
-        if attr_to_replace:
-            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
-            if attr_value_to_replace:
-                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
-            del kwargs["tag"][attr]
-        elif attr_value_to_replace:
-            kwargs["tag"].attrs[attr] = attr_value_to_replace
-
-    @staticmethod
-    def _unwrap_tag(**kwargs):
-        kwargs["tag"].unwrap()
-
-    @staticmethod
-    def _insert_tag(**kwargs):
-        tag_to_insert = \
-            kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
-        # insert all items that was in tag to subtag and remove from tag
-        for content in reversed(kwargs["tag"].contents):
-            tag_to_insert.insert(0, content.extract())
-        # wrap subtag with items
-        kwargs["tag"].append(tag_to_insert)
-
-    @staticmethod
-    def _process_tags(chapter_tag: BeautifulSoup,
-                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
-                      action):
-        """
-        Function do action with tags
-        Parameters
-        ----------
-        chapter_tag: BeautifulSoup
-            Tag & contents of the chapter tag
-        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
-            list of conditions when fire function
-        action: function
-            action what to do with tag
-        Returns
-        -------
-        NoReturn
-            Body Tag with processed certain tags
-
-        """
-        for rule in rules:
-            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
-            if rule["condition"]:
-                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
-                    if condition_on_tag[0] == "parent_tags":
-                        for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
-                                                                 for tag in tags])):
-                            tag.parent.attrs.update(tag.attrs)
-                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "child_tags":
-                        for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
-                                                                 for tag in tags])):
-                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "attrs":
-                        for attr in rule["condition"]["attrs"]:
-                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
-                                action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-                    # attr replacer
-                    elif condition_on_tag[0] == "tags":
-                        attr = rule["attr"]
-                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-                                                        {attr['name']: re.compile(fr"{attr['value']}")}):
-                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-            else:
-                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-                    action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-
    def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
        """
        Function
@@ -250,7 +103,8 @@ class HtmlEpubProcessor:
                    if title_of_chapter == text or \
                            (title_of_chapter in text and
                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
-                        self._add_span_to_save_ids_for_links(tag, chapter_tag)
+                        self.html_preprocessor._add_span_to_save_ids_for_links(
+                            tag, chapter_tag)
                        tag.extract()
                        return
                    elif not self._remove_headings_content(tag, title_of_chapter):
@@ -350,9 +204,8 @@ class HtmlEpubProcessor:
        # 2.
        self._wrap_strings_with_p(chapter_tag)
        # 3-6.
-        for rule in self.preset:
-            action = self.name2action[rule["preset_name"]]
-            self._process_tags(chapter_tag, rule["rules"], action)
+        _preprocess_html(
+            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(chapter_tag, title_str)