Merge processing tags [Docx, Epub]

2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -5,6 +5,7 @@ from threading import Event

 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
+from src.html_preprocessor import HtmlPreprocessor
 from src.style_preprocessor import StylePreprocessor
 from src.docx_converter.docx2libre_html import Docx2LibreHTML
 from src.docx_converter.html_docx_processor import HTMLDocxProcessor
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
-            style_processor = StylePreprocessor()
-            parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
-                                       logger=self.logger_object, style_processor=style_processor)
-            bs_tags, footnotes, top_level_headers = parser.process_html(
+            html_preprocessor = HtmlPreprocessor(
+                logger=self.logger_object, preset_path="presets/docx_presets.json")
+            style_preprocessor = StylePreprocessor()
+            html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
+                                               logger=self.logger_object,
+                                               html_preprocessor=html_preprocessor,
+                                               style_preprocessor=style_preprocessor)
+            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception as exc:
            self.logger_object.log(
@@ -84,10 +89,12 @@ if __name__ == "__main__":
    html_converter = Docx2LibreHTML(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)

-    css_processor = StylePreprocessor()
-    parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
-                               style_processor=css_processor, preset_path="../../presets/docx_presets.json")
-    content, footnotes, top_level_headers = parser.process_html(
+    html_preprocessor = HtmlPreprocessor(
+        logger=logger_object, preset_path="../../presets/docx_presets.json")
+    style_preprocessor = StylePreprocessor()
+    html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
+                                       html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
+    content, footnotes, top_level_headers = html_processor.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)

    json_converter = LibreHTML2JSONConverter(
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -1,32 +1,23 @@
 import re
-import json
 import pathlib
 from typing import List, Tuple, Dict, Union
 from bs4 import BeautifulSoup, Tag, NavigableString

 from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
+from src.html_preprocessor import _preprocess_html
 from src.docx_converter.image_processing import process_images
 from src.docx_converter.footnotes_processing import process_footnotes
 from src.tag_inline_style_processor import modify_html_soup_with_css_styles


 class HTMLDocxProcessor:
-
-    def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
-                 style_processor, preset_path: str = "presets/docx_presets.json"):
-        self.html_soup = html_soup
-        self.body_tag = html_soup.body
+    def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
        self.logger = logger
-        self.preset = json.load(open(preset_path))
-        self.style_processor = style_processor
-        self.name2action = {
-            "wrapper": self._wrap_tag,
-            "decomposer": self._decompose_tag,
-            "replacer": self._replace_tag,
-            "attr_replacer": self._replace_attr,
-            "unwrapper": self._unwrap_tag
-        }
+        self.html_soup = html_soup
+        self.body_tag = self.html_soup.body
+        self.html_preprocessor = html_preprocessor
+        self.style_preprocessor = style_preprocessor

    def _process_toc_links(self):
        """Function to extract nodes which contain TOC links, remove the links from the file and detect headers."""
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")

-    def _wrap_tag(self, **kwargs):
-        kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"]))
-
-    @staticmethod
-    def _decompose_tag(**kwargs):
-        kwargs["tag"].decompose()
-
-    @staticmethod
-    def _replace_tag(**kwargs):
-        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
-        kwargs["tag"].name = tag_to_replace
-
-    @staticmethod
-    def _replace_attr(**kwargs):
-        attr, attr_value =\
-            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
-        attr_to_replace, attr_value_to_replace =\
-            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
-        if attr_to_replace:
-            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
-            if attr_value_to_replace:
-                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
-            del kwargs["tag"][attr]
-        elif attr_value_to_replace:
-            kwargs["tag"].attrs[attr] = attr_value_to_replace
-
-    @staticmethod
-    def _unwrap_tag(**kwargs):
-        kwargs["tag"].unwrap()
-
-    @staticmethod
-    def _process_tags(body_tag: Tag,
-                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
-                      action):
-        """
-        Function do action with tags
-        Parameters
-        ----------
-        body_tag: Tag
-            Tag & contents of the chapter tag
-        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
-            list of conditions when fire function
-        action: function
-            action what to do with tag
-        Returns
-        -------
-        NoReturn
-            Body Tag with processed certain tags
-
-        """
-        for rule in rules:
-            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
-            if rule["condition"]:
-                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
-                    if condition_on_tag[0] == "parent_tags":
-                        for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
-                                                              for tag in tags])):
-                            tag.parent.attrs.update(tag.attrs)
-                            action(body_tag=body_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "child_tags":
-                        for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
-                                                              for tag in tags])):
-                            action(body_tag=body_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "attrs":
-                        for attr in rule["condition"]["attrs"]:
-                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
-                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
-                                action(body_tag=body_tag, tag=tag, rule=rule)
-                    # attr replacer
-                    elif condition_on_tag[0] == "tags":
-                        attr = rule["attr"]
-                        for tag in body_tag.find_all([re.compile(tag) for tag in tags],
-                                                     {attr['name']: re.compile(fr"{attr['value']}")}):
-                            action(body_tag=body_tag, tag=tag, rule=rule)
-            else:
-                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
-                    action(body_tag=body_tag, tag=tag, rule=rule)
-
    def _process_quotes(self):
        """
            Function to process block quotes.
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:

                    table.replaceWith(new_div)

-    @staticmethod
-    def convert_pt_to_px(value: float) -> float:
-        value = float(value)
-        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
-            return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
-        else:
-            return value
-
    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
                        size = match.group(1)
                        units = match.group(2)
                        if units == "pt":
-                            size = self.convert_pt_to_px(size)
+                            value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
+                                if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
+                                else float(size)
+                            size = value
                        sizes.append(float(size))
                width = td.get("width")
                td.attrs = {}
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()

-        for rule in self.preset:
-            self.logger.log(rule["preset_name"].title() + " process.")
-            action = self.name2action[rule["preset_name"]]
-            self._process_tags(self.body_tag, rule["rules"], action)
+        _preprocess_html(html_preprocessor=self.html_preprocessor,
+                         html_soup=self.html_soup)

        # CSS is handled after html processing because of <fonts> that aren't supported by html
        self.logger.log("CSS inline style preprocessing.")
-        self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
+        self.style_preprocessor.process_inline_styles_in_html_soup(
+            self.body_tag)

        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.body_tag)