Add replacer and attr_replacer presets (replacing _process_two_columns, _font_to_span and _preprocessing_headings)

2022-09-05 17:23:03 +03:00
parent 3fcff462d3
commit 9e31d3152c
2 changed files with 106 additions and 121 deletions
--- a/presets/docx_presets.json
+++ b/presets/docx_presets.json
@@ -11,12 +11,63 @@
                        {
                            "name": "title",
                            "value": "footer"
+                        },
+                        {
+                            "name": "id",
+                            "value": "^Table of Contents\\d+"
                        }
                    ]
                }
            }
        ]
    },
+    {
+        "preset_name": "replacer",
+        "rules": [
+            {
+                "tags": ["^font$"],
+                "condition": null,
+                "tag_to_replace": "span"
+            },
+            {
+                "tags": ["^h[6-9]$"],
+                "condition": null,
+                "tag_to_replace": "p"
+            },
+            {
+                "tags": ["^div$"],
+                "condition": {
+                    "parent_tags": null,
+                    "child_tags": null,
+                    "attrs": [
+                        {
+                            "name": "style",
+                            "value": "column-count: 2"
+                        }
+                    ]
+                },
+                "tag_to_replace": "p"
+            }
+        ]
+    },
+    {
+        "preset_name": "attr_replacer",
+        "rules": [
+            {
+                "attr": {
+                    "name": "style",
+                    "value": "column-count: 2"
+                },
+                "condition": {
+                    "tags": ["^p$"]
+                },
+                "attr_to_replace": {
+                    "name": null,
+                    "value": "columns2"
+                }
+            }
+        ]
+    },
    {
        "preset_name": "unwrapper",
        "rules": [
@@ -33,16 +84,7 @@
                        {
                            "name": "lang",
                            "value": "^ru-RU$"
-                        }
-                    ]
-                }
-            },
-            {
-                "tags": ["^font$"],
-                "condition": {
-                    "parent_tags": null,
-                    "child_tags": null,
-                    "attrs": [
+                        },
                        {
                            "name": "face",
                            "value": "^Times New Roman[\\w, ]+$"
@@ -53,7 +95,7 @@
            {
                "tags": ["^p$"],
                "condition": {
-                    "parent_tags": "li",
+                    "parent_tags": ":is(li)",
                    "child_tags": null,
                    "attrs": null
                }
@@ -74,8 +116,8 @@
            {
                "tags": ["^u$"],
                "condition": {
-                    "parent_tags": "a",
-                    "child_tags": "a",
+                    "parent_tags": ":is(a)",
+                    "child_tags": ":is(a)",
                    "attrs": null
                }
            },
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -22,6 +22,8 @@ class HTMLDocxProcessor:
        self.style_processor = style_processor
        self.name2action = {
            "decomposer": self._decompose_tag,
+            "replacer": self._replace_tag,
+            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag
        }
        self.top_level_headers = None
@@ -59,15 +61,34 @@ class HTMLDocxProcessor:
                                f"Tag name: {tag.name}")

    @staticmethod
-    def _decompose_tag(tag):
-        tag.decompose()
+    def _decompose_tag(**kwargs):
+        kwargs["tag"].decompose()

    @staticmethod
-    def _unwrap_tag(tag):
-        tag.unwrap()
+    def _replace_tag(**kwargs):
+        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
+        kwargs["tag"].name = tag_to_replace

    @staticmethod
-    def _process_tags(body_tag: BeautifulSoup,
+    def _replace_attr(**kwargs):
+        attr, attr_value =\
+            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
+        attr_to_replace, attr_value_to_replace =\
+            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
+        if attr_to_replace:
+            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
+            if attr_value_to_replace:
+                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
+            del kwargs["tag"][attr]
+        elif attr_value_to_replace:
+            kwargs["tag"].attrs[attr] = attr_value_to_replace
+
+    @staticmethod
+    def _unwrap_tag(**kwargs):
+        kwargs["tag"].unwrap()
+
+    @staticmethod
+    def _process_tags(body_tag: Tag,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
@@ -87,104 +108,32 @@ class HTMLDocxProcessor:

        """
        for rule in rules:
-            tags: List[str] = rule["tags"]
+            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == "parent_tags":
                        for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
                                                              for tag in tags])):
                            tag.parent.attrs.update(tag.attrs)
-                            action(tag)
+                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "child_tags":
                        for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
                                                              for tag in tags])):
-                            action(tag)
+                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
-                                action(tag)
+                                action(body_tag=body_tag, tag=tag, rule=rule)
+                    # attr replacer
+                    elif condition_on_tag[0] == "tags":
+                        attr = rule["attr"]
+                        for tag in body_tag.find_all([re.compile(tag) for tag in tags],
+                                                     {attr['name']: re.compile(fr"{attr['value']}")}):
+                            action(body_tag=body_tag, tag=tag, rule=rule)
            else:
                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
-                    action(tag)
-
-    @classmethod
-    def convert_pt_to_px(cls, value: float) -> float:
-        value = float(value)
-        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
-            return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
-        else:
-            return value
-
-    @classmethod
-    def convert_font_pt_to_px(cls, style: str) -> str:
-        """
-        Function converts point in the font-size to pixels.
-        Parameters
-        ----------
-        style: str
-            str with style to proces
-
-        Returns
-        -------
-        : str
-            str with converted style
-
-        """
-        size = re.search(r"font-size: (\d{1,3})pt", style)
-        if size is None:
-            return style
-        size = size.group(1)
-        new_size = cls.convert_pt_to_px(size)
-        if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
-            return ""
-        return re.sub(size + "pt", str(new_size) + "px", style)
-
-    def _font_to_span(self):
-        """
-        Function to convert <font> tag to <span>.
-        If font style is default, then remove this tag.
-        """
-        fonts = self.body_tag.find_all("font")
-        for font in fonts:
-            face, style, color =\
-                font.get("face"), font.get("style"), font.get("color")
-
-            font.attrs, font.name = {}, "span"
-            if style:
-                style = self.convert_font_pt_to_px(style)
-                if style != "":
-                    if color and color in LiveCartaConfig.COLORS_MAP:
-                        style += f"; color: {color};"
-                    font.attrs["style"] = style
-            elif color and color in LiveCartaConfig.COLORS_MAP:
-                font.attrs["style"] = f"color: {color};"
-
-            if len(font.attrs) == 0:
-                font.unwrap()
-
-        # on this step there should be no more <font> tags
-        assert len(self.body_tag.find_all("font")) == 0
-
-    def clean_trash(self):
-        """Function to remove all styles and tags we don"t need."""
-        # todo replacer
-        self._font_to_span()
-
-        # replace toc with empty <TOC> tag
-        tables = self.body_tag.find_all(
-            "div", id=re.compile(r"^Table of Contents\d+"))
-        for table in tables:
-            table.wrap(self.html_soup.new_tag("TOC"))
-            table.decompose()
-
-    def _preprocessing_headings(self):
-        # todo replacer
-        """Function to convert all lower level headings to p tags"""
-        pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$"
-        header_tags = self.body_tag.find_all(re.compile(pattern))
-        for tag in header_tags:
-            tag.name = "p"
+                    action(body_tag=body_tag, tag=tag, rule=rule)

    def _process_paragraph(self):
        """Function to process <p> tags (text-align and text-indent value)."""
@@ -237,16 +186,6 @@ class HTMLDocxProcessor:
            if style:
                p.attrs["style"] = style

-    def _process_two_columns(self):
-        """Function to process paragraphs which has two columns layout."""
-        # todo replacer
-        two_columns = self.body_tag.find_all("div", style="column-count: 2")
-        for div in two_columns:
-            for child in div.children:
-                if child.name == "p":
-                    child["class"] = "columns2"
-            div.unwrap()
-
    def _process_quotes(self):
        """
            Function to process block quotes.
@@ -285,6 +224,14 @@ class HTMLDocxProcessor:

                    table.replaceWith(new_div)

+    @staticmethod
+    def convert_pt_to_px(value: float) -> float:
+        value = float(value)
+        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
+            return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
+        else:
+            return value
+
    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
@@ -490,24 +437,20 @@ class HTMLDocxProcessor:
        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()

+        for rule in self.preset:
+            self.logger.log(rule["preset_name"] + " process.")
+            action = self.name2action[rule["preset_name"]]
+            self._process_tags(self.body_tag, rule["rules"], action)
+
        self.logger.log("CSS inline style preprocessing.")
        self.style_processor.process_inline_styles_in_html_soup(self.html_soup)

        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.html_soup)

-        for rule in self.preset:
-            self.logger.log(rule["preset_name"] + " process.")
-            action = self.name2action[rule["preset_name"]]
-            self._process_tags(self.body_tag, rule["rules"], action)
-
-        self.clean_trash()
-
        # process main elements of the .html doc
        self.logger.log(f"Processing main elements of html.")
-        self._preprocessing_headings()
        self._process_paragraph()
-        self._process_two_columns()

        self.logger.log("Block quotes processing.")
        self._process_quotes()