Merge pull request #308 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
2022-10-20 17:08:28 +03:00
parent 7169256a41 a21a4b55b3
commit 0893bdac42
6 changed files with 240 additions and 102 deletions
--- a/presets/docx_presets.json
+++ b/presets/docx_presets.json
@@ -1,5 +1,5 @@
 [
-        {
+    {
        "preset_name": "wrapper",
        "rules": [
            {
@@ -34,7 +34,17 @@
                        {
                            "name": "title",
                            "value": "footer"
-                        },
+                        }
                    ],
                    "text": null
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "id",
                            "value": "^Table of Contents\\d+"
@@ -104,15 +114,44 @@
                "condition": {
                    "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
                    "child_tags": null,
                    "attrs": null,
                    "text": null
                }
            },
            {
                "tags": ["^span$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "style",
                            "value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)"
-                        },
+                        }
                    ],
                    "text": null
                }
            },
            {
                "tags": ["^span$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "lang",
                            "value": "^ru-RU$"
-                        },
+                        }
                    ],
                    "text": null
                }
            },
            {
                "tags": ["^span$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "face",
                            "value": "^Times New Roman[\\w, ]+$"
@@ -148,6 +187,15 @@
                "tags": ["^u$"],
                "condition": {
                    "parent_tags": ":is(a)",
                    "child_tags": null,
                    "attrs": null,
                    "text": null
                }
            },
            {
                "tags": ["^u$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": ":is(a)",
                    "attrs": null,
                    "text": null
--- a/presets/epub_presets.json
+++ b/presets/epub_presets.json
@@ -11,15 +11,42 @@
                        {
                            "name": "width",
                            "value": ".*"
-                        },
+                        }
                    ]
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "border",
                            "value": ".*"
-                        },
+                        }
                    ]
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "style",
                            "value": "border.*"
-                        },
+                        }
                    ]
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "bgcolor",
                            "value": ".*"
@@ -69,7 +96,7 @@
            {
                "tags": ["^code$", "^kbd$", "^var$"],
                "condition": {
-                    "parent_tags": ":not(pre)",
+                    "parent_tags": ":not(pre, span)",
                    "child_tags": null,
                    "attrs": null
                },
@@ -99,6 +126,15 @@
                }
            }
        ]
    },
        {
        "preset_name": "attrs_remover",
        "rules": [
            {
                "tags": ["^sup$"],
                "condition": null
            }
        ]
    },
    {
        "preset_name": "attr_replacer",
@@ -171,4 +207,4 @@
            }
        ]
    }
-]
+]
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -1,5 +1,6 @@
 import json
 import codecs
 import logging
 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
@@ -30,11 +31,19 @@ class EpubBook(BookSolver):
            json for LiveCarta platform
        """
        html_preprocessor = HtmlPresetsProcessor(
            logger=self.logger_object, preset_path="presets/epub_presets.json")
        style_preprocessor = StyleReader()
-        html_processor = HtmlEpubProcessor(logger=self.logger_object,
+        # Parses and cleans html, gets list of tags, gets footnotes
-                                           html_preprocessor=html_preprocessor)
+        try:
            html_preprocessor = HtmlPresetsProcessor(
                logger=self.logger_object, preset_path="presets/epub_presets.json")
            html_processor = HtmlEpubProcessor(logger=self.logger_object,
                                               html_preprocessor=html_preprocessor)
        except Exception as exc:
            self.logger_object.log(
                "Error has occurred while processing .html", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise exc
        json_converter = EpubConverter(
            self.book_path, access=self.access, logger=self.logger_object,
            style_processor=style_preprocessor, html_processor=html_processor)
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -107,12 +107,10 @@ class HtmlEpubProcessor:
                                                        len(text_preparing(tag)) != 0 and
                                                        re.findall(r"^h[1-5]$", tag.name or chapter_tag.name))
        if title_in_text:
-            self.html_preprocessor._add_span_to_save_ids_for_links(
+            self.html_preprocessor.add_span_to_save_ids_for_links(title_in_text[-1], chapter_tag)
                title_in_text[-1], chapter_tag)
            title_in_text[-1].extract()
        elif text_in_title:
-            [self.html_preprocessor._add_span_to_save_ids_for_links(
+            [self.html_preprocessor.add_span_to_save_ids_for_links(tag, chapter_tag) for tag in text_in_title]
                tag, chapter_tag) for tag in text_in_title]
            [tag.extract() for tag in text_in_title]
    @staticmethod
@@ -135,12 +133,12 @@ class HtmlEpubProcessor:
                    and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
                del tag.attrs["class"]
-    def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
+    def prepare_content(self, title: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
        """
        Function finalise processing/cleaning content
        Parameters
        ----------
-        title_str: str
+        title: str
        chapter_tag: BeautifulSoup, soup object
@@ -170,7 +168,7 @@ class HtmlEpubProcessor:
        self._wrap_strings_with_p(chapter_tag)
        # 3.
        if remove_title_from_chapter:
-            self._remove_headings_content(chapter_tag, title_str)
+            self._remove_headings_content(chapter_tag, title)
        # 4.
        _process_presets(
            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
--- a/src/html_presets_processor.py
+++ b/src/html_presets_processor.py
@@ -2,7 +2,7 @@ import re
 import json
 from bs4 import BeautifulSoup, Tag
 from bs4.element import PageElement
-from typing import List, Dict, Union
+from typing import List, Set, Dict, Union
 from src.util.helpers import BookLogger
@@ -16,15 +16,60 @@ class HtmlPresetsProcessor:
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attrs_remover": self._remove_attrs,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag,
            "text_replacer": self._replace_text
        }
        self.conditions = {
            "parent_tags": self._tags_with_parent_condition,
            "child_tags": self._tags_with_child_condition,
            "attrs": self._tags_with_attrs_condition,
            "text": self._tags_with_text_condition
        }
    @staticmethod
    def _tags_with_parent_condition(**kwargs):
        """
        Function collects tags returned by find_all() on every element that
        the parent condition selects
        Parameters
        ----------
        kwargs: dict
            body_tag - object queried with .select()
            family_condition - selector handed to body_tag.select()
            tags - regex strings compiled and handed to find_all()

        Returns
        -------
        tuple
            (True when at least one tag was collected, list of collected tags)

        """
        # NOTE(review): tags are gathered in a set, so elements that hash and
        # compare equal collapse into one entry and the returned order follows
        # set iteration - confirm this is intended for bs4 Tag objects.
        matched = {
            descendant
            for ancestor in kwargs["body_tag"].select(kwargs["family_condition"])
            for descendant in ancestor.find_all(
                [re.compile(pattern) for pattern in kwargs["tags"]])
        }
        return bool(matched), list(matched)
    @staticmethod
    def _tags_with_child_condition(**kwargs):
        """
        Function collects tags matching the tag patterns for which
        .select() with the child condition returns a non-empty result
        Parameters
        ----------
        kwargs: dict
            body_tag - object queried with .find_all()
            tags - regex strings compiled and handed to find_all()
            family_condition - selector handed to .select() of every candidate

        Returns
        -------
        tuple
            (True when at least one tag was collected, list of collected tags)

        """
        search = kwargs["body_tag"].find_all
        candidates = search([re.compile(pattern) for pattern in kwargs["tags"]])
        # NOTE(review): a set collapses candidates that hash and compare equal
        # and does not keep the search order - confirm this is intended.
        matched = {
            candidate
            for candidate in candidates
            if candidate.select(kwargs["family_condition"])
        }
        return bool(matched), list(matched)
    @staticmethod
    def _tags_with_attrs_condition(**kwargs):
        """
        Function collects tags matching the tag patterns and the attribute
        patterns listed in rule["condition"]["attrs"]
        Parameters
        ----------
        kwargs: dict
            body_tag - object queried with .find_all()
            tags - regex strings compiled and handed to find_all()
            rule - preset rule; rule["condition"]["attrs"] is a list of
                   {"name": ..., "value": ...} entries, value being a regex string

        Returns
        -------
        tuple
            (True when at least one tag was collected, list of collected tags)

        """
        attr_specs = kwargs["rule"]["condition"]["attrs"]
        # attribute name -> compiled pattern; a repeated name keeps its last pattern
        attr_filter = dict(zip(
            [spec["name"] for spec in attr_specs],
            [re.compile(spec["value"]) for spec in attr_specs]))
        search = kwargs["body_tag"].find_all
        # NOTE(review): a set collapses tags that hash and compare equal and
        # does not keep the search order - confirm this is intended.
        matched = {
            tag
            for tag in search([re.compile(pattern) for pattern in kwargs["tags"]],
                              attr_filter)
        }
        return bool(matched), list(matched)
    @staticmethod
    def _tags_with_text_condition(**kwargs):
        """
        Function collects tags whose name matches one of the tag patterns
        and whose text matches rule["condition"]["text"]
        Parameters
        ----------
        kwargs: dict
            body_tag - object queried with .find_all()
            tags - regex strings, joined with "|" into one name pattern
            rule - preset rule; rule["condition"]["text"] is a regex string

        Returns
        -------
        tuple
            (True when at least one tag was collected, list of collected tags)

        """
        def name_and_text_match(candidate):
            # the name is tested first; the text pattern is only evaluated
            # for candidates whose name already matched
            name_pattern = r"(?=(" + '|'.join(kwargs["tags"]) + r"))"
            if not re.search(name_pattern, candidate.name):
                return None
            return re.search(re.compile(kwargs["rule"]["condition"]["text"]),
                             candidate.text)

        # NOTE(review): a set collapses tags that hash and compare equal and
        # does not keep the search order - confirm this is intended.
        matched = {tag for tag in kwargs["body_tag"].find_all(name_and_text_match)}
        return bool(matched), list(matched)
    @staticmethod
    def _wrap_tag(**kwargs):
-        kwargs["tag"].wrap(kwargs["body_tag"].new_tag(
+        kwargs["found_tag"].wrap(kwargs["body_tag"].new_tag(
            kwargs["rule"]["tag_to_wrap"]["name"]))
    @staticmethod
@@ -34,13 +79,13 @@ class HtmlPresetsProcessor:
                parent_tag.attrs[key] = tag.attrs[key]
    def _decompose_tag(self, **kwargs):
-        if kwargs["tag"].parent:
+        if kwargs["found_tag"].parent:
-            self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent)
+            self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
-        kwargs["tag"].decompose()
+        kwargs["found_tag"].decompose()
    @staticmethod
-    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
+    def add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
-                                        chapter_tag: BeautifulSoup):
+                                       chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
@@ -82,29 +127,33 @@ class HtmlPresetsProcessor:
                kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag(
                    "tr"), kwargs["body_tag"].new_tag("td")
            td.attrs["bgcolor"] = bg_color
-            kwargs["tag"].wrap(td)
+            kwargs["found_tag"].wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table
        _wrap_tag_with_table(
-            width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
+            width=kwargs["found_tag"].attrs["width"] if kwargs["found_tag"].attrs.get(
                "width") else "100",
-            border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
+            border=kwargs["found_tag"].attrs["border"] if kwargs["found_tag"].attrs.get(
                "border") else None,
-            bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
+            bg_color=kwargs["found_tag"].attrs["bgcolor"] if kwargs["found_tag"].attrs.get("bgcolor") else None)
-        self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["body_tag"])
+        self.add_span_to_save_ids_for_links(kwargs["found_tag"], kwargs["body_tag"])
-        kwargs["tag"].unwrap()
+        kwargs["found_tag"].unwrap()
    @staticmethod
    def _replace_tag(**kwargs):
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"]
-        kwargs["tag"].name = tag_to_replace
+        kwargs["found_tag"].name = tag_to_replace
        if kwargs["rule"]["tag_to_replace"].get("attrs"):
            dict_attributes = {attr["name"]: attr["value"]
                               for attr in kwargs["rule"]["tag_to_replace"]["attrs"]}
-            kwargs["tag"].attrs = dict_attributes
+            kwargs["found_tag"].attrs = dict_attributes
    @staticmethod
    def _remove_attrs(**kwargs):
        """
        Function drops every attribute of the found tag
        Parameters
        ----------
        kwargs: dict
            found_tag - object whose attrs is replaced with a new empty dict

        """
        target = kwargs["found_tag"]
        target.attrs = dict()
    @staticmethod
    def _replace_attr(**kwargs):
@@ -114,21 +163,21 @@ class HtmlPresetsProcessor:
        attr_to_replace, attr_value_to_replace =\
            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
        if attr_to_replace:
-            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \
+            kwargs["found_tag"][attr_to_replace] = kwargs["found_tag"][attr_name] \
-                if kwargs["tag"].get(attr_name)\
+                if kwargs["found_tag"].get(attr_name)\
                else ""
            if attr_value_to_replace:
-                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
+                kwargs["found_tag"].attrs[attr_to_replace] = attr_value_to_replace
-            del kwargs["tag"][attr_name]
+            del kwargs["found_tag"][attr_name]
        elif attr_value_to_replace:
-            kwargs["tag"].attrs[attr_name] = attr_value_to_replace
+            kwargs["found_tag"].attrs[attr_name] = attr_value_to_replace
        elif attr_name:
-            del kwargs["tag"][attr_name]
+            del kwargs["found_tag"][attr_name]
    def _unwrap_tag(self, **kwargs):
-        if kwargs["tag"].parent:
+        if kwargs["found_tag"].parent:
-            self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent)
+            self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
-        kwargs["tag"].unwrap()
+        kwargs["found_tag"].unwrap()
    @staticmethod
    def _insert_tag(**kwargs):
@@ -138,29 +187,29 @@ class HtmlPresetsProcessor:
            kwargs["body_tag"].new_tag(
                kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes)
        # move every item that was in the tag into the subtag, removing it from the tag
-        for content in reversed(kwargs["tag"].contents):
+        for content in reversed(kwargs["found_tag"].contents):
            tag_to_insert.insert(0, content.extract())
        # wrap subtag with items
-        kwargs["tag"].append(tag_to_insert)
+        kwargs["found_tag"].append(tag_to_insert)
    @staticmethod
    def _replace_text(**kwargs):
-        if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string):
+        if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["found_tag"].string):
            new_text = re.sub(re.compile(
-                kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string)
+                kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["found_tag"].string)
-            kwargs["tag"].string.replace_with(new_text)
+            kwargs["found_tag"].string.replace_with(new_text)
-    @staticmethod
+    def process_tags(self,
-    def _process_tags(body_tag: BeautifulSoup,
+                     body_tag: BeautifulSoup,
-                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
+                     preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
-                      action):
+                     action):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
-        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
+        preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag
@@ -170,39 +219,34 @@ class HtmlPresetsProcessor:
            Body Tag with processed certain tags
        """
-        for rule in rules:
+        for preset_rule in preset_rules:
-            tags: List[str] = rule["tags"] if rule.get(
+            tags: List[str] = preset_rule["tags"] if preset_rule.get(
-                "tags") else rule["condition"]["tags"]
+                "tags") else preset_rule["condition"]["tags"]
-            if rule["condition"]:
+            found_tags: List[Tag] = []
-                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
+            if preset_rule["condition"]:
-                    if condition_on_tag[0] == "parent_tags":
+                conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v)
-                        for parent_tag in body_tag.select(condition_on_tag[1]):
+                for condition_on_tag in conditions_on_tag:
-                            for tag in parent_tag.find_all([re.compile(tag) for tag in tags]):
+                    condition_func = self.conditions[condition_on_tag[0]]
-                                # parent_tag != tag.parent
+                    was_found, f_tags = condition_func(body_tag=body_tag,
-                                action(body_tag=body_tag, tag=tag, rule=rule)
+                                                       tags=tags,
-                    elif condition_on_tag[0] == "child_tags":
+                                                       rule=preset_rule,
-                        for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
+                                                       family_condition=condition_on_tag[1])
-                            if tag.select(condition_on_tag[1]):
+                    found_tags = found_tags + f_tags if was_found else []
-                                action(body_tag=body_tag, tag=tag, rule=rule)
+                    if not was_found:
-                    elif condition_on_tag[0] == "attrs":
+                        break
-                        for attr in rule["condition"]["attrs"]:
+                # if there are several conditions on tags and found_tags isn't empty
-                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
+                if len(conditions_on_tag) > 1 and found_tags:
-                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
+                    # tags satisfying all conditions(>1)
-                                action(body_tag=body_tag, tag=tag, rule=rule)
+                    found_tags = [tag for tag in found_tags if found_tags.count(tag) > 1]
-                    elif condition_on_tag[0] == "text":
+                for found_tag in found_tags:
-                        # find all tags that are in List of tags and tags that contains required text
+                    action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
                        for tag in body_tag.find_all(
                                lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))",
                                                    t.name) and re.search(re.compile(rule["condition"]["text"]),
                                                                          t.text)):
                            action(body_tag=body_tag, tag=tag, rule=rule)
            else:
-                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
+                for found_tag in body_tag.find_all([re.compile(tag) for tag in tags]):
-                    action(body_tag=body_tag, tag=tag, rule=rule)
+                    action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
 def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup):
-    for rule in html_preprocessor.preset:
+    for preset in html_preprocessor.preset:
        # html_preprocessor.logger.log(rule["preset_name"].title() + " process.")
-        action = html_preprocessor.name2action[rule["preset_name"]]
+        action = html_preprocessor.name2action[preset["preset_name"]]
-        html_preprocessor._process_tags(html_soup, rule["rules"], action)
+        html_preprocessor.process_tags(html_soup, preset["rules"], action)
--- a/src/style_reader.py
+++ b/src/style_reader.py
@@ -109,24 +109,27 @@ class StyleReader:
        return constraints_on_value, value_not_in_possible_values_list
    def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
-        for i, style in reversed(list(enumerate(split_style))):
+        try:
-            style_name, style_value = style.split(":")
+            for i, style in reversed(list(enumerate(split_style))):
-            if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
+                style_name, style_value = style.split(":")
-                # property not in LIVECARTA_STYLE_ATTRS, remove
+                if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
-                split_style.remove(style)
+                    # property not in LIVECARTA_STYLE_ATTRS, remove
-                continue
+                    split_style.remove(style)
                    continue
-            cleaned_value = self.clean_value(style_value, style_name)
+                cleaned_value = self.clean_value(style_value, style_name)
-            if all(self.style_conditions(cleaned_value, style_name)):
+                if all(self.style_conditions(cleaned_value, style_name)):
-                # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
+                    # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
-                split_style.remove(style)
+                    split_style.remove(style)
-                continue
+                    continue
-            else:
+                else:
-                if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
+                    if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
-                    # function that converts our data
+                        # function that converts our data
-                    func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
+                        func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
-                    style_value = func(cleaned_value)
+                        style_value = func(cleaned_value)
-            split_style[i] = style_name + ":" + style_value
+                split_style[i] = style_name + ":" + style_value
        except ValueError as ve:
            print(f"Style value isn't correct.")
        return split_style
    def build_inline_style_content(self, style: str) -> str: