Merge pull request #324 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
2022-12-12 13:21:55 +03:00
parent 41359e7906 02127d8def
commit 38d8024292
5 changed files with 31 additions and 69 deletions
--- a/preset/epub_presets.json
+++ b/preset/epub_presets.json
@@ -2,58 +2,6 @@
    {
        "preset_name": "table_wrapper",
        "rules": [
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "width",
                            "value": ".*"
                        }
                    ]
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "border",
                            "value": ".*"
                        }
                    ]
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "style",
                            "value": "border.*"
                        }
                    ]
                }
            },
            {
                "tags": ["^div$"],
                "condition": {
                    "parent_tags": null,
                    "child_tags": null,
                    "attrs": [
                        {
                            "name": "bgcolor",
                            "value": ".*"
                        }
                    ]
                }
            },
            {
                "tags": ["^section$", "^blockquote$"],
                "condition": {
@@ -73,7 +21,7 @@
        "preset_name": "replacer",
        "rules": [
            {
-                "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
+                "tags": ["^h[6-9]$", "^figure$", "^section$", "blockquote"],
                "condition": null,
                "tag_to_replace": {
                    "name": "p"
@@ -127,7 +75,7 @@
            }
        ]
    },
-        {
+    {
        "preset_name": "attrs_remover",
        "rules": [
            {
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -52,12 +52,13 @@ class EpubBook(BookSolver):
 if __name__ == "__main__":
-    epub_file_path = f"../../books/epub/9781614382264.epub"
+    epub_file_path = f"../../books/epub/Deep_Learning_with_Python_Second_Editio.epub"
    logger_object = BookLogger(name="epub")
    logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1])
    html_preprocessor = HtmlPresetsProcessor(
        logger=logger_object, preset_path="../../preset/epub_presets.json")
    style_preprocessor = StyleReader()
    html_processor = HtmlEpubProcessor(logger=logger_object,
--- a/src/html_presets_processor.py
+++ b/src/html_presets_processor.py
@@ -2,7 +2,7 @@ import re
 import json
 from bs4 import BeautifulSoup, Tag
 from bs4.element import PageElement
-from typing import List, Set, Dict, Union
+from typing import Union
 from src.util.helpers import BookLogger
@@ -29,42 +29,49 @@ class HtmlPresetsProcessor:
            "text": self._tags_with_text_condition
        }
    @staticmethod
    def _tags_with_parent_condition(**kwargs):
-        found_tags: Set[Tag] = set()
+        found_tags: list[Tag] = list()
        # add a unique id to each found tag so that the same tag is not added
        # to found_tags twice (a tag nested in a subtag could be found again)
        u_id = 0
        for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]):
            for tag in parent_tag.find_all([re.compile(tag) for tag in kwargs["tags"]]):
-                found_tags.add(tag)
+                if not tag.attrs.get("unique_id"):
                    tag.attrs["unique_id"] = u_id
                    u_id += 1
                    found_tags.append(tag)
        return len(found_tags) != 0, list(found_tags)
    @staticmethod
    def _tags_with_child_condition(**kwargs):
-        found_tags: Set[Tag] = set()
+        found_tags: list[Tag] = list()
        for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]]):
            if tag.select(kwargs["family_condition"]):
-                found_tags.add(tag)
+                found_tags.append(tag)
        return len(found_tags) != 0, list(found_tags)
    @staticmethod
    def _tags_with_attrs_condition(**kwargs):
-        found_tags: Set[Tag] = set()
+        found_tags: list[Tag] = list()
        names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]]
        values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]]
-        attr_conditions: Dict[str, str] = dict(zip(names, values))
+        attr_conditions: dict[str, re] = dict(zip(names, values))
        for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]],
                                               attr_conditions):
-            found_tags.add(tag)
+            found_tags.append(tag)
        return len(found_tags) != 0, list(found_tags)
    @staticmethod
    def _tags_with_text_condition(**kwargs):
        # find all tags whose name matches the list of tags and whose text contains the required text
-        found_tags: Set[Tag] = set()
+        found_tags: list[Tag] = list()
        for tag in kwargs["body_tag"].find_all(
                lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))",
                                    t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]),
                                                          t.text)):
-            found_tags.add(tag)
+            found_tags.append(tag)
        return len(found_tags) != 0, list(found_tags)
    @staticmethod
@@ -104,7 +111,7 @@ class HtmlPresetsProcessor:
        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
                                               tag_to_be_removed: Tag,
                                               id_: str,
-                                               class_: Union[List[str], str]):
+                                               class_: Union[list[str], str]):
            """Function inserts span before tag aren't supported by LiveCarta"""
            new_tag: Tag = chapter_tag.new_tag("span")
            new_tag.attrs["id"] = id_ or ""
@@ -201,7 +208,7 @@ class HtmlPresetsProcessor:
    def process_tags(self,
                     body_tag: BeautifulSoup,
-                     preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
+                     preset_rules: list[dict[str, Union[list[str], str, dict[str, Union[list[dict[str, str]], int, str]]]]],
                     action):
        """
        Function does action with tags
@@ -220,9 +227,9 @@ class HtmlPresetsProcessor:
        """
        for preset_rule in preset_rules:
-            tags: List[str] = preset_rule["tags"] if preset_rule.get(
+            tags: list[str] = preset_rule["tags"] if preset_rule.get(
                "tags") else preset_rule["condition"]["tags"]
-            found_tags: List[Tag] = []
+            found_tags: list[Tag] = []
            if preset_rule["condition"]:
                conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v)
                for condition_on_tag in conditions_on_tag:
--- a/src/livecarta_config.py
+++ b/src/livecarta_config.py
@@ -71,6 +71,8 @@ class LiveCartaConfig:
        "border-left-width": [],
        "border-bottom-width": [],
        "border-top": [],
        "border-right": [],
        "border-left": [],
        "border-bottom": [],
        "list-style-type": [],
        "list-style-image": [],
--- a/src/style_reader.py
+++ b/src/style_reader.py
@@ -32,6 +32,8 @@ class StyleReader:
            "border-left-width": self.convert_tag_style_values,
            "border-bottom-width": self.convert_tag_style_values,
            "border-top": self.convert_tag_style_values,
            "border-right": self.convert_tag_style_values,
            "border-left": self.convert_tag_style_values,
            "border-bottom": self.convert_tag_style_values,
            "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
            "list-style-image": lambda x: "disc",
@@ -88,6 +90,8 @@ class StyleReader:
                values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3)
            elif has_size.group(2) == "in":
                values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96)
            elif has_size.group(2) == "rem":
                values[size_number_idx] = convert_size_number(values[size_number_idx], "rem", 16)
        size_value = " ".join(values)
        return size_value