Annotations in Epub converter

2022-08-03 14:39:13 +03:00
parent 7453029295
commit 78e3ad8911
16 changed files with 259 additions and 192 deletions
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,14 +1,16 @@
 import re
 import json
-from bs4 import BeautifulSoup, NavigableString, Comment, Tag
+from typing import List, Dict, Union
+from bs4 import BeautifulSoup, Tag, NavigableString, Comment
+from bs4.element import PageElement

 from src.util.helpers import BookLogger


 class HtmlEpubPreprocessor:
-    def __init__(self, preset_path="../../presets/presets.json", logger=None):
+    def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
        self.preset = json.load(open(preset_path))
-        self.logger: BookLogger = logger
+        self.logger = logger
        self.name2function = {
            "table_wrapper": self._wrap_tags_with_table,
            "replacer": self._tags_to_correspond_livecarta_tag,
@@ -18,33 +20,37 @@ class HtmlEpubPreprocessor:
        }

    @staticmethod
-    def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
+    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
+                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
-        tag_to_be_removed: Soup object
+        tag_to_be_removed: Union[PageElement, BeautifulSoup]
+
        chapter_tag: BeautifulSoup

        Returns
        -------
-        None
+        NoReturn
            updated body tag

        """
-
-        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
-                                               class_: list):
+        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
+                                               tag_to_be_removed: Tag,
+                                               id_: str,
+                                               class_: Union[List[str], str]):
            """Function inserts span before tag aren't supported by LiveCarta"""
-            new_tag = chapter_tag.new_tag("span")
+            new_tag: Tag = chapter_tag.new_tag("span")
            new_tag.attrs["id"] = id_ or ""
            new_tag.attrs["class"] = class_ or ""
            new_tag.string = "\xa0"
            tag_to_be_removed.insert_before(new_tag)

        if tag_to_be_removed.attrs.get("id"):
-            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
+            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
+                                               tag_to_be_removed=tag_to_be_removed,
                                               id_=tag_to_be_removed.attrs["id"],
                                               class_=tag_to_be_removed.attrs.get("class"))

@@ -78,7 +84,7 @@ class HtmlEpubPreprocessor:

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag without comments

        """
@@ -110,27 +116,32 @@ class HtmlEpubPreprocessor:
                    p_tag.append(str(node))
                    node.replace_with(p_tag)

-    def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
+    def _wrap_tags_with_table(self,
+                              chapter_tag: BeautifulSoup,
+                              rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
        """
        Function wraps <tag> with <table>
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
+        rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]
+            list of conditions under which the function fires

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag with wrapped certain tags with <table>

        """

-        def _wrap_tag_with_table(width="100", border="", bg_color=None):
+        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
            table = chapter_tag.new_tag("table")
            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
                = border, "center", f"width:{width}%;"
            tbody, tr, td = \
-                chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
+                chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
+                    "tr"), chapter_tag.new_tag("td")
            td.attrs["bgcolor"] = bg_color
            tag_to_wrap.wrap(td)
            td.wrap(tr)
@@ -141,8 +152,10 @@ class HtmlEpubPreprocessor:

        def process_tag_using_table():
            _wrap_tag_with_table(
-                width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
-                border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
+                width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
+                    "width") else "100",
+                border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
+                    "border") else None,
                bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
            self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
            tag_to_wrap.unwrap()
@@ -155,23 +168,26 @@ class HtmlEpubPreprocessor:
                    process_tag_using_table()

    @staticmethod
-    def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
+    def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
+                                          rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]):
        """
        Function to replace all tags to correspond LiveCarta tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
+        rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
+            list of conditions under which the function fires

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag with all tags replaced with LiveCarta tags

        """
        for rule in rules:
-            tags = rule["tags"]
-            tag_to_replace = rule["tag_to_replace"]
+            tags: List[str] = rule["tags"]
+            tag_to_replace: str = rule["tag_to_replace"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == 'parent_tags':
@@ -193,40 +209,44 @@ class HtmlEpubPreprocessor:
                    tag.name = tag_to_replace

    @staticmethod
-    def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
+    def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
        """
        Function to replace all tags to correspond LiveCarta tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
+        rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
+            list of conditions under which the function fires

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag with all tags replaced with LiveCarta tags

        """
        for rule in rules:
            attr = rule["attr"]
-            tags = rule["condition"]["tags"]
+            tags: List[str] = rule["condition"]["tags"]
            attr_to_replace = rule["attr_to_replace"]
            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-                                                            {attr: re.compile(r".*")}):
+                                            {attr: re.compile(r".*")}):
                tag[attr_to_replace] = tag[attr]
                del tag[attr]

-    def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
+    def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
        """
        Function unwrap tags and moves id to span
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
+        rules: Dict[str, List[str]]
+            dict of tags to unwrap

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag with unwrapped certain tags

        """
@@ -239,21 +259,23 @@ class HtmlEpubPreprocessor:
                tag.unwrap()

    @staticmethod
-    def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
+    def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]):
        """
        Function inserts tags into correspond tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
+        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
+            list of conditions under which the function fires

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag with inserted tags

        """
-        def insert(tag):
+        def insert(tag: Tag):
            tag_to_insert = \
                chapter_tag.new_tag(rule["tag_to_insert"])
            # insert all items that was in tag to subtag and remove from tag
@@ -263,7 +285,7 @@ class HtmlEpubPreprocessor:
            tag.append(tag_to_insert)

        for rule in rules:
-            tags = rule["tags"]
+            tags: List[str] = rule["tags"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == 'parent_tags':
@@ -283,29 +305,28 @@ class HtmlEpubPreprocessor:
                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                    insert(tag)

-    def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
+    def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
        """
        Function
        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
        - adds span with id in order to
        Parameters
        ----------
-        chapter_tag: soup object
+        chapter_tag: Union[BeautifulSoup, PageElement]
            Tag of the page
        title_of_chapter: str
            Chapter title

        Returns
        -------
-        None
+        NoReturn
            clean/remove headings & add span with id

        """
        title_of_chapter = title_of_chapter.lower()
-        if title_of_chapter == "chapter 1":
-            pass
        for tag in chapter_tag.contents:
-            text = tag if isinstance(tag, NavigableString) else tag.text
+            tag: PageElement
+            text: str = tag if isinstance(tag, NavigableString) else tag.text
            if re.sub(r"[\s\xa0]", "", text):
                text = re.sub(r"[\s\xa0]", " ", text).lower()
                text = text.strip()  # delete extra spaces
@@ -333,7 +354,7 @@ class HtmlEpubPreprocessor:

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag with processed tables

        """
@@ -370,7 +391,7 @@ class HtmlEpubPreprocessor:

        Returns
        -------
-        None
+        NoReturn
            Chapter Tag without original classes of the book

        """
@@ -413,9 +434,9 @@ class HtmlEpubPreprocessor:
        # 2.
        self._wrap_strings_with_p(content_tag)
        # 3-6.
-        for dict in self.preset:
-            func = self.name2function[dict["preset_name"]]
-            func(content_tag, dict['rules'])
+        for rule in self.preset:
+            func = self.name2function[rule["preset_name"]]
+            func(content_tag, rule['rules'])
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(content_tag, title_str)