comment duplicate_styles_check cause of transform

2022-07-08 18:36:09 +03:00
parent 1926377a34
commit 7d5c1bfdf2
2 changed files with 13 additions and 14 deletions
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -0,0 +1,397 @@
+import re
+from bs4 import BeautifulSoup, NavigableString, Comment, Tag
+
+from src.util.helpers import BookLogger
+
+
+class HtmlEpubPreprocessor:
+    def __init__(self, preset, logger=None):
+        """
+        Function stores the preset and registers the handlers of preset items
+        Parameters
+        ----------
+        preset: iterable of dict
+            every item is read in prepare_content by its "preset_name" and "rules" keys
+        logger: BookLogger
+            optional logger, stored as is
+
+        """
+        self.preset = preset
+        self.logger: BookLogger = logger
+        # "preset_name" of a preset item -> method that applies the rules of this item
+        handlers = (
+            ("table_wrapper", self._wrap_tags_with_table),
+            ("replacer", self._tags_to_correspond_livecarta_tag),
+            ("unwrapper", self._unwrap_tags),
+            ("inserter", self._insert_tags_into_correspond_tags),
+        )
+        self.name2function = dict(handlers)
+
+    @staticmethod
+    def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
+        """
+        Function adds span with id from tag_to_be_removed
+        because this tag will be removed(unwrapped/extract)
+        Parameters
+        ----------
+        tag_to_be_removed: Soup object
+            node that is about to be removed; nodes without attributes
+            (NavigableString) and tags without id are ignored
+        chapter_tag: BeautifulSoup
+            soup used to create the span; a plain Tag is accepted too,
+            then the soup it belongs to is used
+
+        Returns
+        -------
+        None
+            updated body tag
+
+        """
+        # strings have no attrs, so there is no id to save for them
+        if not isinstance(tag_to_be_removed, Tag):
+            return
+        id_ = tag_to_be_removed.attrs.get("id")
+        if not id_:
+            return
+
+        # new_tag() is defined only on BeautifulSoup: on a plain Tag the attribute
+        # lookup is treated as a search for a <new_tag> child and returns None
+        soup = chapter_tag if isinstance(chapter_tag, BeautifulSoup) else next(
+            (parent for parent in chapter_tag.parents if isinstance(parent, BeautifulSoup)), None)
+        if soup is None:
+            soup = BeautifulSoup(features="lxml")
+
+        # span that is inserted before the tag that isn't supported by LiveCarta
+        new_tag = soup.new_tag("span")
+        new_tag.attrs["id"] = id_
+        new_tag.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
+        new_tag.string = "\xa0"
+        tag_to_be_removed.insert_before(new_tag)
+
+    @staticmethod
+    def prepare_title(title_of_chapter: str) -> str:
+        """
+        Function finalise processing/cleaning title
+        Parameters
+        ----------
+        title_of_chapter: str
+            raw title, may contain markup
+
+        Returns
+        -------
+        title: str
+            cleaned title (empty string for an empty title)
+
+        """
+        # get_text() joins all the strings of the parsed title, while .string is None
+        # for an empty title or a title with nested markup and breaks re.sub below
+        title = BeautifulSoup(title_of_chapter or "", features="lxml").get_text()
+        # replace every whitespace character ([\r\n\t\f\v ] and nbsp) with a space
+        title = re.sub(r"[\s\xa0]", " ", title).strip()
+        return title
+
+    @staticmethod
+    def _remove_comments(chapter_tag):
+        """
+        Function remove comments
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag without comments
+
+        """
+        # one search over all the strings below chapter_tag: it also finds the comments
+        # that are direct children of chapter_tag and doesn't rescan every subtree
+        for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+    @staticmethod
+    def _wrap_strings_with_p(chapter_tag):
+        """
+        Function wraps NavigableStrings that are direct children of chapter tag with <p>
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag with wrapped NavigableStrings
+
+        """
+        # iterate over a copy: replace_with() changes chapter_tag.contents
+        for node in list(chapter_tag.contents):
+            if not isinstance(node, NavigableString):
+                continue
+            # Comment is a NavigableString too, its text must not become visible
+            if isinstance(node, Comment):
+                continue
+            # strings that consist of whitespace characters only stay as they are
+            if not re.sub(r"[\s\xa0]", " ", str(node)).strip():
+                continue
+            p_tag = chapter_tag.new_tag("p")
+            p_tag.append(str(node))
+            node.replace_with(p_tag)
+
+    def _wrap_tags_with_table(self, chapter_tag, rules: list):
+        """
+        Function wraps <tag> with <table>
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            every rule has "tags" (regex patterns of tag names) and
+            "attrs" (list of dicts with "name" and regex "value")
+
+        Returns
+        -------
+        None
+            Chapter Tag with wrapped certain tags with <table>
+
+        """
+
+        def _wrap_tag_with_table(tag_to_be_wrapped, width="100", border="", bg_color=None):
+            """Function builds table > tbody > tr > td around the tag and adds <br> after the table"""
+            table = chapter_tag.new_tag("table")
+            table.attrs["border"] = border
+            table.attrs["align"] = "center"
+            # NOTE(review): a width that already has a unit (e.g. "50%") gives "width:50%%;"
+            table.attrs["style"] = f"width:{width}%;"
+            tbody = chapter_tag.new_tag("tbody")
+            tr = chapter_tag.new_tag("tr")
+            td = chapter_tag.new_tag("td")
+            # attribute with None value is rendered as a bare "bgcolor", so it is set only if given
+            if bg_color:
+                td.attrs["bgcolor"] = bg_color
+            tag_to_be_wrapped.wrap(td)
+            td.wrap(tr)
+            tr.wrap(tbody)
+            tbody.wrap(table)
+            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
+            return table
+
+        def process_tag_using_table(tag_to_wrap):
+            """Function wraps the tag with table, saves its id in span and unwraps the tag itself"""
+            tag_attrs = tag_to_wrap.attrs
+            _wrap_tag_with_table(
+                tag_to_be_wrapped=tag_to_wrap,
+                width=tag_attrs.get("width") or "100",
+                # empty string instead of None: None is rendered as a bare "border"
+                border=tag_attrs.get("border") or "",
+                bg_color=tag_attrs.get("bgcolor") or None)
+            self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
+            tag_to_wrap.unwrap()
+
+        for rule in rules:
+            # patterns of names are the same for every attr of the rule
+            name_patterns = [re.compile(tag) for tag in rule["tags"]]
+            for attr in rule["attrs"]:
+                attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
+                for tag_to_wrap in chapter_tag.find_all(name_patterns, attr_filter):
+                    process_tag_using_table(tag_to_wrap)
+
+    @staticmethod
+    def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
+        """
+        Function to replace all tags to correspond LiveCarta tags
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            every rule has "tags" (regex patterns of tag names), "tag_to_replace" (new name)
+            and optional "condition" dict with "parent_tags"/"child_tags"/"attrs" keys
+
+        Returns
+        -------
+        None
+            Chapter Tag with all tags replaced with LiveCarta tags
+
+        """
+        for rule in rules:
+            name_patterns = [re.compile(tag) for tag in rule["tags"]]
+            tag_to_replace = rule["tag_to_replace"]
+            condition = rule.get("condition")
+            if not condition:
+                for tag in chapter_tag.find_all(name_patterns):
+                    # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
+                    tag.name = tag_to_replace
+                continue
+            # every non-empty condition is applied on its own, one after another
+            for condition_name, condition_value in condition.items():
+                if not condition_value:
+                    continue
+                if condition_name == 'parent_tags':
+                    for tag in chapter_tag.find_all(name_patterns):
+                        if tag.parent.select(condition_value):
+                            tag.name = tag_to_replace
+                elif condition_name == 'child_tags':
+                    # ":not(selector)" -> "selector"; only ":not" is dropped, so names
+                    # that contain "not" (e.g. footnote) are not damaged
+                    child_selector = re.sub(r":not|[():]", "", condition_value)
+                    for tag in chapter_tag.find_all(name_patterns):
+                        if not tag.select(child_selector):
+                            tag.name = tag_to_replace
+                elif condition_name == "attrs":
+                    for attr in condition_value:
+                        attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
+                        for tag in chapter_tag.find_all(name_patterns, attr_filter):
+                            tag.name = tag_to_replace
+
+    def _unwrap_tags(self, chapter_tag, rules: dict):
+        """
+        Function unwraps the tags selected by rules and saves their ids in spans
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: dict
+            rules["tags"] holds CSS selectors of the tags to be unwrapped
+
+        Returns
+        -------
+        None
+            Chapter Tag with unwrapped certain tags
+
+        """
+        for selector in rules["tags"]:
+            # selector with ">" points at a subtag: its attributes are copied to the parent
+            is_subtag = ">" in selector
+            for found_tag in chapter_tag.select(selector):
+                if is_subtag:
+                    found_tag.parent.attrs.update(found_tag.attrs)
+                self._add_span_to_save_ids_for_links(found_tag, chapter_tag)
+                found_tag.unwrap()
+
+    @staticmethod
+    def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
+        """
+        Function inserts tags into correspond tags
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            every rule has "tags" (regex patterns of tag names), "tag_to_insert" (name of
+            the inserted tag) and optional "condition" dict with
+            "parent_tags"/"child_tags"/"attrs" keys
+
+        Returns
+        -------
+        None
+            Chapter Tag with inserted tags
+
+        """
+        def insert(tag, name_to_insert):
+            # every tag gets its own subtag: one shared subtag would be moved from tag
+            # to tag and would collect the contents of all of them
+            tag_to_insert = chapter_tag.new_tag(name_to_insert)
+            # move all items that was in tag to subtag, order is kept
+            for content in list(tag.contents):
+                tag_to_insert.append(content.extract())
+            # subtag with items becomes the only child of tag
+            tag.append(tag_to_insert)
+
+        for rule in rules:
+            name_patterns = [re.compile(tag) for tag in rule["tags"]]
+            name_to_insert = rule["tag_to_insert"]
+            condition = rule.get("condition")
+            if not condition:
+                for tag in chapter_tag.find_all(name_patterns):
+                    insert(tag, name_to_insert)
+                continue
+            # every non-empty condition is applied on its own, one after another
+            for condition_name, condition_value in condition.items():
+                if not condition_value:
+                    continue
+                if condition_name == 'parent_tags':
+                    for tag in chapter_tag.find_all(name_patterns):
+                        if tag.parent.select(condition_value):
+                            insert(tag, name_to_insert)
+                elif condition_name == 'child_tags':
+                    # ":not(selector)" -> "selector"; only ":not" is dropped, so names
+                    # that contain "not" (e.g. footnote) are not damaged
+                    child_selector = re.sub(r":not|[():]", "", condition_value)
+                    for tag in chapter_tag.find_all(name_patterns):
+                        if not tag.select(child_selector):
+                            insert(tag, name_to_insert)
+                elif condition_name == "attrs":
+                    for attr in condition_value:
+                        attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
+                        for tag in chapter_tag.find_all(name_patterns, attr_filter):
+                            insert(tag, name_to_insert)
+
+    def _remove_headings_content(self, content_tag, title_of_chapter: str):
+        """
+        Function
+        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
+        - adds span with id of the removed tag in order to keep the links to it
+        Only the first non-empty node is checked on every level: it is removed if it
+        repeats the title, otherwise the search goes down into it (if it is a tag).
+        Non-empty strings that don't repeat the title are skipped.
+        Parameters
+        ----------
+        content_tag: soup object
+            Tag of the page
+        title_of_chapter: str
+            Chapter title
+
+        Returns
+        -------
+        None
+            clean/remove headings & add span with id
+
+        """
+        title_of_chapter = title_of_chapter.lower()
+        # empty title is a substring of any text, so every first heading would be removed
+        if not title_of_chapter:
+            return
+
+        # span is created with new_tag() that exists only on BeautifulSoup,
+        # so nested levels need the soup and not the tag that is being scanned
+        soup = content_tag if isinstance(content_tag, BeautifulSoup) else next(
+            (parent for parent in content_tag.parents if isinstance(parent, BeautifulSoup)), None)
+        if soup is None:
+            soup = BeautifulSoup(features="lxml")
+
+        def _scan(container):
+            for node in container.contents:
+                is_string = isinstance(node, NavigableString)
+                text = node if is_string else node.text
+                if not re.sub(r"[\s\xa0]", "", text):
+                    continue
+                text = re.sub(r"[\s\xa0]", " ", text).lower()
+                text = text.strip()  # delete extra spaces
+                # strings have no name, then name of the tag that holds them is checked
+                node_name = getattr(node, "name", None) or container.name
+                if title_of_chapter == text or \
+                        (title_of_chapter in text and re.match(r"^h[1-3]$", node_name)):
+                    # strings have no attrs, so there is no id to save for them
+                    if not is_string:
+                        self._add_span_to_save_ids_for_links(node, soup)
+                    node.extract()
+                    return
+                if not is_string:
+                    # go down into the first non-empty tag and stop on this level
+                    _scan(node)
+                    return
+
+        _scan(content_tag)
+
+    @staticmethod
+    def _process_tables(chapter_tag: BeautifulSoup):
+        """
+        Function preprocesses tables and tags(td|th|tr)
+        - width from style (pt/px) is copied to width attribute if the attribute is empty
+        - "border:0;" is removed from style, empty style is deleted
+        - table without border (or with border 0) gets border="1"
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag with processed tables
+
+        """
+        cell_names = re.compile("td|th|tr")
+        # lookbehind instead of [^-]: "width" at the very beginning of style has no
+        # character before it, but min-width/max-width still must be skipped
+        width_in_style = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")
+
+        for table in chapter_tag.find_all("table"):
+            for t_tag in table.find_all(cell_names):
+                width = ""
+                style = t_tag.get("style")
+                if style:
+                    width_match = width_in_style.search(style)
+                    if width_match:
+                        # NOTE(review): pt values are written as px without conversion
+                        width = width_match.group(1) + "px"
+
+                t_tag.attrs["width"] = t_tag.get("width") or width
+
+                if t_tag.attrs.get("style"):
+                    t_tag.attrs["style"] = t_tag.attrs["style"].replace(
+                        "border:0;", "")
+                    if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
+                        del t_tag.attrs["style"]
+
+            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
+                table.attrs["border"] = "1"
+
+    @staticmethod
+    def _class_removing(chapter_tag):
+        """
+        Function removes classes that aren't created by converter
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag without original classes of the book
+
+        """
+        converter_classes = {"link-anchor", "footnote-element"}
+        for tag in chapter_tag.find_all(recursive=True):
+            class_ = tag.attrs.get("class")
+            if not class_:
+                continue
+            # parsed class is a list of names, class that was set in code may be a string;
+            # list compared with strings is never equal, so both forms are brought to a list
+            class_names = class_.split() if isinstance(class_, str) else list(class_)
+            if not class_names or any(name not in converter_classes for name in class_names):
+                del tag.attrs["class"]
+
+    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
+        """
+        Function finalise processing/cleaning content
+        Parameters
+        ----------
+        title_str: str
+            title of the chapter, used for heading removal
+        content_tag: Tag, soup object
+            content of the chapter, it is changed in place
+        remove_title_from_chapter: bool
+            if True, heading that repeats the title is removed from content
+
+        Steps
+        ----------
+        1. comments removal
+        2. wrap NavigableString with tag <p>
+        3-6. apply preset items in the order of preset:
+            wrap tags with <table>
+            replace tags with correspond LiveCarta tags
+            unwrap tags
+            insert tags into correspond tags
+        7. heading removal
+        8. process_tables
+        9. class removal
+
+        Returns
+        -------
+        content_tag: str
+            prepared content
+
+        """
+        # 1. remove comments
+        self._remove_comments(content_tag)
+        # 2.
+        self._wrap_strings_with_p(content_tag)
+        # 3-6. (loop variable must not be named dict: it hides the builtin)
+        for preset_item in self.preset:
+            func = self.name2function[preset_item["preset_name"]]
+            func(content_tag, preset_item["rules"])
+        # 7.
+        if remove_title_from_chapter:
+            self._remove_headings_content(content_tag, title_str)
+        # 8.
+        self._process_tables(content_tag)
+        # 9. remove classes that weren't created by converter
+        self._class_removing(content_tag)
+        return str(content_tag)