# BookConverter/src/epub_converter/html_epub_processor.py

import re
import json
from typing import List, Dict, Union
from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment

from src.util.helpers import BookLogger


class HtmlEpubPreprocessor:
    def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
        self.preset = json.load(open(preset_path))
        self.logger = logger
        self.name2function = {
            "table_wrapper": self._wrap_tags_with_table,
            "replacer": self._tags_to_correspond_livecarta_tag,
            "attr_replacer": self._replace_attrs_in_tags,
            "unwrapper": self._unwrap_tags,
            "inserter": self._insert_tags_into_correspond_tags
        }

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function keeps id of the tag that is going to be removed
        (unwrapped/extracted): span with the same id and class
        is inserted right before this tag
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag that will be removed by the caller
        chapter_tag: BeautifulSoup
            soup that is used to create the new span

        Returns
        -------
        None
            chapter tag is updated in place

        """
        anchor_id = tag_to_be_removed.attrs.get("id")
        if not anchor_id:
            # tag without id: there is nothing to preserve
            return

        anchor_span: Tag = chapter_tag.new_tag("span")
        anchor_span.attrs["id"] = anchor_id
        anchor_span.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        # non-breaking space is the only content of the span
        anchor_span.string = "\xa0"
        tag_to_be_removed.insert_before(anchor_span)

    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
        """
        Function finalises processing/cleaning of the title
        Parameters
        ----------
        title_of_chapter: str
            raw title, may contain html markup

        Returns
        -------
        title: str
            text of the title without markup and without leading/trailing
            whitespace; "" for an empty title

        """
        # .string is None for empty markup and for markup with nested tags
        # (e.g. "Chapter <i>One</i>"), which crashed re.sub() with TypeError;
        # get_text() joins all text nodes and returns "" when there are none
        title = BeautifulSoup(title_of_chapter or "", features="lxml").get_text()
        # replace every whitespace character ([\r\n\t\f\v \xa0]) with a space, then trim
        title = re.sub(r"[\s\xa0]", " ", title).strip()
        return title

    @staticmethod
    def _remove_comments(chapter_tag: BeautifulSoup):
        """
        Function removes all html comments from the chapter
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag without comments (updated in place)

        """
        # one search from the root finds comments on every level, including
        # direct children of chapter_tag, which a search inside each
        # descendant tag never reaches
        for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

    @staticmethod
    def _wrap_strings_with_p(chapter_tag: BeautifulSoup):
        """
        Function wraps top-level text nodes of the chapter with <p>
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag with wrapped NavigableStrings (updated in place);
            whitespace-only strings and comments are left untouched

        """
        # iterate over a snapshot, because nodes are replaced inside the loop
        for node in list(chapter_tag.contents):
            # Comment is a subclass of NavigableString: wrapping it would turn
            # hidden comment text into visible paragraph text
            if not isinstance(node, NavigableString) or isinstance(node, Comment):
                continue
            # skip strings that consist of whitespace characters only
            if not re.sub(r"[\s\xa0]", "", str(node)):
                continue
            p_tag = chapter_tag.new_tag("p")
            # original (not cleaned) text is kept inside <p>
            p_tag.append(str(node))
            node.replace_with(p_tag)

    def _wrap_tags_with_table(self,
                              chapter_tag: BeautifulSoup,
                              rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
        """
        Function wraps <tag> with <table>
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, List[str, Dict[str, str]]]]
            list of conditions when fire function:
            "tags" - regexes of tag names,
            "attrs" - list of {"name": ..., "value": ...}, value is a regex

        Returns
        -------
        None
            Chapter Tag with wrapped certain tags with <table> (updated in place)

        """

        def _wrap_tag_with_table(tag_to_wrap: Tag) -> Tag:
            """Function puts tag into <table><tbody><tr><td> and adds <br> after the table"""
            # width is written as percents; trailing "%" of the attribute is
            # dropped so "80%" doesn't turn into "width:80%%;"
            width = str(tag_to_wrap.attrs.get("width") or "").strip().rstrip("%") or "100"
            border = tag_to_wrap.attrs.get("border")
            bg_color = tag_to_wrap.attrs.get("bgcolor")

            table = chapter_tag.new_tag("table")
            # attributes without value are skipped: None would be rendered
            # as a valueless attribute (<td bgcolor>)
            if border:
                table.attrs["border"] = border
            table.attrs["align"] = "center"
            table.attrs["style"] = f"width:{width}%;"

            tbody = chapter_tag.new_tag("tbody")
            tr = chapter_tag.new_tag("tr")
            td = chapter_tag.new_tag("td")
            if bg_color:
                td.attrs["bgcolor"] = bg_color

            # build the structure from inside out around the tag
            tag_to_wrap.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(chapter_tag.new_tag("br"))
            return table

        for rule in rules:
            # name patterns don't depend on attr, compile them once per rule
            tag_patterns = [re.compile(tag) for tag in rule["tags"]]
            for attr in rule["attrs"]:
                attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                for tag_to_wrap in chapter_tag.find_all(tag_patterns, attr_filter):
                    _wrap_tag_with_table(tag_to_wrap)
                    # tag itself is dropped, its id is kept in a span for links
                    self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
                    tag_to_wrap.unwrap()

    @staticmethod
    def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
                                          rules: List[Dict[str,
                                                           Union[List[str], str, Dict[str,
                                                                                      Union[str, List[Dict[str, str]]]]]]]):
        """
        Function to replace all tags to correspond LiveCarta tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
            list of conditions when fire function:
            "tags" - regexes of tag names,
            "tag_to_replace" - new name of matched tags,
            "condition" - empty value or dict with keys
            "parent_tags" (css selector), "child_tags" (css selector inside :not(...)),
            "attrs" (list of {"name": ..., "value": ...}, value is a regex)

        Returns
        -------
        None
            Chapter Tag with all tags replaced with LiveCarta tags (updated in place)

        """
        for rule in rules:
            # name patterns are the same for every condition of the rule
            tag_patterns = [re.compile(tag) for tag in rule["tags"]]
            tag_to_replace: str = rule["tag_to_replace"]
            condition = rule["condition"]

            if not condition:
                for tag in chapter_tag.find_all(tag_patterns):
                    # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                    tag.name = tag_to_replace
                continue

            # every non-empty condition is applied on its own; tags are searched
            # again each time because previous conditions may have renamed some
            for condition_name, condition_value in condition.items():
                if not condition_value:
                    continue
                if condition_name == 'parent_tags':
                    for tag in chapter_tag.find_all(tag_patterns):
                        if tag.parent.select(condition_value):
                            tag.name = tag_to_replace
                elif condition_name == 'child_tags':
                    # take the selector out of ":not(...)": only "not" that opens
                    # a bracket is removed, so names like "footnote" stay intact
                    child_selector = re.sub(r":?not(?=\()|[():]", "", condition_value)
                    for tag in chapter_tag.find_all(tag_patterns):
                        if not tag.select(child_selector):
                            tag.name = tag_to_replace
                elif condition_name == "attrs":
                    for attr in condition_value:
                        attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                        for tag in chapter_tag.find_all(tag_patterns, attr_filter):
                            tag.name = tag_to_replace

    @staticmethod
    def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
        """
        Function renames attributes of tags, value of the attribute is kept
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
            list of conditions when fire function:
            "attr" - name of the attribute to rename,
            "condition" -> "tags" - regexes of tag names,
            "attr_to_replace" - new name of the attribute

        Returns
        -------
        None
            Chapter Tag with renamed attributes (updated in place)

        """
        for rule in rules:
            old_attr = rule["attr"]
            name_patterns = [re.compile(name) for name in rule["condition"]["tags"]]
            new_attr = rule["attr_to_replace"]
            # only tags that have the old attribute are selected
            has_old_attr = {old_attr: re.compile(r".*")}
            for found_tag in chapter_tag.find_all(name_patterns, has_old_attr):
                found_tag[new_attr] = found_tag[old_attr]
                del found_tag[old_attr]

    def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
        """
        Function unwraps tags, id of every unwrapped tag is moved to a span
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: Dict[str, List[str]]
            dict with css selectors of tags to unwrap under the key "tags"

        Returns
        -------
        None
            Chapter Tag with unwrapped certain tags (updated in place)

        """
        for selector in rules["tags"]:
            # selector with ">" points to a subtag of some parent
            selects_subtag = ">" in selector
            for matched_tag in chapter_tag.select(selector):
                if selects_subtag:
                    # attributes of the subtag are copied to its parent
                    matched_tag.parent.attrs.update(matched_tag.attrs)
                self._add_span_to_save_ids_for_links(matched_tag, chapter_tag)
                matched_tag.unwrap()

    @staticmethod
    def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
                                          rules: List[Dict[str,
                                                           Union[List[str], str, Dict[str,
                                                                                      Union[str, List[Dict[str, str]]]]]]]):
        """
        Function inserts tags into correspond tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
            list of conditions when fire function:
            "tags" - regexes of tag names,
            "tag_to_insert" - name of the tag that becomes the only child of matched tags,
            "condition" - empty value or dict with keys
            "parent_tags" (css selector), "child_tags" (css selector inside :not(...)),
            "attrs" (list of {"name": ..., "value": ...}, value is a regex)

        Returns
        -------
        None
            Chapter Tag with inserted tags (updated in place)

        """
        def insert(tag: Tag, tag_name_to_insert: str):
            """Function moves all contents of tag into a new subtag"""
            tag_to_insert = chapter_tag.new_tag(tag_name_to_insert)
            # snapshot of children: extract() changes tag.contents
            for content in list(tag.contents):
                tag_to_insert.append(content.extract())
            # subtag with all items becomes the only child of tag
            tag.append(tag_to_insert)

        for rule in rules:
            # name patterns are the same for every condition of the rule
            tag_patterns = [re.compile(tag) for tag in rule["tags"]]
            condition = rule["condition"]

            if not condition:
                for tag in chapter_tag.find_all(tag_patterns):
                    insert(tag, rule["tag_to_insert"])
                continue

            # every non-empty condition is applied on its own
            for condition_name, condition_value in condition.items():
                if not condition_value:
                    continue
                if condition_name == 'parent_tags':
                    for tag in chapter_tag.find_all(tag_patterns):
                        if tag.parent.select(condition_value):
                            insert(tag, rule["tag_to_insert"])
                elif condition_name == 'child_tags':
                    # take the selector out of ":not(...)": only "not" that opens
                    # a bracket is removed, so names like "footnote" stay intact
                    child_selector = re.sub(r":?not(?=\()|[():]", "", condition_value)
                    for tag in chapter_tag.find_all(tag_patterns):
                        if not tag.select(child_selector):
                            insert(tag, rule["tag_to_insert"])
                elif condition_name == "attrs":
                    for attr in condition_value:
                        attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                        for tag in chapter_tag.find_all(tag_patterns, attr_filter):
                            insert(tag, rule["tag_to_insert"])

    def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
        """
        Function
        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
        - adds span with id of the removed tag before it
          (see _add_span_to_save_ids_for_links)

        Only the first child with non-whitespace text is examined on every
        level: it is removed, or the function goes down into it.
        Comparison with the title is case-insensitive.
        Parameters
        ----------
        chapter_tag: Union[BeautifulSoup, PageElement]
            Tag of the page
        title_of_chapter: str
            Chapter title

        Returns
        -------
        None
            clean/remove headings & add span with id (tree is updated in place)

        """
        title_of_chapter = title_of_chapter.lower()
        for tag in chapter_tag.contents:
            tag: PageElement
            # text of a string node is the node itself, text of a tag is its whole inner text
            text: str = tag if isinstance(tag, NavigableString) else tag.text
            # children that hold only whitespace characters are skipped
            if re.sub(r"[\s\xa0]", "", text):
                text = re.sub(r"[\s\xa0]", " ", text).lower()
                text = text.strip()  # delete extra spaces
                if not isinstance(tag, NavigableString):
                    # tag is removed when its text equals the title, or when the
                    # title is a part of the text and the tag is h1-h3
                    # (chapter_tag.name is used only if tag.name is falsy)
                    if title_of_chapter == text or \
                            (title_of_chapter in text and
                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
                        self._add_span_to_save_ids_for_links(tag, chapter_tag)
                        tag.extract()
                        return
                    # NOTE(review): every return of this function is bare, so the
                    # recursive call evaluates to None and the loop always stops
                    # after the first non-blank tag - confirm this is intended
                    elif not self._remove_headings_content(tag, title_of_chapter):
                        break
                else:
                    # NOTE(review): the first non-blank string is extracted without
                    # comparison with the title - confirm this is intended
                    tag.extract()
                    return

    @staticmethod
    def _process_tables(chapter_tag: BeautifulSoup):
        """
        Function preprocesses tables and tags(td|th|tr)
        - width from style (px/pt) is copied to attribute "width" when the
          attribute is missing, attribute is set to "" when no width is found
        - "border:0;" is removed from style, empty style is deleted
        - table without border (or with border 0) gets border="1"
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag with processed tables (updated in place)

        """
        # (?<!-) skips max-width/border-width etc. and, unlike a mandatory
        # preceding character, matches "width" at the very start of style
        width_pattern = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")
        for table in chapter_tag.find_all("table"):
            # list of exact names: a regex "td|th|tr" is searched inside the
            # name and matches thead, strong, strike etc. as well
            for t_tag in table.find_all(["td", "th", "tr"]):
                style = t_tag.attrs.get("style")
                width = ""
                if style:
                    width_match = width_pattern.search(style)
                    if width_match:
                        # number is kept as is and written as px for both px and pt
                        width = width_match.group(1) + "px"

                # existing attribute "width" has priority over the width from style
                t_tag.attrs["width"] = t_tag.attrs.get("width") or width

                if style:
                    style = style.replace("border:0;", "")
                    if re.sub(r"[\s\xa0]", "", style) == "":
                        del t_tag.attrs["style"]
                    else:
                        t_tag.attrs["style"] = style

            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
                table.attrs["border"] = "1"

    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):
        """
        Function removes classes that aren't created by converter
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag without original classes of the book (updated in place);
            attribute "class" is kept only when all its classes are
            "link-anchor" or "footnote-element"

        """
        converter_classes = {"link-anchor", "footnote-element"}
        for tag in chapter_tag.find_all(recursive=True):
            class_value = tag.attrs.get("class")
            if not class_value:
                continue
            # parsed markup stores class as a list (["link-anchor"]), attributes
            # assigned in code may be plain strings: both forms are normalised,
            # because a list never equals a string of the allowed classes
            classes = class_value.split() if isinstance(class_value, str) else list(class_value)
            if not classes or not set(classes) <= converter_classes:
                del tag.attrs["class"]

    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
        """
        Function finalise processing/cleaning content
        Parameters
        ----------
        title_str: str
            title of the chapter, used for the heading removal
        content_tag: Tag, soup object
            content of the chapter, it is updated in place
        remove_title_from_chapter: bool
            whether the heading removal step is run

        Steps
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
        3-6. presets, in the order they are listed in the preset file:
            wrap tags with <table>
            replace tags with correspond LiveCarta tags
            unwrap tags
            insert tags into correspond tags
        7. heading removal (optional)
        8. process_tables
        9. class removal

        Returns
        -------
        content_tag: Tag
            prepared content (the same object that was passed in)

        """
        # 1-2. steps that don't depend on presets
        for preliminary_step in (self._remove_comments, self._wrap_strings_with_p):
            preliminary_step(content_tag)

        # 3-6. every preset entry is handled by the method registered for its name
        for preset_entry in self.preset:
            handler = self.name2function[preset_entry["preset_name"]]
            handler(content_tag, preset_entry['rules'])

        # 7. title of the chapter is removed from the content on demand
        if remove_title_from_chapter:
            self._remove_headings_content(content_tag, title_str)

        # 8-9. tables, then classes that weren't created by converter
        for final_step in (self._process_tables, self._class_removing):
            final_step(content_tag)
        return content_tag