# BookConverter/src/epub_converter/html_epub_preprocessor.py

import re

from bs4 import BeautifulSoup, NavigableString, Tag, Comment

from src.livecarta_config import LiveCartaConfig


def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
    """
    Function inserts a placeholder <span> with the id (and class) of tag_to_be_removed
    right before it, because this tag will be removed (unwrapped/extracted)
    and links that point to its id still need a target
    Parameters
    ----------
    tag_to_be_removed: Tag or NavigableString
        Node that is about to be unwrapped/extracted; nothing is done when it has no id
    chapter_tag: BeautifulSoup
        Soup object used to create the new span

    Returns
    -------
    None
        the tree around tag_to_be_removed is updated in place

    """
    # text nodes carry no attributes, so there is no id to preserve
    attrs = getattr(tag_to_be_removed, "attrs", None)
    if not attrs or not attrs.get("id"):
        return

    # new_tag() is a BeautifulSoup method; when a plain Tag is passed (nested calls)
    # fall back to a throwaway soup instead of failing on the missing method
    if isinstance(chapter_tag, BeautifulSoup):
        tag_factory = chapter_tag
    else:
        tag_factory = BeautifulSoup(features="lxml")

    new_tag = tag_factory.new_tag("span")
    new_tag.attrs["id"] = attrs["id"]
    new_tag.attrs["class"] = attrs.get("class") or ""
    # non-breaking space keeps the span from being empty
    new_tag.string = "\xa0"
    tag_to_be_removed.insert_before(new_tag)


def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted
    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: Tag
        Soup object of current file; the chapter's nodes are removed from it

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags: siblings after the mark up to (not including) the next mark

    Raises
    ------
    AssertionError
        if html_soup holds no chapter mark with id first_id

    """
    chapter_mark_class = "converter-chapter-mark"

    def _is_chapter_mark(node) -> bool:
        """Function tells whether node is a tag with the chapter mark class (stored as str or list)"""
        if isinstance(node, NavigableString):
            return False
        class_value = node.attrs.get("class")
        if isinstance(class_value, str):
            return class_value == chapter_mark_class
        return bool(class_value) and chapter_mark_class in class_value

    marked_tag = html_soup.find(
        attrs={"id": first_id, "class": chapter_mark_class})
    if not marked_tag:
        # explicit raise: an `assert` statement is skipped when python runs with -O
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    next_tag = marked_tag.next_sibling
    while next_tag is not None and not _is_chapter_mark(next_tag):
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags


def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title
    Parameters
    ----------
    title_of_chapter: str
        Raw title, may contain html markup

    Returns
    -------
    title_str: str
        Title text without markup; every whitespace character (incl. \\xa0)
        is replaced with a space and the edges are stripped

    """
    # get_text() instead of .string: .string gives None for an empty title
    # or a title with inline markup, which would break re.sub below
    title_str = BeautifulSoup(title_of_chapter, features="lxml").get_text()
    # replace whitespace characters ([\r\n\t\f\v ] and \xa0) with plain spaces
    title_str = re.sub(r"[\s\xa0]", " ", title_str).strip()
    return title_str


def _remove_comments(chapter_tag):
    """
    Function removes every html comment from chapter_tag,
    including comments that are direct children of chapter_tag
    """
    # one search over the whole subtree; the result is a list,
    # so extracting while looping is safe
    for comment in chapter_tag.find_all(string=lambda string: isinstance(string, Comment)):
        comment.extract()


def _wrap_strings_with_p(chapter_tag):
    """
    Function wraps every non-blank top-level NavigableString of chapter_tag with <p>
    Comments are left untouched so that their text does not become visible content
    """
    # iterate over a copy: replace_with() edits chapter_tag.contents
    for node in list(chapter_tag.contents):
        if not isinstance(node, NavigableString) or isinstance(node, Comment):
            continue
        # strings made only of whitespace/\xa0 are not wrapped
        if not re.sub(r"([\s\xa0])", " ", str(node)).strip():
            continue
        p_tag = chapter_tag.new_tag("p")
        p_tag.append(str(node))
        node.replace_with(p_tag)


def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps <tag> with <table>
    Tags are chosen by LiveCartaConfig.WRAP_TAGS_WITH_TABLE: a tuple value is
    (attribute name, regex for its value), any other value is a collection of
    attribute names of which the tag must have at least one.
    The wrapped tag itself is unwrapped, its id is kept on a span.
    """
    def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
        """Function builds table > tbody > tr > td around tag_to_be_wrapped and adds <br> after it"""
        table = chapter_tag.new_tag("table")
        # a None value would be written out as a valueless attribute, so it is skipped
        if border is not None:
            table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody, tr, td = \
            chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
        if bg_color is not None:
            td.attrs["bgcolor"] = bg_color
        # wrap from the inside out
        tag_to_be_wrapped.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def process_tag_using_table(tag_to_wrap):
        """Function wraps tag_to_wrap with table, keeps its id on a span and unwraps the tag"""
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=tag_to_wrap,
            width=tag_to_wrap.attrs.get("width") or "100",
            border=tag_to_wrap.attrs.get("border") or None,
            bg_color=tag_to_wrap.attrs.get("bgcolor") or None)
        _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
        tag_to_wrap.unwrap()

    for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if isinstance(attrs, tuple):
            attr, val = attrs[0], attrs[1]
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
                process_tag_using_table(tag_to_wrap)
        else:
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
                if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
                    process_tag_using_table(tag_to_wrap)


def _tags_to_correspond_livecarta_tag(chapter_tag):
    """
    Function renames tags so that they correspond to livecarta tags
    Rules come from LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS,
    where each key is a group of rules and the value is the new tag name:
    - a tuple rule is (mode, parent selector, child selector); mode "parent" renames
      the selected parents, mode "child" renames the children selected inside them
      and gives them a monospace style when they have none
    - any other rule is used as a regex over tag names
    """
    fallback_child_style = "font-size: 14px; font-family: courier new,courier,monospace;"
    replacements = LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS

    for rules, livecarta_name in replacements.items():
        for rule in rules:
            if not isinstance(rule, tuple):
                # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                for found_tag in chapter_tag.find_all(re.compile(rule)):
                    found_tag.name = livecarta_name
                continue

            mode = rule[0]
            parent_selector, child_selector = rule[1], rule[2]
            for parent_tag in chapter_tag.select(parent_selector):
                if mode == "parent":
                    parent_tag.name = livecarta_name
                    continue
                if mode != "child":
                    continue
                for child_tag in parent_tag.select(child_selector):
                    child_tag.name = livecarta_name
                    if not child_tag.attrs.get("style"):
                        child_tag.attrs["style"] = fallback_child_style


def _unwrap_tags(chapter_tag):
    """
    Function unwrap tags and move id to span
    Tags are selected with the css selectors from LiveCartaConfig.TAGS_TO_UNWRAP.
    For a "parent > child" selector the attributes of the child (except id)
    are copied to its parent before the child is unwrapped.
    """
    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
        is_subtag_selector = ">" in tag_name
        for tag in chapter_tag.select(tag_name):
            # if tag is a subtag
            if is_subtag_selector:
                # id is not copied: the span below already keeps it, and copying would
                # overwrite the parent's own id and put the same id in the document twice
                tag.parent.attrs.update(
                    {name: value for name, value in tag.attrs.items() if name != "id"})
            _add_span_to_save_ids_for_links(tag, chapter_tag)
            tag.unwrap()


def _remove_headings_content(content_tag, title_of_chapter: str):
    """
    Function
    clean/remove headings from chapter in order to avoid duplication of chapter titles in the content
    add span with id in order to keep a target for links that point to the removed heading

    Only the beginning of the chapter is inspected: the first non-blank node is compared
    with the title; when it is a tag that does not match, the search goes on inside
    that tag and never returns to its following siblings. A non-blank text node that
    does not match is skipped.
    A node matches when its text equals the title (case-insensitive), or contains it
    while the node (or, for text, its parent) is h1-h3.
    Parameters
    ----------
    content_tag: soup object
        Tag of the page
    title_of_chapter: str
        Chapter title

    Returns
    -------
    None
        clean/remove headings & add span with id

    """
    title_of_chapter = title_of_chapter.lower()
    current_tag = content_tag
    while current_tag is not None:
        tag_to_descend = None
        for tag in current_tag.contents:
            is_string = isinstance(tag, NavigableString)
            text = tag if is_string else tag.text
            if not re.sub(r"[\s\xa0]", "", text):
                # blank node, look at the next one
                continue
            text = re.sub(r"[\s\xa0]", " ", text).lower()
            text = text.strip()  # delete extra spaces
            if title_of_chapter == text or \
                    (title_of_chapter in text and
                     re.findall(r"^h[1-3]$", tag.name or current_tag.name)):
                # text nodes have no id to save; the span is created by the top-level
                # content_tag because a nested tag cannot create new tags
                if not is_string:
                    _add_span_to_save_ids_for_links(tag, content_tag)
                tag.extract()
                return
            if not is_string:
                # continue the search inside the first non-blank tag only
                tag_to_descend = tag
                break
        current_tag = tag_to_descend


def _preprocess_table(chapter_tag: BeautifulSoup):
    """
    Function to preprocess tables and tags(td|th|tr): style
    For every td/th/tr inside a table:
    - width attribute is kept, or taken from a pt/px width of the style, or set to ""
    - "border:0;" is removed from the style, a style that becomes blank is deleted
    Every table without border (or with border 0/0px) gets border="1"
    """
    for table in chapter_tag.find_all("table"):
        # a list matches tag names exactly; a bare "td|th|tr" regex is searched inside
        # the name and would also hit tags such as <strong> or <thead>
        for t_tag in table.find_all(["td", "th", "tr"]):
            style = t_tag.get("style")

            width = ""
            if style:
                # lookbehind skips "max-width"/"border-width" but, unlike "[^-]width",
                # still matches when the style starts with "width"
                width_match = re.search(
                    r"(?<!-)width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    width = width_match.group(1) + "px"

            t_tag.attrs["width"] = t_tag.get("width") or width

            if style:
                style = style.replace("border:0;", "")
                if re.sub(r"[\s\xa0]", "", style) == "":
                    del t_tag.attrs["style"]
                else:
                    t_tag.attrs["style"] = style

        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"


def _insert_tags_in_parents(chapter_tag):
    """
    Function puts the whole content of a parent tag into a new inner tag
    Keys of LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG are (parent selector, condition selector),
    values are names of the tag to insert. Parents in which the condition selector
    already finds something are left as they are.
    """
    condition_by_parent = {}
    for config_key in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG:
        condition_by_parent[config_key[0]] = config_key[1]

    for parent_selector, condition in condition_by_parent.items():
        for parent_tag in chapter_tag.select(parent_selector):
            if parent_tag.select(condition):
                continue
            inner_tag = chapter_tag.new_tag(
                LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_selector, condition)])
            # take every child out of the parent, keeping the order,
            # and hand it over to the new inner tag
            children = [child.extract() for child in list(parent_tag.contents)]
            for child in children:
                inner_tag.append(child)
            # the parent now holds only the new inner tag
            parent_tag.append(inner_tag)


def _class_removing(chapter_tag):
    """
    Function removes classes that weren't created by converter
    Only "link-anchor" and "footnote-element" are kept. The class attribute may be
    a str (set by hand) or a list of names (as produced by the parser); both are handled.
    """
    converter_classes = ("link-anchor", "footnote-element")
    for tag in chapter_tag.find_all(recursive=True):
        class_value = tag.attrs.get("class")
        if not class_value:
            continue
        is_str = isinstance(class_value, str)
        class_names = class_value.split() if is_str else list(class_value)
        kept_names = [name for name in class_names if name in converter_classes]
        if not kept_names:
            del tag.attrs["class"]
        elif len(kept_names) != len(class_names):
            # drop foreign classes, keep the attribute in the form it had
            tag.attrs["class"] = " ".join(kept_names) if is_str else kept_names


def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content
    Parameters
    ----------
    title_str: str
        Chapter title, used to find the duplicated heading in the content
    content_tag: Tag, soup object
        Content of the chapter, it is changed in place
    remove_title_from_chapter: bool
        Whether the heading that repeats the title is removed from the content

    Steps
    ----------
    1. comments removal
    2. wrapping top-level strings with <p>, wrapping tags with tables,
       renaming tags to livecarta tags, unwrapping tags
    3. heading removal (optional)
    4. processing tables and inserting tags in parents
    5. class removal

    Returns
    -------
    content_tag: str
        prepared content

    """
    # steps 1-2: run in this exact order before the heading is looked for
    steps_before_heading_removal = (
        _remove_comments,
        _wrap_strings_with_p,
        _wrap_tags_with_table,
        _tags_to_correspond_livecarta_tag,
        _unwrap_tags,
    )
    # steps 4-5: tables, tags in parents (<pre>, <code>, ...), classes not created by converter
    steps_after_heading_removal = (
        _preprocess_table,
        _insert_tags_in_parents,
        _class_removing,
    )

    for step in steps_before_heading_removal:
        step(content_tag)

    # step 3: heading removal
    if remove_title_from_chapter:
        _remove_headings_content(content_tag, title_str)

    for step in steps_after_heading_removal:
        step(content_tag)

    return str(content_tag)