# BookConverter/src/docx_converter/html_docx_processor.py

import re
import json
import pathlib
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString

from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.docx_converter.image_processing import process_images
from src.docx_converter.footnotes_processing import process_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles


class HTMLDocxProcessor:

    def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
                 style_processor, preset_path: str = "presets/docx_presets.json"):
        self.body_tag = html_soup.body
        self.html_soup = html_soup
        self.logger = logger
        self.preset = json.load(open(preset_path))
        self.style_processor = style_processor
        self.name2action = {
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag
        }

    def _process_toc_links(self):
        """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
        def _check_parent_link_exist_in_toc(tag_with_link: Tag) -> bool:
            toc_links = []
            for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}):
                link_name = a_tag.attrs["name"]
                toc_item = self.body_tag.find("a", {"href": "#" + link_name})
                if toc_item:
                    toc_links.append(toc_item)
            return len(toc_links) > 0
        toc_links = self.body_tag.find_all(
            "a", {"name": re.compile(r"^_Toc\d+")})
        headers = [link.parent for link in toc_links]
        outline_level = "1"  # All the unknown outlines will be predicted as <h1>
        for tag in headers:
            if re.search(r"^h\d$", tag.name):
                tag.a.unwrap()
            elif tag.name == "p":
                exist_in_toc = _check_parent_link_exist_in_toc(tag)
                if tag in self.body_tag.find_all("p") and exist_in_toc:
                    new_tag = BeautifulSoup(
                        features="lxml").new_tag("h" + outline_level)
                    text = tag.text
                    tag.replaceWith(new_tag)
                    new_tag.string = text
            else:
                # rethink document structure when you have toc_links, other cases?
                self.logger.log(f"Something went wrong in processing toc_links."
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")

    @staticmethod
    def _decompose_tag(**kwargs):
        kwargs["tag"].decompose()

    @staticmethod
    def _replace_tag(**kwargs):
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        attr, attr_value =\
            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
        attr_to_replace, attr_value_to_replace =\
            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
        if attr_to_replace:
            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
            if attr_value_to_replace:
                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
            del kwargs["tag"][attr]
        elif attr_value_to_replace:
            kwargs["tag"].attrs[attr] = attr_value_to_replace

    @staticmethod
    def _unwrap_tag(**kwargs):
        kwargs["tag"].unwrap()

    @staticmethod
    def _process_tags(body_tag: Tag,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function do action with tags
        Parameters
        ----------
        body_tag: Tag
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag
        Returns
        -------
        NoReturn
            Body Tag with processed certain tags

        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == "parent_tags":
                        for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
                                                              for tag in tags])):
                            tag.parent.attrs.update(tag.attrs)
                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "child_tags":
                        for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
                                                              for tag in tags])):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
                                action(body_tag=body_tag, tag=tag, rule=rule)
                    # attr replacer
                    elif condition_on_tag[0] == "tags":
                        attr = rule["attr"]
                        for tag in body_tag.find_all([re.compile(tag) for tag in tags],
                                                     {attr['name']: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
            else:
                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
                    action(body_tag=body_tag, tag=tag, rule=rule)

    def _process_paragraph(self):
        """Function to process <p> tags (text-align and text-indent value)."""
        # todo debug and remove if inline is enough
        paragraphs = self.body_tag.find_all("p")

        for p in paragraphs:
            # libre converts some \n into <p> with 2 </br>
            # there we remove 1 unnecessary <br>
            brs = p.find_all("br")
            text = p.text

            if brs and text == "\n\n" and len(brs) == 2:
                brs[0].decompose()

            indent_should_be_added = False
            if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")):
                indent_should_be_added = True

            align = p.get("align")
            style = p.get("style")

            if style:
                indent = re.search(r"text-indent: ([\d.]{1,4})in", style)
                margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style)
                margin_right = re.search(
                    r"margin-right: ([\d.]{1,4})in", style)
                margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style)
                margin_bottom = re.search(
                    r"margin-bottom: ([\d.]{1,4})in", style)
            else:
                indent = margin_left = margin_right = \
                    margin_top = margin_bottom = None

            if margin_left and margin_right and margin_top and margin_bottom and \
                    margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \
                    margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11":
                p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))

            p.attrs = {}
            style = ""

            if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
                style += f"text-align: {align};"

            if indent is not None or indent_should_be_added:
                # indent = indent.group(1)
                style += f"text-indent: {LiveCartaConfig.INDENT};"

            if style:
                p.attrs["style"] = style

    def _process_quotes(self):
        """
            Function to process block quotes.
            After docx to html conversion block quotes are stored inside table with 1 cell.
            All text is wrapped in a <i> tag.
            Such tables will be replaced with <blockquote> tags.

            <table cellpadding=\"7\" cellspacing=\"0\" width=\"614\">
                <col width=\"600\"/>
                <tr>
                    <td width=\"600\">
                        <p style=\"text-align: justify;\"><i>aaaaa</i></p>
                        <p style=\"text-align: justify;\"><br/></p>
                    </td>
                </tr>
            </table>

        """
        tables = self.body_tag.find_all("table")
        for table in tables:
            trs = table.find_all("tr")
            tds = table.find_all("td")
            if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
                td = tds[0]
                is_zero_border = "border: none;" in td.get("style")
                paragraphs = td.find_all("p")
                has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
                has_i_tag_or_br = [x[0] is not None or x[1] is not None
                                   for x in has_i_tag_or_br]

                if all(has_i_tag_or_br) and is_zero_border:
                    new_div = BeautifulSoup(
                        features="lxml").new_tag("blockquote")
                    for p in paragraphs:
                        new_div.append(p)

                    table.replaceWith(new_div)

    @staticmethod
    def convert_pt_to_px(value: float) -> float:
        """
        Map Word's default font size to LiveCarta's default font size.

        The value is converted with float(); any size other than
        LiveCartaConfig.WORD_DEFAULT_FONT_SIZE is returned unchanged
        (no unit arithmetic is performed here).
        """
        size = float(value)
        is_word_default = size == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE
        return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE if is_word_default else size

    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
        for table in tables:
            tds = table.find_all("td")
            sizes = []
            for td in tds:
                style = td.get("style")
                if style:
                    match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
                    if match:
                        size = match.group(1)
                        units = match.group(2)

                        if units == "pt":
                            size = self.convert_pt_to_px(size)

                        sizes.append(float(size))
                width = td.get("width")
                td.attrs = {}
                if width:
                    td.attrs["width"] = width
            if sizes:
                border_size = sum(sizes) / len(sizes)
                table.attrs["border"] = f"{border_size:.2}"

        self.tables_amount = len(tables)

    def _process_hrefs(self):
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^.*http.+")})

        # remove char=end of file for some editors
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")

        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^(?!#sdfootnote)")})
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag.string = tag.text.replace("\u200b", "")  # zero-width-space
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")

    def _process_div(self):
        # todo unwrapper
        """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
        divs = self.body_tag.find_all("div")
        for div in divs:
            div.unwrap()

    def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]:
        """
        Function for gathering info about top-level chapters.

        Assume: _
            - Headers with the smallest outline(or digit in <h>) are top level chapters.
            [ It is consistent with a recursive algorithm
            for saving content to a resulted json structure,
            which happens in  header_to_json()]

        """
        headers_info = []
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
        headers_outline = [int(re.sub(r"^h", "", tag.name))
                           for tag in header_tags]
        if headers_outline:
            top_level_outline = min(headers_outline)
            top_level_headers = [tag for tag in header_tags
                                 if int(re.sub(r"^h", "", tag.name)) == top_level_outline]

            for tag in top_level_headers:
                if tag.parent.name == "li":
                    tag.parent.unwrap()
                    while tag.parent.name == "ol":
                        tag.parent.unwrap()

                title = tag.text
                title = re.sub(r"\s+", " ", title).strip()
                number = re.match(r"^(?:\.?\d+\.? ?)+", title)
                is_numbered = number is not None

                cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
                is_introduction = cleaned_title.lower() == "introduction"

                headers_info.append({
                    "title": cleaned_title,
                    "is_numbered": is_numbered,
                    "is_introduction": is_introduction})
        return headers_info

    def _mark_introduction_headers(self):
        """
        Function to find out:
        what header shouldn't be numbered and can be treated as introduction chapter
        Assume  header(s) to be introduction if:
            1. one header not numbered, before 1 numbered header
            2. it is first header from the top level list, and it equals to "introduction"

        Returns
        -------
        None
            mark each top-level header with flag should_be_numbered = true/false

        """
        is_numbered_header = [header["is_numbered"]
                              for header in self.top_level_headers]
        is_title = [header["is_introduction"]
                    for header in self.top_level_headers]

        first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
        second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
        first_header_is_introduction = is_title and is_title[0]

        if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
            self.top_level_headers[0]["should_be_numbered"] = False
            for i in range(1, len(self.top_level_headers)):
                self.top_level_headers[i]["should_be_numbered"] = True
        else:
            for i in range(0, len(self.top_level_headers)):
                self.top_level_headers[i]["should_be_numbered"] = True

    @staticmethod
    def clean_title_from_tabs(tag: NavigableString):
        """Replace the string node with a copy in which every whitespace char (non-breaking space included) is a plain space."""
        spaced = re.sub(r"[\s\xa0]", " ", tag)
        factory = BeautifulSoup(features="lxml")
        tag.replace_with(factory.new_string(spaced, NavigableString))

    def apply_func_to_last_child(self, tag: Union[NavigableString, Tag], func=None):
        """
        works only with constructions like (((child to work with)))
        where child is object of NavigableString
        """
        if type(tag) is NavigableString:
            func(tag)
        else:
            children = list(tag.children)
            if children:
                self.apply_func_to_last_child(children[0], func)

    def _process_headings(self):
        """
        Function to process tags <h>.
        Clean header from attrs and text in header from numbering and \n

        Returns
        -------
        None
            processed <h> tags

        """
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))

        # clean header from attrs and text in header from numbering and \n
        for h_tag in header_tags:
            h_tag.attrs = {}
            if h_tag.parent.name == "li":
                h_tag.parent.unwrap()
                while h_tag.parent.name == "ol":
                    h_tag.parent.unwrap()

            cleaned_title = re.sub(r"[\s\xa0]", " ", h_tag.text)
            if cleaned_title == "":
                h_tag.unwrap()
            else:
                assert h_tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
                    f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."

                content = list(h_tag.children)

                # do not take into account rubbish empty tags like <a>, but don"t remove them
                content = [item for item in content if
                           (type(item) is not NavigableString and item.text != "")
                           or (type(item) is NavigableString)]

                content[0] = "" if content[0] == " " else content[0]
                content = [item for item in content if item != ""]

                for i, item in enumerate(content):
                    if type(content[i]) is NavigableString:
                        cleaned = re.sub(r"(\s+)+", " ", content[i])
                        this = BeautifulSoup.new_string(BeautifulSoup(
                            features="lxml"), cleaned, NavigableString)
                        content[i].replace_with(this)
                        content[i] = this
                    else:
                        self.apply_func_to_last_child(
                            content[i], self.clean_title_from_tabs)

    def delete_content_before_toc(self):
        # remove all tag upper the <TOC> only in content !!! body tag is not updated
        toc_tag = self.html_soup.new_tag("TOC")
        self.content: List[Tag] = self.body_tag.find_all(recursive=False)
        if toc_tag in self.content:
            ind = self.content.index(toc_tag) + 1
            self.content = self.content[ind:]

    def process_html(self,
                     access=None,
                     html_path: pathlib.Path = "",
                     book_id: int = 0) -> Tuple[List[Tag], List[str], List[Dict[str, Union[str, bool]]]]:
        """Process html code to satisfy LiveCarta formatting."""
        self.logger.log("Beginning of processing .html file.")

        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()

        for rule in self.preset:
            self.logger.log(rule["preset_name"] + " process.")
            action = self.name2action[rule["preset_name"]]
            self._process_tags(self.body_tag, rule["rules"], action)

        self.logger.log("CSS inline style preprocessing.")
        self.style_processor.process_inline_styles_in_html_soup(self.html_soup)

        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.html_soup)

        # process main elements of the .html doc
        self.logger.log(f"Processing main elements of html.")
        self._process_paragraph()

        self.logger.log("Block quotes processing.")
        self._process_quotes()

        self.logger.log("Tables processing.")
        self._process_tables()
        self.logger.log(
            f"{self.tables_amount} tables have been processed.")

        self.logger.log("Hrefs processing.")
        self._process_hrefs()

        self.logger.log("Image processing.")
        self.images = process_images(access, path_to_html=html_path,
                                     book_id=book_id, body_tag=self.body_tag)
        self.logger.log(
            f"{len(self.images)} images have been processed.")

        self.logger.log("Footnotes processing.")
        self.footnotes: List[str] = process_footnotes(self.body_tag)
        self.logger.log(
            f"{len(self.footnotes)} footnotes have been processed.")

        self._process_div()

        self.top_level_headers: List[Dict[str, Union[str, bool]]]\
            = self._get_top_level_headers()
        self._mark_introduction_headers()

        self._process_headings()

        # delete text before table of content if exists
        self.delete_content_before_toc()

        self.logger.log("End of processing .html file.")

        return self.content, self.footnotes, self.top_level_headers