# BookConverter/src/docx_converter/libre_html2json_converter.py

import re
import logging
from copy import copy
from typing import List, Tuple, Dict, Union
from bs4 import Tag

from src.livecarta_config import LiveCartaConfig


class LibreHtml2JsonConverter:
    """
    Converter of a flat list of html nodes into a nested chapter structure.

    Tags whose name is listed in ``LiveCartaConfig.SUPPORTED_HEADERS`` open a chapter;
    headers of a deeper level that follow become its ``sub_items``, every other tag
    is serialized into the ``contents`` of the chapter it follows.
    """

    def __init__(self, content: List[Tag], footnotes: List[str], top_level_headers: List[Dict[str, Union[str, bool]]],
                 logger_object, book_api_status=None):
        # result of the last convert_to_dict() call, None until it is called
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text: str) -> str:
        """
        Function to remove useless symbols from html code.
        Every newline and tab character is replaced with a single space.
        Parameters
        ----------
        html_text: str
            text to process.

        Returns
        -------
        new_text: str
            cleaned text

        """
        return re.sub(r"([\n\t])", " ", html_text)

    @staticmethod
    def _header_level(tag_name: str) -> int:
        """Function to extract outline level from header tag name (leading "h" is dropped, rest is parsed as int)."""
        return int(re.sub(r"^h", "", tag_name))

    # TODO: rethink the function structure without indexes.
    def header_to_livecarta_chapter_item(self, ind: int) -> Union[Tuple[Dict[str, Union[str, List]], int], str]:
        """
        Function process header and collects all content for it.
        Headers of a deeper level are processed recursively and stored in "sub_items";
        collecting stops at the first header of the same or a higher level
        or at the end of content list.
        Parameters
        ----------
        ind: int
            index of header in content list.

        Returns
        -------
        result, ind
            result: dict with keys "title", "contents" and "sub_items",
            ind: index of the first node which does not belong to this header.
            If the node at ind is not a supported header, empty string is returned instead.

        """
        header = self.content[ind]
        if header.name not in LiveCartaConfig.SUPPORTED_HEADERS:
            return ""

        # title is the serialized header without its own opening/closing tag
        title = str(header)
        title = title.replace(f"<{header.name}>", "")
        title = title.replace(f"</{header.name}>", "")
        title = re.sub(r"^\n", "", title)

        curr_outline = self._header_level(header.name)
        result = {
            "title": title,
            "contents": [],
            "sub_items": []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            node = self.content[ind]

            # 1. next tag is not a header. add new paragraphs
            if node.name not in LiveCartaConfig.SUPPORTED_HEADERS:
                ch_content.append(self.format_html(str(node)))
                ind += 1
                continue

            # 2. next tag is a header
            # - current h_i <= h_initial, end of recursion
            if self._header_level(node.name) <= curr_outline:
                break

            # - recursion step until h_i > h_initial;
            #   recursive call returns index of the first node after the sub header
            header_dict, ind = self.header_to_livecarta_chapter_item(ind)
            # text collected before the sub header is flushed as one contents entry
            if ch_content:
                result["contents"].append("".join(ch_content))
                ch_content = []
            result["sub_items"].append(header_dict)

        if ch_content:
            result["contents"].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag: Tag) -> bool:
        """Function to check that tag is <p> which has no text besides whitespaces and <br> tags."""
        if tag.name != "p":
            return False

        # work on a copy: <br> tags are removed only to inspect the remaining text
        temp_tag = copy(tag)
        for br in temp_tag.find_all("br"):
            br.decompose()

        return not re.sub(r"\s+", "", temp_tag.text)

    def convert_to_dict(self) -> Dict[str, List]:
        """
        Function which convert list of html nodes to appropriate json structure.
        Nodes which precede the first header (or any run of non-header nodes on top level)
        are gathered into "Untitled chapter N"; empty <p> tags are skipped there.

        Returns
        -------
        content_dict: dict
            dict with keys "content" (list of chapters) and "footnotes";
            it is also stored in self.content_dict.

        Raises
        ------
        Exception
            any error raised while building the structure is logged and re-raised.

        """
        json_strc, ind, ch_num, ch_amt = [], 0, 0, 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_livecarta_chapter_item(ind)

                else:
                    chapter_title = f"Untitled chapter {ch_num}"
                    chapter = []
                    while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(
                                str(self.content[ind])))
                        ind += 1
                    # untitled chapter is added (and numbered) only if it has content
                    if chapter:
                        res = {
                            "title": chapter_title,
                            "contents": ["".join(chapter)],
                            "sub_items": []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(
                        f"Chapter {ch_amt} has been added to structure.")
        except Exception:
            self.logger_object.log(
                "Error has occurred while making json structure.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error()
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted,
        # so the structure may be empty: there is no first chapter to mark then
        if self.top_level_headers and json_strc:
            is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
            json_strc[0]["is_introduction"] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

        return self.content_dict