BookConverter/src/json_converter.py

import logging
import re
import codecs
import json

from copy import copy
from config import BookConfig


class JSONConverter:
    """
    Builds a nested chapter structure (title / contents / sub_items) from a flat
    list of html nodes.

    The nodes are expected to expose ``name`` and ``text`` attributes and, for
    ``p`` tags, ``find_all`` / ``decompose`` (presumably BeautifulSoup tags --
    confirm against the caller).
    """

    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
        """
        :param content: List of html nodes to convert.
        :param footnotes: Footnotes passed through unchanged into the result.
        :param top_level_headers: Sequence of dicts with 'title' and
            'should_be_numbered' keys; only the first item is read.
        :param logger_object: Object providing ``log`` and ``log_error_to_main_log``.
        :param book_api_status: Optional object providing ``set_error_status``.
        """
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text):
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        :param html_text: Text to process.
        :return: Cleaned text.
        """
        return re.sub(r'([\n\t])', ' ', html_text)

    @staticmethod
    def _outline_level(tag_name):
        """
        Extract the numeric outline level from a header tag name ('h2' -> 2).

        :param tag_name: Name of a header tag.
        :return: Outline level as int.
        """
        return int(re.sub(r"^h", "", tag_name))

    # TODO: rethink the function structure without indexes.
    def header_to_json(self, ind):
        """
        Function process header and collects all content for it.

        Nodes following the header are gathered until a header of the same or
        a higher level (smaller or equal number) is met. Deeper headers are
        processed recursively and stored in 'sub_items'.

        :param ind: Index of header in content list.
        :return: Tuple ``(result, next_ind)`` where ``result`` is a dict with
            'title', 'contents' and 'sub_items' keys and ``next_ind`` is the
            index of the first node that was not consumed. If the node at
            ``ind`` is not a supported header, an empty string is returned
            instead (kept as is for backward compatibility).
        """
        if self.content[ind].name not in BookConfig.SUPPORTED_HEADERS:
            return ''

        header = self.content[ind]
        curr_outline = self._outline_level(header.name)
        result = {
            'title': header.text,
            'contents': [],
            'sub_items': []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            node = self.content[ind]

            # 1. next tag is not a header: add it to the current text chunk
            if node.name not in BookConfig.SUPPORTED_HEADERS:
                ch_content.append(self.format_html(str(node)))
                ind += 1
                continue

            # 2. next tag is a header of the same or a higher level:
            #    this section is finished, the caller handles that header
            if self._outline_level(node.name) <= curr_outline:
                break

            # 3. next tag is a deeper header: recursion step, which also
            #    moves the index past everything the sub-section consumed
            header_dict, ind = self.header_to_json(ind)
            if ch_content:
                # text gathered before the sub-header becomes one content chunk
                result['contents'].append("".join(ch_content))
            ch_content = []
            result['sub_items'].append(header_dict)

        if ch_content:
            result['contents'].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag):
        """
        Check whether a tag is a 'p' tag without any visible text.

        'br' children and all whitespace are ignored when deciding emptiness.
        The check works on a copy, so the passed tag is not modified.

        :param tag: Html node to check.
        :return: True if tag is a 'p' tag with no non-whitespace text.
        """
        if tag.name != 'p':
            return False

        temp_tag = copy(tag)
        for br in temp_tag.find_all('br'):
            br.decompose()

        return not re.sub(r'\s+', '', temp_tag.text)

    def convert_to_json(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Top-level headers start chapters (see ``header_to_json``). Nodes that
        appear outside any header are grouped into 'Untitled chapter N'
        entries, skipping empty 'p' tags; a group with nothing left is dropped.

        :return: Dict with 'content' (list of chapters) and 'footnotes' keys;
            the same dict is stored in ``self.content_dict``.
        :raises Exception: Any error raised while building the structure is
            logged, reported to ``book_api_status`` (if set) and re-raised.
        """
        json_strc = []
        ind = 0
        ch_num = 0
        ch_amt = 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_json(ind)

                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
                    while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            'title': chapter_title,
                            'contents': ["".join(chapter)],
                            'sub_items': []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(f'Chapter {ch_amt} has been added to structure.')
        except Exception:
            self.logger_object.log('Error has occurred while making json structure.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error_status()
            # bare raise keeps the original exception and traceback intact
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted
        # json_strc may be empty (no content, or only empty paragraphs), so it
        # has to be checked before the first chapter is accessed
        if self.top_level_headers and json_strc:
            first_header = self.top_level_headers[0]
            same_first_titles = first_header['title'] == json_strc[0]['title']
            is_first_header_introduction = not first_header['should_be_numbered']

            json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

        return self.content_dict