import logging
import re
import codecs
import json
from copy import copy

from config import BookConfig


class JSONConverter:
    """
    Converts a flat list of html nodes into a nested chapter structure.

    Nodes whose tag name is listed in ``BookConfig.SUPPORTED_HEADERS`` open a
    chapter; headers with a larger outline number that follow are nested
    under it as ``sub_items``. Everything else is stored as html text.
    """

    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
        """
        Store the data needed for conversion.

        :param content: Indexable list of html nodes. Each node must expose
            ``.name`` and ``.text`` and be convertible with ``str()``
            (presumably BeautifulSoup tags - confirm against the caller).
        :param footnotes: Footnotes, passed through to the result unchanged.
        :param top_level_headers: Sequence of dicts with the keys ``'title'``
            and ``'should_be_numbered'``; only the first item is read.
        :param logger_object: Object providing ``log(message[, level])`` and
            ``log_error_to_main_log()``.
        :param book_api_status: Optional object providing
            ``set_error_status()``; it is called when conversion fails.
        """
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text):
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        :param html_text: Text to process.
        :return: Cleaned text.
        """
        return re.sub(r'([\n\t])', ' ', html_text)

    # TODO: rethink the function structure without indexes.
    def header_to_json(self, ind):
        """
        Function process header and collects all content for it.

        Starting from the header at ``ind``, following nodes are consumed
        until a header with the same or a smaller outline number is met or
        the content list ends. Headers with a larger outline number are
        processed recursively and stored in ``sub_items``.

        :param ind: Index of header in content list.
        :return: Tuple ``(result, ind)``, where ``result`` is a dict with the
            keys ``'title'``, ``'contents'`` and ``'sub_items'`` and ``ind``
            is the index of the first node that was not consumed. If the node
            at ``ind`` is not a supported header, an empty string is returned
            instead of a tuple.
        """
        header = self.content[ind]
        if header.name not in BookConfig.SUPPORTED_HEADERS:
            # NOTE: not a tuple, so callers must check the tag name before
            # unpacking the result (both call sites in this class do).
            return ''

        curr_outline = int(re.sub(r"^h", "", header.name))  # extract outline from tag
        result = {
            'title': header.text,
            'contents': [],
            'sub_items': []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            node = self.content[ind]

            # 1. next tag is not a header: collect it as a paragraph
            if node.name not in BookConfig.SUPPORTED_HEADERS:
                ch_content.append(self.format_html(str(node)))
                ind += 1
                continue

            # 2. next tag is a header
            outline = int(re.sub(r"^h", "", node.name))
            if outline <= curr_outline:
                # same or higher level header: it belongs to the caller
                break

            # deeper header: the recursion consumes it and moves the index
            header_dict, ind = self.header_to_json(ind)
            if ch_content:
                # flush paragraphs collected before this sub-header as one item
                result['contents'].append("".join(ch_content))
                ch_content = []
            result['sub_items'].append(header_dict)

        if ch_content:
            result['contents'].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag):
        """
        Check whether a tag is a ``p`` tag without any visible text.

        ``br`` tags are removed from a copy of the tag, so the passed tag
        itself is not modified, and whitespace is ignored.

        :param tag: Node to check; must expose ``.name`` and, for ``p``
            tags, ``find_all()`` and ``.text``.
        :return: True if the tag is a ``p`` tag whose text is empty after
            removing whitespace, False otherwise.
        """
        if tag.name != 'p':
            return False

        temp_tag = copy(tag)
        for br in temp_tag.find_all('br'):
            br.decompose()

        text = re.sub(r'\s+', '', temp_tag.text)
        return not text

    def convert_to_json(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Content that precedes a header, or is not covered by one, is grouped
        into chapters titled ``'Untitled chapter N'`` (empty ``p`` tags are
        skipped there). The result is also stored in ``self.content_dict``.

        :return: Dict with the keys ``"content"`` (list of chapter dicts) and
            ``"footnotes"``.
        :raises Exception: Any error raised while building the structure is
            logged, reported to ``book_api_status`` (if set) and re-raised.
        """
        json_strc = []
        ind = 0
        ch_num = 0
        ch_amt = 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_json(ind)
                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
                    while (ind < len(self.content)
                           and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS):
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            'title': chapter_title,
                            'contents': ["".join(chapter)],
                            'sub_items': []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(f'Chapter {ch_amt} has been added to structure.')
        except Exception:
            self.logger_object.log('Error has occurred while making json structure.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error_status()
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted.
        # json_strc may be empty (no content, or only empty p tags), so it
        # is checked too to avoid an IndexError on json_strc[0].
        if self.top_level_headers and json_strc:
            same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
            is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']

            json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

        return self.content_dict