import logging
import re
from copy import copy
from typing import List, Tuple, Dict, Union

from bs4 import Tag

from src.livecarta_config import LiveCartaConfig


class LibreHtml2JsonConverter:
    """Convert a flat list of html tags into a nested chapter structure.

    Header tags (those whose name is in ``LiveCartaConfig.SUPPORTED_HEADERS``)
    open chapters; the tags that follow a header become its contents, and
    deeper headers become its ``sub_items``.
    """

    def __init__(self, content: List[Tag], footnotes: List[str],
                 top_level_headers: List[Dict[str, Union[str, bool]]],
                 logger_object, book_api_status=None):
        """
        Store the inputs of the conversion.

        Parameters
        ----------
        content: List[Tag]
            tags of the book body, in document order.
        footnotes: List[str]
            footnotes, copied as is into the resulting structure.
        top_level_headers: List[Dict[str, Union[str, bool]]]
            header descriptions; only the "should_be_numbered" key of the
            first item is read by this class.
        logger_object
            object providing ``log`` and ``log_error_to_main_log``.
        book_api_status
            optional object providing ``set_error``.

        """
        # Filled by convert_to_dict().
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text: str) -> str:
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        Parameters
        ----------
        html_text: str
            text to process.

        Returns
        -------
        new_text: str
            cleaned text

        """
        new_text = re.sub(r"([\n\t])", " ", html_text)
        return new_text

    # TODO: rethink the function structure without indexes.
    def header_to_livecarta_chapter_item(
            self, ind: int
    ) -> Union[Tuple[Dict[str, Union[str, List]], int], str]:
        """
        Function process header and collects all content for it.

        Parameters
        ----------
        ind: int
            index of header in content list.

        Returns
        -------
        result, ind
            the chapter dict ("title", "contents", "sub_items") and the
            index of the first tag that does not belong to this chapter.
            An empty string is returned instead when the tag at ``ind`` is
            not a supported header.

        """
        header = self.content[ind]
        if header.name not in LiveCartaConfig.SUPPORTED_HEADERS:
            return ""

        # The title is the header markup without its own bare opening and
        # closing tags; inner markup is kept.
        title = str(header)
        title = title.replace(f"<{header.name}>", "")
        # The closing tag has to be removed as well, otherwise "</hN>" ends
        # up inside the title.
        title = title.replace(f"</{header.name}>", "")
        title = re.sub(r"^\n", "", title)

        # extract outline from tag
        # NOTE(review): assumes header names look like "h<number>" - confirm
        # against LiveCartaConfig.SUPPORTED_HEADERS.
        curr_outline = int(re.sub(r"^h", "", header.name))
        result = {
            "title": title,
            "contents": [],
            "sub_items": []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            tag = self.content[ind]
            # 1. next tag is a header
            if tag.name in LiveCartaConfig.SUPPORTED_HEADERS:
                outline = int(re.sub(r"^h", "", tag.name))
                # - current h_i <= h_initial, end of recursion
                if outline <= curr_outline:
                    break
                # - recursion step until h_i > h_initial
                header_dict, ind = self.header_to_livecarta_chapter_item(ind)
                # Flush the paragraphs gathered before the nested header so
                # they form a separate "contents" entry.
                if ch_content:
                    result["contents"].append("".join(ch_content))
                    ch_content = []
                result["sub_items"].append(header_dict)
            # 2. next tag is not a header. add new paragraphs
            else:
                ch_content.append(self.format_html(str(tag)))
                ind += 1

        if ch_content:
            result["contents"].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag: Tag) -> bool:
        """Tell whether tag is a <p> holding nothing but <br> and whitespace."""
        if tag.name != "p":
            return False

        # Work on a copy so the original tag keeps its <br> children.
        temp_tag = copy(tag)
        for br in temp_tag.find_all("br"):
            br.decompose()

        text = re.sub(r"\s+", "", temp_tag.text)
        return not text

    def convert_to_dict(self) -> Dict[str, List]:
        """
        Function which convert list of html nodes to appropriate json structure.

        Content placed before the first header (or between top-level
        chapters) is collected into "Untitled chapter N" items; empty <p>
        tags are skipped there.

        Returns
        -------
        content_dict
            dict with the "content" (list of chapters) and "footnotes" keys;
            it is also stored in ``self.content_dict``.

        Raises
        ------
        Exception
            any error raised while building the structure is logged,
            reported to ``book_api_status`` (when given) and re-raised.

        """
        json_strc, ind, ch_num, ch_amt = [], 0, 0, 0
        try:
            while ind < len(self.content):
                res = {}
                if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_livecarta_chapter_item(ind)
                else:
                    chapter_title = f"Untitled chapter {ch_num}"
                    chapter = []
                    while (ind < len(self.content)
                           and self.content[ind].name
                           not in LiveCartaConfig.SUPPORTED_HEADERS):
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(
                                self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            "title": chapter_title,
                            "contents": ["".join(chapter)],
                            "sub_items": []
                        }
                        ch_num += 1
                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(
                        f"Chapter {ch_amt} has been added to structure.")
        except Exception:
            self.logger_object.log(
                "Error has occurred while making json structure.",
                logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error()
            # Bare raise re-raises the active exception unchanged.
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted,
        # so the structure may be empty here: there is nothing to mark then.
        if self.top_level_headers and json_strc:
            is_first_header_introduction = \
                not self.top_level_headers[0]["should_be_numbered"]
            json_strc[0]["is_introduction"] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }
        return self.content_dict