This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/docx_converter/libre_html2json_converter.py
2022-09-06 16:36:35 +03:00

162 lines
5.6 KiB
Python

import re
import logging
from copy import copy
from typing import List, Tuple, Dict, Union
from bs4 import Tag
from src.livecarta_config import LiveCartaConfig
class LibreHtml2JsonConverter:
    """
    Convert a flat list of html nodes into a nested chapter structure.

    Every chapter is a dict with the keys "title", "contents" and "sub_items".
    The final result (see convert_to_dict) also carries the footnotes.
    """

    def __init__(self, content: List["Tag"], footnotes: List[str],
                 top_level_headers: List[Dict[str, Union[str, bool]]],
                 logger_object, book_api_status=None):
        """
        Parameters
        ----------
        content: List[Tag]
            flat list of html nodes to convert.
        footnotes: List[str]
            footnotes, passed through unchanged to the resulting dict.
        top_level_headers: List[Dict[str, Union[str, bool]]]
            header descriptions; only the "should_be_numbered" value of the
            first item is read here.
        logger_object
            object providing log(message[, level]) and log_error_to_main_log().
        book_api_status
            optional object; its set_error() is called if conversion fails.
        """
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text: str) -> str:
        """
        Function to remove useless symbols from html code.
        Every newline and tab character is replaced with a single space.

        Parameters
        ----------
        html_text: str
            text to process.

        Returns
        -------
        new_text: str
            cleaned text
        """
        new_text = re.sub(r"([\n\t])", " ", html_text)
        return new_text

    @staticmethod
    def _header_level(tag_name: str) -> int:
        """Return the numeric part of a header tag name (e.g. "h2" -> 2)."""
        return int(re.sub(r"^h", "", tag_name))

    # TODO: rethink the function structure without indexes.
    def header_to_livecarta_chapter_item(self, ind: int) -> Union[Tuple[Dict[str, Union[str, List]], int], str]:
        """
        Function process header and collects all content for it.

        Nodes following the header are gathered until a header of the same or
        a higher level (smaller or equal number) is met. Deeper headers are
        processed recursively and stored in "sub_items".

        Parameters
        ----------
        ind: int
            index of header in content list.

        Returns
        -------
        result, ind
            chapter dict and the index of the first node that was not consumed.
            An empty string is returned if the node at ind is not a supported
            header.
        """
        header = self.content[ind]
        if header.name not in LiveCartaConfig.SUPPORTED_HEADERS:
            return ""

        # title is the header markup without its own (attribute-less) tags
        title = str(header)
        title = title.replace(f"<{header.name}>", "")
        title = title.replace(f"</{header.name}>", "")
        title = re.sub(r"^\n", "", title)

        # extract outline from tag
        curr_outline = self._header_level(header.name)
        result = {
            "title": title,
            "contents": [],
            "sub_items": []
        }

        ch_content = []
        ind += 1
        while ind < len(self.content):
            node = self.content[ind]
            # 1. next tag is a header
            if node.name in LiveCartaConfig.SUPPORTED_HEADERS:
                # - current h_i <= h_initial, end of recursion
                if self._header_level(node.name) <= curr_outline:
                    break
                # - recursion step until h_i > h_initial;
                #   the recursive call returns the index to continue from
                header_dict, ind = self.header_to_livecarta_chapter_item(ind)
                # flush paragraphs collected before the sub-header
                if ch_content:
                    result["contents"].append("".join(ch_content))
                    ch_content = []
                result["sub_items"].append(header_dict)
            # 2. next tag is not a header. add new paragraphs
            else:
                ch_content.append(self.format_html(str(node)))
                ind += 1

        if ch_content:
            result["contents"].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag: "Tag") -> bool:
        """
        Check whether the tag is a <p> without any non-whitespace text.

        <br> tags are removed from a copy of the tag before its text is
        inspected, so the passed tag itself is not modified.
        """
        if tag.name != "p":
            return False

        temp_tag = copy(tag)
        for br in temp_tag.find_all("br"):
            br.decompose()

        return not re.sub(r"\s+", "", temp_tag.text)

    def convert_to_dict(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Nodes which are not preceded by a header are grouped into
        "Untitled chapter N" items (empty <p> tags are skipped there).

        Returns
        -------
        content_dict: dict
            {"content": list of chapter dicts, "footnotes": self.footnotes};
            the same object is stored in self.content_dict.

        Raises
        ------
        Exception
            any error raised while building the structure is logged, reported
            to book_api_status (if given) and re-raised.
        """
        json_strc, ind, ch_num, ch_amt = [], 0, 0, 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_livecarta_chapter_item(ind)
                else:
                    chapter_title = f"Untitled chapter {ch_num}"
                    chapter = []
                    while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(
                                str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            "title": chapter_title,
                            "contents": ["".join(chapter)],
                            "sub_items": []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(
                        f"Chapter {ch_amt} has been added to structure.")
        except Exception:
            self.logger_object.log(
                "Error has occurred while making json structure.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error()
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted,
        # so the structure may be empty: there is nothing to mark in that case
        if self.top_level_headers and json_strc:
            is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
            json_strc[0]["is_introduction"] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }
        return self.content_dict