# Forked from LiveCarta/BookConverter.
import re
|
|
import logging
|
|
from copy import copy
|
|
from typing import List, Tuple, Dict, Union
|
|
from bs4 import Tag
|
|
|
|
from src.livecarta_config import LiveCartaConfig
|
|
|
|
|
|
class LibreHtml2JsonConverter:
    """
    Convert a flat list of html tags into a nested chapter structure.

    Header tags (names listed in ``LiveCartaConfig.SUPPORTED_HEADERS``) open
    chapters; deeper headers that follow become ``sub_items`` of the chapter
    they appear in, and every other tag is serialized into ``contents``.
    """

    def __init__(self, content: List[Tag], footnotes: List[str], top_level_headers: List[Dict[str, Union[str, bool]]],
                 logger_object, book_api_status=None):
        """
        Store the inputs of the conversion; no processing happens here.

        Parameters
        ----------
        content: List[Tag]
            flat list of html tags in document order.
        footnotes: List[str]
            footnotes, copied as is into the resulting dict.
        top_level_headers: List[Dict[str, Union[str, bool]]]
            header descriptions; only the "should_be_numbered" key of the
            first item is read by this class.
        logger_object
            object providing log(message[, level]) and log_error_to_main_log().
        book_api_status
            optional object; its set_error() is called when conversion fails.

        """
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text: str) -> str:
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        Parameters
        ----------
        html_text: str
            text to process.

        Returns
        -------
        new_text: str
            cleaned text

        """
        return re.sub(r"([\n\t])", " ", html_text)

    @staticmethod
    def _outline_level(tag) -> int:
        """Return the numeric level of a header tag, e.g. 2 for a tag named "h2"."""
        return int(re.sub(r"^h", "", tag.name))

    # TODO: rethink the function structure without indexes.
    def header_to_livecarta_chapter_item(self, ind: int) -> Union[Tuple[Dict[str, Union[str, List]], int], str]:
        """
        Function process header and collects all content for it.

        Starting from the header at ``ind`` it consumes the following tags
        until a header of the same or a higher level (smaller or equal
        number) is met or the content list ends. Deeper headers are
        processed recursively and stored in "sub_items".

        Parameters
        ----------
        ind: int
            index of header in content list.

        Returns
        -------
        result, ind
            chapter dict with "title", "contents" and "sub_items" keys and
            the index of the first tag that was not consumed. An empty
            string is returned instead when the tag at ``ind`` is not a
            supported header.

        """
        header = self.content[ind]
        if header.name not in LiveCartaConfig.SUPPORTED_HEADERS:
            return ""

        # NOTE(review): only attribute-less "<hN>" / "</hN>" are stripped, so
        # an opening tag with attributes would stay in the title - confirm
        # that headers always arrive here without attributes.
        title = str(header)
        title = title.replace(f"<{header.name}>", "")
        title = title.replace(f"</{header.name}>", "")
        title = re.sub(r"^\n", "", title)

        curr_outline = self._outline_level(header)
        result = {
            "title": title,
            "contents": [],
            "sub_items": []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            tag = self.content[ind]
            if tag.name in LiveCartaConfig.SUPPORTED_HEADERS:
                # same or higher level header closes the current chapter
                if self._outline_level(tag) <= curr_outline:
                    break
                # deeper header: the recursive call returns the index of the
                # first tag it did not consume
                header_dict, ind = self.header_to_livecarta_chapter_item(ind)
                if ch_content:
                    # flush the text collected before the sub chapter
                    result["contents"].append("".join(ch_content))
                    ch_content = []
                result["sub_items"].append(header_dict)
            else:
                # not a header: the tag belongs to the current chapter
                ch_content.append(self.format_html(str(tag)))
                ind += 1

        if ch_content:
            result["contents"].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag: Tag) -> bool:
        """
        Check whether tag is a <p> without any non-whitespace text.

        Parameters
        ----------
        tag: Tag
            tag to check.

        Returns
        -------
        bool
            True for a "p" tag whose text is empty once <br> tags and
            whitespace are removed, False otherwise (including non "p" tags).

        """
        if tag.name != "p":
            return False

        # work on a copy so that the original tag keeps its <br> children
        temp_tag = copy(tag)
        for br in temp_tag.find_all("br"):
            br.decompose()

        return not re.sub(r"\s+", "", temp_tag.text)

    def convert_to_dict(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Content that precedes a header (or lies between top level chapters
        without a header) is gathered into "Untitled chapter N" items; empty
        <p> tags are skipped there.

        Returns
        -------
        content_dict: dict
            {"content": list of chapter dicts, "footnotes": self.footnotes};
            the same dict is stored in self.content_dict.

        Raises
        ------
        Exception
            any error raised while building the structure is logged, reported
            to book_api_status (when set) and re-raised.

        """
        json_strc, ind, ch_num = [], 0, 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_livecarta_chapter_item(ind)
                else:
                    chapter_title = f"Untitled chapter {ch_num}"
                    chapter = []
                    while ind < len(self.content) \
                            and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(
                                str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            "title": chapter_title,
                            "contents": ["".join(chapter)],
                            "sub_items": []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    self.logger_object.log(
                        f"Chapter {len(json_strc)} has been added to structure.")
        except Exception:
            self.logger_object.log(
                "Error has occurred while making json structure.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error()
            # bare raise keeps the original traceback untouched
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted
        # json_strc may be empty (no content or only empty paragraphs), so
        # the first chapter is touched only when it exists.
        if self.top_level_headers and json_strc:
            is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
            json_strc[0]["is_introduction"] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

        return self.content_dict