forked from LiveCarta/BookConverter
- split book class into html-preprocessor, json-converter, book (with main flow) classes. - pick out logging, law carta setup, updating status via api into separate objects - in html-preprocessor: add cleaning hrefs
146 lines
5.0 KiB
Python
146 lines
5.0 KiB
Python
import logging
|
|
import re
|
|
import codecs
|
|
import json
|
|
|
|
from copy import copy
|
|
from config import BookConfig
|
|
|
|
|
|
class JSONConverter:
    """
    Convert a flat list of parsed html nodes into a nested chapter structure.

    Header tags (those whose name is listed in ``BookConfig.SUPPORTED_HEADERS``)
    open chapters; deeper headers become ``sub_items`` of the chapter above
    them; everything else is serialized to html and stored in ``contents``.
    """

    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
        """
        :param content: Indexable sequence of html nodes. Each node must expose
            ``.name`` and ``.text`` and be convertible with ``str()``
            (presumably BeautifulSoup tags - confirm against the caller).
        :param footnotes: Footnotes, passed through to the result unchanged.
        :param top_level_headers: Sequence of dicts with the keys ``'title'``
            and ``'should_be_numbered'``; only the first item is read.
        :param logger_object: Object providing ``log(message[, level])`` and
            ``log_error_to_main_log()``.
        :param book_api_status: Optional object providing ``set_error_status()``.
        """
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text):
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        :param html_text: Text to process.
        :return: Cleaned text.
        """
        return re.sub(r'([\n\t])', ' ', html_text)

    @staticmethod
    def _outline_level(tag_name):
        """
        Extract the numeric outline level from a header tag name.

        :param tag_name: Header tag name, e.g. ``'h2'``.
        :return: Outline level as int, e.g. ``2``.
        """
        return int(re.sub(r"^h", "", tag_name))

    # TODO: rethink the function structure without indexes.
    def header_to_json(self, ind):
        """
        Function process header and collects all content for it.

        Starting at the header located at ``ind``, consumes the following nodes
        until a header of the same or a higher level (or the end of content)
        is met. Deeper headers are processed recursively into ``sub_items``.

        :param ind: Index of header in content list.
        :return: Tuple ``(chapter_dict, next_index)`` where ``next_index`` is
            the index of the first node that was not consumed. If the node at
            ``ind`` is not a supported header, an empty string is returned
            instead (kept for backward compatibility; callers in this class
            always check the tag name first).
        """
        if self.content[ind].name not in BookConfig.SUPPORTED_HEADERS:
            return ''

        curr_outline = self._outline_level(self.content[ind].name)
        result = {
            'title': self.content[ind].text,
            'contents': [],
            'sub_items': []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            node = self.content[ind]

            # 1. next tag is not a header: accumulate its html
            if node.name not in BookConfig.SUPPORTED_HEADERS:
                ch_content.append(self.format_html(str(node)))
                ind += 1
                continue

            # 2. next tag is a header of the same or a higher level: this chapter ends here
            if self._outline_level(node.name) <= curr_outline:
                break

            # 3. next tag is a deeper header: close the pending text and recurse.
            #    The recursive call returns the index to continue from.
            if ch_content:
                result['contents'].append("".join(ch_content))
                ch_content = []
            header_dict, ind = self.header_to_json(ind)
            result['sub_items'].append(header_dict)

        if ch_content:
            result['contents'].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag):
        """
        Check whether a tag is a ``p`` tag that holds no visible text.

        ``br`` children are removed from a copy of the tag (the original is left
        untouched) and the remaining text is checked after dropping whitespace.

        :param tag: Html node to check.
        :return: True if the tag is a ``p`` with only whitespace/``br`` inside,
            False otherwise (including for every non-``p`` tag).
        """
        if tag.name != 'p':
            return False

        temp_tag = copy(tag)
        for br in temp_tag.find_all('br'):
            br.decompose()

        return not re.sub(r'\s+', '', temp_tag.text)

    def convert_to_json(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Content that precedes the first header (or sits between top-level
        chapters without a header) is grouped into 'Untitled chapter N'
        entries, skipping empty ``p`` tags. The result is also stored in
        ``self.content_dict``.

        :return: Dict with the keys ``"content"`` (list of chapter dicts) and
            ``"footnotes"``.
        :raises Exception: Any error raised while building the structure is
            logged, reported via ``book_api_status`` (if given) and re-raised.
        """
        json_strc = []
        ind = 0
        ch_num = 0
        ch_amt = 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_json(ind)
                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
                    while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            'title': chapter_title,
                            'contents': ["".join(chapter)],
                            'sub_items': []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(f'Chapter {ch_amt} has been added to structure.')
        except Exception:
            self.logger_object.log('Error has occurred while making json structure.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error_status()
            # bare raise keeps the original traceback intact
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted,
        # so the structure may be empty - nothing to mark in that case.
        if self.top_level_headers and json_strc:
            same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
            is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']

            json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

        return self.content_dict