This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/json_postprocessor.py
2021-04-26 15:25:21 +03:00

148 lines
5.2 KiB
Python

import logging
import re
from copy import copy
from livecarta_config import LawCartaConfig
class JSONConverter:
    """
    Convert a flat list of html nodes into a nested chapter structure.

    Nodes whose tag name is listed in LawCartaConfig.SUPPORTED_HEADERS open a
    chapter item; deeper headers become sub items of the closest shallower
    header, and every other node is appended to the chapter contents as html.
    """

    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
        """
        Store the conversion inputs; no processing happens here.

        :param content: Indexable sequence of html nodes. Each node must expose
            ``.name`` and be convertible to html with ``str()``.
        :param footnotes: Footnotes, copied as is into the resulting dict.
        :param top_level_headers: Sequence of dicts; only the first entry is
            read, through its 'should_be_numbered' key. May be empty or None.
        :param logger_object: Object providing ``log(message[, level])`` and
            ``log_error_to_main_log()``.
        :param book_api_status: Optional object providing
            ``set_error_status()``, called when the conversion fails.
        """
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text):
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        :param html_text: Text to process.
        :return: Cleaned text.
        """
        return re.sub(r'([\n\t])', ' ', html_text)

    @staticmethod
    def _outline_level(tag_name):
        """
        Extract the numeric outline level from a header tag name.

        :param tag_name: Header tag name, e.g. 'h2'.
        :return: Integer that follows the leading 'h', e.g. 2.
        """
        return int(re.sub(r"^h", "", tag_name))

    # TODO: rethink the function structure without indexes.
    def header_to_livecarta_chapter_item(self, ind) -> (dict, int):
        """
        Function process header and collects all content for it.

        Starting right after the header, nodes are consumed until the end of
        the content or until a header of the same or a shallower level is met.
        Deeper headers are processed recursively and stored in 'sub_items'.

        :param ind: Index of header in content list.
        :return: Tuple of the chapter dict (keys 'title', 'contents',
            'sub_items') and the index of the first node that was not
            consumed. If the node at ind is not a supported header, an empty
            string is returned instead of a tuple.
        """
        header = self.content[ind]
        if header.name not in LawCartaConfig.SUPPORTED_HEADERS:
            # Legacy return value, kept so that existing callers which check
            # for it keep working.
            return ''

        # The title is the header html without its own opening/closing tag.
        # Only a bare '<hN>' opening tag (without attributes) is stripped.
        title = str(header)
        title = title.replace(f'<{header.name}>', '')
        title = title.replace(f'</{header.name}>', '')
        title = re.sub(r'^\n', '', title)
        curr_outline = self._outline_level(header.name)  # extract outline from tag

        result = {
            'title': title,
            'contents': [],
            'sub_items': []
        }
        ch_content = []
        ind += 1
        while ind < len(self.content):
            node = self.content[ind]
            # 1. next tag is a header
            if node.name in LawCartaConfig.SUPPORTED_HEADERS:
                # - current h_i <= h_initial, end of recursion
                if self._outline_level(node.name) <= curr_outline:
                    break
                # - recursion step until h_i > h_initial.
                #   The recursive call returns the index of the next
                #   unprocessed node, so ind must not be advanced here.
                header_dict, ind = self.header_to_livecarta_chapter_item(ind)
                if ch_content:
                    # Flush the paragraphs collected before the sub header.
                    result['contents'].append("".join(ch_content))
                    ch_content = []
                result['sub_items'].append(header_dict)
            # 2. next tag is not a header. add new paragraphs
            else:
                ch_content.append(self.format_html(str(node)))
                ind += 1

        if ch_content:
            result['contents'].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag):
        """
        Check whether tag is a <p> without any visible text.

        :param tag: Html node exposing ``.name``; <p> nodes must also provide
            ``find_all()`` and ``.text``.
        :return: True for a <p> whose text, ignoring <br> tags and whitespace,
            is empty. False for any other node.
        """
        if tag.name != 'p':
            return False

        # The <br> tags are removed from a copy, not from the tag itself.
        # NOTE(review): relies on copy() giving the copy its own children
        # (presumably BeautifulSoup tags) - confirm if the node type changes.
        temp_tag = copy(tag)
        for br in temp_tag.find_all('br'):
            br.decompose()

        text = re.sub(r'\s+', '', temp_tag.text)
        return not text

    def convert_to_dict(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Nodes that precede a supported header (or sit between top level
        chapters) are grouped into 'Untitled chapter N' items; empty <p> tags
        are skipped there. The result is also stored in self.content_dict.

        :return: Dict with keys "content" (list of chapter dicts) and
            "footnotes" (the footnotes passed to the constructor).
        :raises Exception: Any error raised while building the structure is
            logged, reported to book_api_status (if given) and re-raised.
        """
        json_strc = []
        ind = 0
        ch_num = 0  # counter used in titles of untitled chapters
        ch_amt = 0  # amount of chapters added so far, used for logging

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_livecarta_chapter_item(ind)
                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
                    while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            'title': chapter_title,
                            'contents': ["".join(chapter)],
                            'sub_items': []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(f'Chapter {ch_amt} has been added to structure.')
        except Exception:
            self.logger_object.log('Error has occurred while making json structure.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error_status()
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted,
        # so the structure may be empty and must not be indexed in that case.
        if self.top_level_headers and json_strc:
            is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
            json_strc[0]['is_introduction'] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }
        return self.content_dict