forked from LiveCarta/BookConverter
converter: just renaming file and functions
This commit is contained in:
147
src/json_postprocessor.py
Normal file
147
src/json_postprocessor.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import logging
|
||||
import re
|
||||
from copy import copy
|
||||
|
||||
from config import LawCartaConfig
|
||||
|
||||
|
||||
class JSONConverter:
    """
    Build the nested chapter structure (title / contents / sub_items)
    from a flat list of parsed html nodes.

    The nodes are expected to expose ``.name`` and to be convertible to
    html with ``str(node)``; ``_is_empty_p_tag`` additionally uses
    ``find_all``, ``decompose`` and ``.text`` (presumably BeautifulSoup
    tags -- confirm against the caller).
    """

    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
        """
        :param content: Flat list of html nodes to convert.
        :param footnotes: Footnotes, passed through unchanged to the result.
        :param top_level_headers: List of dicts; only the first item's
            'should_be_numbered' key is read here.
        :param logger_object: Object providing ``log(message[, level])`` and
            ``log_error_to_main_log()``.
        :param book_api_status: Optional object providing ``set_error_status()``.
        """
        # Filled by convert_to_dict(); None until the conversion has run.
        self.content_dict = None
        self.content = content
        self.footnotes = footnotes
        self.top_level_headers = top_level_headers
        self.logger_object = logger_object
        self.book_api_status = book_api_status

    @staticmethod
    def format_html(html_text):
        """
        Function to remove useless symbols from html code.

        Every newline and tab character is replaced with a single space.

        :param html_text: Text to process.
        :return: Cleaned text.
        """
        return re.sub(r'([\n\t])', ' ', html_text)

    # TODO: rethink the function structure without indexes.
    def header_to_livecarta_chapter_item(self, ind) -> (dict, int):
        """
        Function process header and collects all content for it.

        Starting at the header located at ``ind``, consumes the following
        nodes until a header of the same or a higher level (smaller or equal
        number) is met. Deeper headers are converted recursively and stored
        in 'sub_items'; other nodes are joined into strings in 'contents'.

        :param ind: Index of header in content list.
        :return: Tuple of (chapter dict, index of the first node that was
            not consumed). If the node at ``ind`` is not a supported header,
            an empty string is returned instead (kept for backward
            compatibility).
        """
        header = self.content[ind]
        if header.name not in LawCartaConfig.SUPPORTED_HEADERS:
            return ''

        # Only the bare opening/closing header tags are stripped here.
        title = str(header)
        title = title.replace(f'<{header.name}>', '')
        title = title.replace(f'</{header.name}>', '')
        title = re.sub(r'^\n', '', title)

        curr_outline = int(re.sub(r"^h", "", header.name))  # extract outline from tag
        result = {
            'title': title,
            'contents': [],
            'sub_items': []
        }
        ch_content = []
        ind += 1

        while ind < len(self.content):
            node = self.content[ind]

            # 1. next tag is not a header. add new paragraphs
            if node.name not in LawCartaConfig.SUPPORTED_HEADERS:
                ch_content.append(self.format_html(str(node)))
                ind += 1
                continue

            # 2. next tag is a header
            outline = int(re.sub(r"^h", "", node.name))
            if outline <= curr_outline:
                # - current h_i <= h_initial, end of recursion
                break

            # - recursion step until h_i > h_initial;
            #   the recursive call returns the index to continue from
            header_dict, ind = self.header_to_livecarta_chapter_item(ind)
            if ch_content:
                # flush paragraphs collected before the sub-header
                result['contents'].append("".join(ch_content))
                ch_content = []
            result['sub_items'].append(header_dict)

        if ch_content:
            result['contents'].append("".join(ch_content))
        return result, ind

    @staticmethod
    def _is_empty_p_tag(tag):
        """
        Check whether the tag is a <p> that holds nothing but whitespace
        and <br> elements.

        :param tag: Html node to check.
        :return: True for an empty <p> tag, False otherwise (including any
            tag whose name is not 'p').
        """
        if tag.name != 'p':
            return False

        # Work on a copy so that removing <br> does not touch the original tag.
        temp_tag = copy(tag)
        for br in temp_tag.find_all('br'):
            br.decompose()

        text = re.sub(r'\s+', '', temp_tag.text)
        return not text

    def convert_to_dict(self):
        """
        Function which convert list of html nodes to appropriate json structure.

        Nodes that start with a supported header are converted with
        header_to_livecarta_chapter_item(); runs of non-header nodes become
        'Untitled chapter N' items (empty <p> tags are skipped, and a run
        that contains nothing else produces no chapter).

        :return: Dict with keys "content" (list of chapter dicts) and
            "footnotes"; the same dict is stored in ``self.content_dict``.
        :raises Exception: Any error raised while building the structure is
            logged, reported to ``book_api_status`` (when set) and re-raised.
        """
        json_strc = []
        ind = 0
        ch_num = 0
        ch_amt = 0

        try:
            while ind < len(self.content):
                res = {}

                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_livecarta_chapter_item(ind)
                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
                    while ind < len(self.content) \
                            and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
                        res = {
                            'title': chapter_title,
                            'contents': ["".join(chapter)],
                            'sub_items': []
                        }
                        ch_num += 1

                if res:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(f'Chapter {ch_amt} has been added to structure.')
        except Exception:
            self.logger_object.log('Error has occurred while making json structure.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error_status()
            # bare raise keeps the original traceback intact
            raise

        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted,
        # so the structure may be empty: only mark the first chapter if it exists.
        if self.top_level_headers and json_strc:
            json_strc[0]['is_introduction'] = not self.top_level_headers[0]['should_be_numbered']

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

        return self.content_dict
|
||||
Reference in New Issue
Block a user