forked from LiveCarta/BookConverter
LAW-3261: add is_introduction field to the resulting JSON
This commit is contained in:
82
src/book.py
82
src/book.py
@@ -47,6 +47,7 @@ class Book:
|
|||||||
self.content = list()
|
self.content = list()
|
||||||
self.footnotes = list()
|
self.footnotes = list()
|
||||||
self.images = list()
|
self.images = list()
|
||||||
|
self.top_level_headers = None
|
||||||
self.content_dict = dict()
|
self.content_dict = dict()
|
||||||
self.tables_amount = 0
|
self.tables_amount = 0
|
||||||
|
|
||||||
@@ -593,6 +594,72 @@ class Book:
|
|||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
tag.name = 'p'
|
tag.name = 'p'
|
||||||
|
|
||||||
|
def _get_top_level_headers(self):
|
||||||
|
"""
|
||||||
|
Function for gathering info about top-level chapters.
|
||||||
|
|
||||||
|
Assume:
|
||||||
|
- Headers with smallest outline(or digit in <h>) are top level chapters.
|
||||||
|
[ It is consistent with a recursive algorithm
|
||||||
|
for saving content to a resulted json structure,
|
||||||
|
which happens in header_to_json()]
|
||||||
|
|
||||||
|
"""
|
||||||
|
headers_info = []
|
||||||
|
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
||||||
|
headers_outline = [int(re.sub(r"^h", "", tag.name)) for tag in header_tags]
|
||||||
|
top_level_outline = min(headers_outline)
|
||||||
|
top_level_headers = [tag for tag in header_tags
|
||||||
|
if int(re.sub(r"^h", "", tag.name)) == top_level_outline]
|
||||||
|
|
||||||
|
for tag in top_level_headers:
|
||||||
|
if tag.parent.name == "li":
|
||||||
|
tag.parent.unwrap()
|
||||||
|
while tag.parent.name == "ol":
|
||||||
|
tag.parent.unwrap()
|
||||||
|
|
||||||
|
title = tag.text
|
||||||
|
title = re.sub(r'\s+', ' ', title).strip()
|
||||||
|
number = re.match(r'^(?:\.?\d+\.? ?)+', title)
|
||||||
|
is_numbered = number is not None
|
||||||
|
|
||||||
|
cleaned_title = self.clean_header_title(tag.text)
|
||||||
|
is_introduction = cleaned_title.lower() == 'introduction'
|
||||||
|
|
||||||
|
headers_info.append({
|
||||||
|
'title': cleaned_title,
|
||||||
|
'is_numbered': is_numbered,
|
||||||
|
'is_introduction': is_introduction})
|
||||||
|
|
||||||
|
return headers_info
|
||||||
|
|
||||||
|
def _mark_introduction_headers(self):
|
||||||
|
"""
|
||||||
|
Function to find out:
|
||||||
|
what header shouldn't be numbered and can be treated as introductive chapter
|
||||||
|
|
||||||
|
Assume header(s) to be introductive if:
|
||||||
|
1. one header not numbered, before 1 numbered header
|
||||||
|
2. it is first header from the top level list and it equals to 'introduction'
|
||||||
|
|
||||||
|
Result :
|
||||||
|
Mark each top-level header with flag should_be_numbered = true/false
|
||||||
|
"""
|
||||||
|
is_numbered_header = [header['is_numbered'] for header in self.top_level_headers]
|
||||||
|
is_title = [header['is_introduction'] for header in self.top_level_headers]
|
||||||
|
|
||||||
|
first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
|
||||||
|
second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
|
||||||
|
first_header_is_introduction = is_title and is_title[0]
|
||||||
|
|
||||||
|
if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
|
||||||
|
self.top_level_headers[0]['should_be_numbered'] = False
|
||||||
|
for i in range(1, len(self.top_level_headers)):
|
||||||
|
self.top_level_headers[i]['should_be_numbered'] = True
|
||||||
|
else:
|
||||||
|
for i in range(0, len(self.top_level_headers)):
|
||||||
|
self.top_level_headers[i]['should_be_numbered'] = True
|
||||||
|
|
||||||
def _process_headings(self):
|
def _process_headings(self):
|
||||||
"""
|
"""
|
||||||
Function to process tags <h>.
|
Function to process tags <h>.
|
||||||
@@ -663,6 +730,10 @@ class Book:
|
|||||||
|
|
||||||
self.log(f'Processing TOC and headers.')
|
self.log(f'Processing TOC and headers.')
|
||||||
self._process_toc_links()
|
self._process_toc_links()
|
||||||
|
|
||||||
|
self.top_level_headers = self._get_top_level_headers()
|
||||||
|
self._mark_introduction_headers()
|
||||||
|
|
||||||
self._process_headings()
|
self._process_headings()
|
||||||
|
|
||||||
self.content = self.body_tag.find_all(recursive=False)
|
self.content = self.body_tag.find_all(recursive=False)
|
||||||
@@ -703,17 +774,21 @@ class Book:
|
|||||||
ind += 1
|
ind += 1
|
||||||
|
|
||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
|
# 1. next tag is a header
|
||||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||||
|
# - recursion step until h_i > h_initial
|
||||||
if outline > curr_outline:
|
if outline > curr_outline:
|
||||||
res, ind = self.header_to_json(ind)
|
res, ind = self.header_to_json(ind)
|
||||||
if ch_content:
|
if ch_content:
|
||||||
result[title].append("".join(ch_content))
|
result[title].append("".join(ch_content))
|
||||||
ch_content = []
|
ch_content = []
|
||||||
result[title].append(res)
|
result[title].append(res)
|
||||||
|
# - current h_i <= h_initial, end of recursion
|
||||||
else:
|
else:
|
||||||
# return result, ind
|
# return result, ind
|
||||||
break
|
break
|
||||||
|
# 2. next tag is not a header. add new paragraphs
|
||||||
else:
|
else:
|
||||||
res = self.format_html(str(self.content[ind]))
|
res = self.format_html(str(self.content[ind]))
|
||||||
# result[title].append(res)
|
# result[title].append(res)
|
||||||
@@ -776,6 +851,13 @@ class Book:
|
|||||||
self.set_error_status()
|
self.set_error_status()
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
|
# add is_introduction field if first chapter is Introduction
|
||||||
|
if not self.top_level_headers[0]['should_be_numbered']:
|
||||||
|
# title should be checked as after deleting content
|
||||||
|
# before toc, some chapters can be deleted
|
||||||
|
if self.top_level_headers[0]['title'] in json_strc[0].keys():
|
||||||
|
json_strc[0]['is_introduction'] = True
|
||||||
|
|
||||||
self.content_dict = {
|
self.content_dict = {
|
||||||
"content": json_strc,
|
"content": json_strc,
|
||||||
"footnotes": self.footnotes
|
"footnotes": self.footnotes
|
||||||
|
|||||||
Reference in New Issue
Block a user