forked from LiveCarta/BookConverter
fix chapter formatting
This commit is contained in:
@@ -425,17 +425,40 @@ class HTMLPreprocessor:
|
|||||||
f'Tag name: {tag.name}')
|
f'Tag name: {tag.name}')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_header_title(title):
|
def clean_title_from_numbering(title: str):
|
||||||
"""
|
"""
|
||||||
Function to remove digits and extra spaces from headers.
|
Function to remove digits from headers.
|
||||||
|
|
||||||
:param title: Title to process.
|
|
||||||
"""
|
"""
|
||||||
title = re.sub(r'\s+', ' ', title).strip()
|
|
||||||
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
|
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
|
||||||
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
|
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
|
||||||
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
|
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
|
||||||
return title.strip()
|
return title
|
||||||
|
|
||||||
|
@staticmethod
def clean_tag_from_tabs(tag: NavigableString):
    """
    Collapse every run of whitespace in a text node into a single space.

    The node is swapped out (via ``replace_with``) for a new
    NavigableString holding the normalized text. Nothing is returned.

    :param tag: Text node to normalize.
    """
    # r'\s+' matches exactly the same runs as the nested r'(\s+)+' did.
    collapsed = re.sub(r'\s+', ' ', tag)
    # Create the replacement node directly instead of constructing a
    # throwaway BeautifulSoup document (and lxml parser) on every call.
    tag.replace_with(NavigableString(collapsed))
|
||||||
|
|
||||||
|
def clean_tag_from_numbering(self, tag):
    """
    Strip leading numbering from a text node.

    The text is passed through ``clean_title_from_numbering`` and the node
    is swapped out (via ``replace_with``) for a new NavigableString holding
    the result. Nothing is returned.

    :param tag: Text node whose content should lose its numbering prefix.
    """
    without_numbering = self.clean_title_from_numbering(tag)
    # Create the replacement node directly instead of constructing a
    # throwaway BeautifulSoup document (and lxml parser) on every call.
    tag.replace_with(NavigableString(without_numbering))
|
||||||
|
|
||||||
|
def apply_func_to_last_child(self, tag, func=None):
    """
    Apply ``func`` to the text node at the bottom of a chain of nested tags.

    Works only with constructions like (((child to work with))), where the
    innermost child is a NavigableString. Despite the name, the descent
    always follows the FIRST child of each tag.

    The call is a no-op when the chain ends in a tag without children, when
    it ends in a special string node (a NavigableString subclass), or when
    ``func`` is None.

    :param tag: Tag or NavigableString to start the descent from.
    :param func: Callable taking the NavigableString that was found.
    """
    node = tag
    # Exact type check on purpose: NavigableString subclasses (comments and
    # the like) must not be handed to func as if they were visible text.
    while type(node) is not NavigableString:
        if isinstance(node, NavigableString):
            # Special string node: it has no children to descend into.
            return
        first_child = next(iter(node.children), None)
        if first_child is None:
            # Empty tag: nothing to descend into, where indexing the
            # first child used to raise IndexError.
            return
        node = first_child

    if func is not None:
        func(node)
|
||||||
|
|
||||||
def _preprocessing_headings(self):
|
def _preprocessing_headings(self):
|
||||||
"""
|
"""
|
||||||
@@ -476,7 +499,7 @@ class HTMLPreprocessor:
|
|||||||
number = re.match(r'^(?:\.?\d+\.? ?)+', title)
|
number = re.match(r'^(?:\.?\d+\.? ?)+', title)
|
||||||
is_numbered = number is not None
|
is_numbered = number is not None
|
||||||
|
|
||||||
cleaned_title = self.clean_header_title(tag.text)
|
cleaned_title = self.clean_title_from_numbering(tag.text)
|
||||||
is_introduction = cleaned_title.lower() == 'introduction'
|
is_introduction = cleaned_title.lower() == 'introduction'
|
||||||
|
|
||||||
headers_info.append({
|
headers_info.append({
|
||||||
@@ -513,32 +536,27 @@ class HTMLPreprocessor:
|
|||||||
for i in range(0, len(self.top_level_headers)):
|
for i in range(0, len(self.top_level_headers)):
|
||||||
self.top_level_headers[i]['should_be_numbered'] = True
|
self.top_level_headers[i]['should_be_numbered'] = True
|
||||||
|
|
||||||
def _clean_header_by_children(self, tag, is_first_span=None):
    """
    Walk a header tag down to its leaf tags and rewrite their text.

    A leaf (a tag with no child tags) gets its content replaced by its own
    text. That text is passed through ``clean_header_title`` when the leaf
    lies on the "first child" path, and is left unchanged otherwise. A leaf
    whose resulting text is empty is unwrapped.

    :param tag: Tag to process.
    :param is_first_span: Truthy while ``tag`` is on the first-child path
        from the header; only the first direct child inherits it.
    """
    direct_children = tag.find_all(recursive=False)

    if direct_children:
        for position, child in enumerate(direct_children):
            if is_first_span and position == 0:
                self._clean_header_by_children(child, True)
            else:
                self._clean_header_by_children(child)
        return

    leaf_text = tag.text
    new_text = self.clean_header_title(leaf_text) if is_first_span else leaf_text
    tag.string = new_text

    if new_text == '':
        tag.unwrap()
|
|
||||||
|
|
||||||
def _process_headings(self):
|
def _process_headings(self):
|
||||||
"""
|
"""
|
||||||
Function to process tags <h>.
|
Function to process tags <h>.
|
||||||
"""
|
"""
|
||||||
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
||||||
|
|
||||||
|
# 1. remove <b>, <span>
|
||||||
|
for tag in header_tags:
|
||||||
|
b_tags = tag.find_all("b")
|
||||||
|
[tag.unwrap() for tag in b_tags]
|
||||||
|
|
||||||
|
spans = tag.find_all("span")
|
||||||
|
if spans:
|
||||||
|
for span in spans:
|
||||||
|
style = span.attrs.get("style")
|
||||||
|
span.unwrap()
|
||||||
|
tag.attrs = {}
|
||||||
|
|
||||||
|
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
||||||
|
|
||||||
|
# 2. clean text in header from numbering and \n
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
if tag.parent.name == "li":
|
if tag.parent.name == "li":
|
||||||
tag.parent.unwrap()
|
tag.parent.unwrap()
|
||||||
@@ -546,25 +564,34 @@ class HTMLPreprocessor:
|
|||||||
tag.parent.unwrap()
|
tag.parent.unwrap()
|
||||||
|
|
||||||
title = tag.text
|
title = tag.text
|
||||||
title = self.clean_header_title(title)
|
title = self.clean_title_from_numbering(title)
|
||||||
if title == "":
|
if title == "":
|
||||||
tag.unwrap()
|
tag.unwrap()
|
||||||
else:
|
else:
|
||||||
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
||||||
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||||
|
|
||||||
self._clean_header_by_children(tag, is_first_span=True)
|
content = list(tag.children)
|
||||||
|
|
||||||
b_tags = tag.find_all("b")
|
for i, item in enumerate(content):
|
||||||
[tag.unwrap() for tag in b_tags]
|
if type(content[i]) is NavigableString:
|
||||||
|
cleaned = re.sub(r'(\s+)+', ' ', content[i])
|
||||||
|
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
|
||||||
|
content[i].replace_with(this)
|
||||||
|
content[i] = this
|
||||||
|
else:
|
||||||
|
self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs)
|
||||||
|
|
||||||
spans = tag.find_all("span")
|
content[0] = '' if content[0] == ' ' else content[0]
|
||||||
if spans:
|
content = [item for item in content if item != '']
|
||||||
for span in spans:
|
|
||||||
style = span.attrs.get("style")
|
|
||||||
span.unwrap()
|
|
||||||
|
|
||||||
tag.attrs = {}
|
if type(content[0]) is NavigableString:
|
||||||
|
cleaned = self.clean_title_from_numbering(content[0])
|
||||||
|
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
|
||||||
|
content[0].replace_with(this)
|
||||||
|
content[0] = this
|
||||||
|
else:
|
||||||
|
self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering)
|
||||||
|
|
||||||
def _process_lists(self):
|
def _process_lists(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user