fix adding of heading 4

This commit is contained in:
Jeniamakarchik
2020-04-09 17:48:00 +03:00
parent 4704ab29ab
commit 86a2e0876d
2 changed files with 55 additions and 34 deletions

View File

@@ -510,6 +510,15 @@ class Book:
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
return title.strip()
def _preprocessing_headings(self):
    """
    Convert all lower-level headings (h4-h6) to p tags.

    Every tag returned by ``self.body_tag.find_all`` whose name is exactly
    h4, h5 or h6 is renamed in place to ``p``. Only the tag name is
    reassigned here; nothing else on the tag is touched.
    """
    # Anchored on both ends so only the exact names h4, h5 and h6 match.
    lower_level_heading = re.compile("^h[4-6]$")
    for tag in self.body_tag.find_all(lower_level_heading):
        # Renaming in place; no debug output is written to stdout.
        tag.name = 'p'
def _process_headings(self):
"""
Function to process tags <h>.
@@ -526,11 +535,18 @@ class Book:
if title == "":
tag.unwrap()
else:
if tag.name in ["h4", "h5", "h6"]: # All the lower level headings will be transformed to h3 headings
tag.name = "h3"
assert tag.name not in ["h4", "h5", "h6"], 'Preprocessing went wrong, there is still h4-h6 headings.'
# if tag.name in ["h4", "h5", "h6"]:
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
new_tag.string = title
if new_tag.name == "p":
new_tag.attrs = tag.attrs
print(tag)
print(new_tag)
tag.replace_with(new_tag)
def write_html_from_list(self, file_name='url_test.html'):
@@ -553,6 +569,7 @@ class Book:
# process main elements of the .html doc
self.log(f'Processing main elements of html.')
self._preprocessing_headings()
self._process_paragraph()
self._process_two_columns()
self._process_quotes()
@@ -694,6 +711,10 @@ class Book:
with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
json.dump(self.content_dict, f, ensure_ascii=False)
self.log('Data has been saved to .json file.')
from pprint import pprint
pprint(self.content_dict)
except Exception as exc:
self.log('Error has occurred while writing json file.', logging.ERROR)
# self.log_error_to_main_log()