Heading removal fix

This commit is contained in:
Kiryl
2022-07-20 15:44:28 +03:00
parent 20fa1bfa86
commit 4f7aa69ab3

View File

@@ -276,20 +276,25 @@ class HtmlEpubPreprocessor:
""" """
title_of_chapter = title_of_chapter.lower() title_of_chapter = title_of_chapter.lower()
if title_of_chapter == "chapter 1":
pass
for tag in chapter_tag.contents: for tag in chapter_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text text = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text): if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower() text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces text = text.strip() # delete extra spaces
if title_of_chapter == text or \ if not isinstance(tag, NavigableString):
(title_of_chapter in text and if title_of_chapter == text or \
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): (title_of_chapter in text and
self._add_span_to_save_ids_for_links(tag, chapter_tag) re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
self._add_span_to_save_ids_for_links(tag, chapter_tag)
tag.extract()
return
elif not self._remove_headings_content(tag, title_of_chapter):
break
else:
tag.extract() tag.extract()
return return
elif not isinstance(tag, NavigableString):
if not self._remove_headings_content(tag, title_of_chapter):
break
@staticmethod @staticmethod
def _process_tables(chapter_tag: BeautifulSoup): def _process_tables(chapter_tag: BeautifulSoup):