improve heading removal

This commit is contained in:
Kiryl
2022-09-22 14:08:53 +03:00
parent f7a37b132f
commit 00308b61e7

View File

@@ -1,5 +1,5 @@
import re
from typing import Union
from typing import List, Union
from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
clean/remove headings & add span with id
"""
title_of_chapter = title_of_chapter.lower()
for tag in chapter_tag.contents:
tag: PageElement
def text_preparing(tag: PageElement):
text: str = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
if not isinstance(tag, NavigableString):
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag)
tag.extract()
return
elif not self._remove_headings_content(tag, title_of_chapter):
break
else:
tag.extract()
return
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
return text
title_of_chapter: str = title_of_chapter.lower()
title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
(title_of_chapter in text_preparing(tag) and
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
if title_in_text:
self.html_preprocessor._add_span_to_save_ids_for_links(
title_in_text[-1], chapter_tag)
title_in_text[-1].extract()
elif text_in_title:
[self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag) for tag in text_in_title]
[tag.extract() for tag in text_in_title]
@staticmethod
def _class_removing(chapter_tag: BeautifulSoup):