Improve heading removal

This commit is contained in:
Kiryl
2022-09-22 14:08:53 +03:00
parent f7a37b132f
commit 00308b61e7

View File

@@ -1,5 +1,5 @@
import re import re
from typing import Union from typing import List, Union
from bs4.element import PageElement from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment from bs4 import BeautifulSoup, Tag, NavigableString, Comment
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
clean/remove headings & add span with id clean/remove headings & add span with id
""" """
title_of_chapter = title_of_chapter.lower() def text_preparing(tag: PageElement):
for tag in chapter_tag.contents:
tag: PageElement
text: str = tag if isinstance(tag, NavigableString) else tag.text text: str = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower() text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces text = text.strip() # delete extra spaces
if not isinstance(tag, NavigableString): return text
if title_of_chapter == text or \
(title_of_chapter in text and title_of_chapter: str = title_of_chapter.lower()
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
(title_of_chapter in text_preparing(tag) and
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
if title_in_text:
self.html_preprocessor._add_span_to_save_ids_for_links( self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag) title_in_text[-1], chapter_tag)
tag.extract() title_in_text[-1].extract()
return elif text_in_title:
elif not self._remove_headings_content(tag, title_of_chapter): [self.html_preprocessor._add_span_to_save_ids_for_links(
break tag, chapter_tag) for tag in text_in_title]
else: [tag.extract() for tag in text_in_title]
tag.extract()
return
@staticmethod @staticmethod
def _class_removing(chapter_tag: BeautifulSoup): def _class_removing(chapter_tag: BeautifulSoup):