improve heading removal

This commit is contained in:
Kiryl
2022-09-22 14:08:53 +03:00
parent f7a37b132f
commit 00308b61e7

View File

@@ -1,5 +1,5 @@
import re import re
from typing import Union from typing import List, Union
from bs4.element import PageElement from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment from bs4 import BeautifulSoup, Tag, NavigableString, Comment
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
clean/remove headings & add span with id clean/remove headings & add span with id
""" """
title_of_chapter = title_of_chapter.lower() def text_preparing(tag: PageElement):
for tag in chapter_tag.contents:
tag: PageElement
text: str = tag if isinstance(tag, NavigableString) else tag.text text: str = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text): text = re.sub(r"[\s\xa0]", " ", text).lower()
text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces
text = text.strip() # delete extra spaces return text
if not isinstance(tag, NavigableString):
if title_of_chapter == text or \ title_of_chapter: str = title_of_chapter.lower()
(title_of_chapter in text and title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): (title_of_chapter in text_preparing(tag) and
self.html_preprocessor._add_span_to_save_ids_for_links( re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
tag, chapter_tag)
tag.extract() text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
return if title_in_text:
elif not self._remove_headings_content(tag, title_of_chapter): self.html_preprocessor._add_span_to_save_ids_for_links(
break title_in_text[-1], chapter_tag)
else: title_in_text[-1].extract()
tag.extract() elif text_in_title:
return [self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag) for tag in text_in_title]
[tag.extract() for tag in text_in_title]
@staticmethod @staticmethod
def _class_removing(chapter_tag: BeautifulSoup): def _class_removing(chapter_tag: BeautifulSoup):