From 00308b61e7c3f623c90d89205d97a5372acddabf Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 22 Sep 2022 14:08:53 +0300 Subject: [PATCH] improve heading removal --- src/epub_converter/html_epub_processor.py | 40 +++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index e92ac8b..40640c1 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -1,5 +1,5 @@ import re -from typing import Union +from typing import List, Union from bs4.element import PageElement from bs4 import BeautifulSoup, Tag, NavigableString, Comment @@ -92,26 +92,26 @@ class HtmlEpubProcessor: clean/remove headings & add span with id """ - title_of_chapter = title_of_chapter.lower() - for tag in chapter_tag.contents: - tag: PageElement + def text_preparing(tag: PageElement): text: str = tag if isinstance(tag, NavigableString) else tag.text - if re.sub(r"[\s\xa0]", "", text): - text = re.sub(r"[\s\xa0]", " ", text).lower() - text = text.strip() # delete extra spaces - if not isinstance(tag, NavigableString): - if title_of_chapter == text or \ - (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): - self.html_preprocessor._add_span_to_save_ids_for_links( - tag, chapter_tag) - tag.extract() - return - elif not self._remove_headings_content(tag, title_of_chapter): - break - else: - tag.extract() - return + text = re.sub(r"[\s\xa0]", " ", text).lower() + text = text.strip() # delete extra spaces + return text + + title_of_chapter: str = title_of_chapter.lower() + title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \ + (title_of_chapter in text_preparing(tag) and + re.findall(r"^h[1-3]$", tag.name or chapter_tag.name))) + + text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter)) + if title_in_text: + self.html_preprocessor._add_span_to_save_ids_for_links( + title_in_text[-1], chapter_tag) + title_in_text[-1].extract() + elif text_in_title: + [self.html_preprocessor._add_span_to_save_ids_for_links( + tag, chapter_tag) for tag in text_in_title] + [tag.extract() for tag in text_in_title] @staticmethod def _class_removing(chapter_tag: BeautifulSoup):