improve heading removal

2022-09-22 14:08:53 +03:00
parent f7a37b132f
commit 00308b61e7
1 changed file with 20 additions and 20 deletions
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,5 +1,5 @@
 import re
-from typing import Union
+from typing import List, Union
 from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment

@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
            clean/remove headings & add span with id

        """
-        title_of_chapter = title_of_chapter.lower()
-        for tag in chapter_tag.contents:
-            tag: PageElement
+        def text_preparing(tag: PageElement):
            text: str = tag if isinstance(tag, NavigableString) else tag.text
-            if re.sub(r"[\s\xa0]", "", text):
-                text = re.sub(r"[\s\xa0]", " ", text).lower()
-                text = text.strip()  # delete extra spaces
-                if not isinstance(tag, NavigableString):
-                    if title_of_chapter == text or \
-                            (title_of_chapter in text and
-                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
-                        self.html_preprocessor._add_span_to_save_ids_for_links(
-                            tag, chapter_tag)
-                        tag.extract()
-                        return
-                    elif not self._remove_headings_content(tag, title_of_chapter):
-                        break
-                else:
-                    tag.extract()
-                    return
+            text = re.sub(r"[\s\xa0]", " ", text).lower()
+            text = text.strip()  # delete extra spaces
+            return text
+
+        title_of_chapter: str = title_of_chapter.lower()
+        title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
+                                                (title_of_chapter in text_preparing(tag) and
+                                                 re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
+
+        text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
+        if title_in_text:
+            self.html_preprocessor._add_span_to_save_ids_for_links(
+                title_in_text[-1], chapter_tag)
+            title_in_text[-1].extract()
+        elif text_in_title:
+            [self.html_preprocessor._add_span_to_save_ids_for_links(
+                tag, chapter_tag) for tag in text_in_title]
+            [tag.extract() for tag in text_in_title]

    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):