From 00308b61e7c3f623c90d89205d97a5372acddabf Mon Sep 17 00:00:00 2001
From: Kiryl <kiryl.miatselitsa@teqniksoft.com>
Date: Thu, 22 Sep 2022 14:08:53 +0300
Subject: [PATCH] improve heading removal

---
 src/epub_converter/html_epub_processor.py | 40 +++++++++++------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py
index e92ac8b..40640c1 100644
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,5 +1,5 @@
 import re
-from typing import Union
+from typing import List, Union
 from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment
 
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
             clean/remove headings & add span with id
 
         """
-        title_of_chapter = title_of_chapter.lower()
-        for tag in chapter_tag.contents:
-            tag: PageElement
+        def text_preparing(tag: PageElement):
             text: str = tag if isinstance(tag, NavigableString) else tag.text
-            if re.sub(r"[\s\xa0]", "", text):
-                text = re.sub(r"[\s\xa0]", " ", text).lower()
-                text = text.strip()  # delete extra spaces
-                if not isinstance(tag, NavigableString):
-                    if title_of_chapter == text or \
-                            (title_of_chapter in text and
-                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
-                        self.html_preprocessor._add_span_to_save_ids_for_links(
-                            tag, chapter_tag)
-                        tag.extract()
-                        return
-                    elif not self._remove_headings_content(tag, title_of_chapter):
-                        break
-                else:
-                    tag.extract()
-                    return
+            text = re.sub(r"[\s\xa0]", " ", text).lower()
+            text = text.strip()  # delete extra spaces
+            return text
+
+        title_of_chapter: str = title_of_chapter.lower()
+        title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or (
+            title_of_chapter in text_preparing(tag) and re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
+        if title_in_text:
+            self.html_preprocessor._add_span_to_save_ids_for_links(title_in_text[-1], chapter_tag)
+            title_in_text[-1].extract()
+            return
+        # "" is a substring of every title, so tags without any text must be excluded explicitly
+        text_in_title: List[Tag] = chapter_tag.find_all(
+            lambda tag: bool(text_preparing(tag)) and text_preparing(tag) in title_of_chapter)
+        for tag in text_in_title:  # save ids of all fragments before detaching any of them
+            self.html_preprocessor._add_span_to_save_ids_for_links(tag, chapter_tag)
+        for tag in text_in_title:
+            tag.extract()
 
     @staticmethod
     def _class_removing(chapter_tag: BeautifulSoup):