From 4f7aa69ab3f445abb5b171fc926eeb24e4a06958 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 20 Jul 2022 15:44:28 +0300 Subject: [PATCH] Heading removal fix --- src/epub_converter/html_epub_processor.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index aba8811..752c4ac 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -276,20 +276,25 @@ class HtmlEpubPreprocessor: """ title_of_chapter = title_of_chapter.lower() + if title_of_chapter == "chapter 1": + pass for tag in chapter_tag.contents: text = tag if isinstance(tag, NavigableString) else tag.text if re.sub(r"[\s\xa0]", "", text): text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces - if title_of_chapter == text or \ - (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): - self._add_span_to_save_ids_for_links(tag, chapter_tag) + if not isinstance(tag, NavigableString): + if title_of_chapter == text or \ + (title_of_chapter in text and + re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): + self._add_span_to_save_ids_for_links(tag, chapter_tag) + tag.extract() + return + elif not self._remove_headings_content(tag, title_of_chapter): + break + else: tag.extract() return - elif not isinstance(tag, NavigableString): - if not self._remove_headings_content(tag, title_of_chapter): - break @staticmethod def _process_tables(chapter_tag: BeautifulSoup):