From de1246d89096a83dc2a2ed4413abb8ca1536689d Mon Sep 17 00:00:00 2001
From: Kiryl
Date: Wed, 22 Jun 2022 18:18:58 +0300
Subject: [PATCH] Improve remove headings content

---
 src/epub_converter/html_epub_preprocessor.py | 22 ++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py
index ed90767..c3ce356 100644
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -190,14 +190,32 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
     title_of_chapter = title_of_chapter.lower()
     for tag in content_tag.contents:
         text = tag if isinstance(tag, NavigableString) else tag.text
-        if text:
+        if re.sub(r'([\s\xa0])', '', text):
             text = re.sub(r"[\s\xa0]", " ", text).lower()
             text = text.strip() # delete extra spaces
             if title_of_chapter == text or \
                     (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)):
                 _add_span_to_save_ids_for_links(tag, content_tag)
                 tag.extract()
-                break
+            elif not isinstance(tag, NavigableString):
+                _remove_headings_content(tag, title_of_chapter)
+            break
+
+
+def _tags_to_correspond_livecarta_tag(chapter_tag):
+    """Function to replace all tags to correspond livecarta tags"""
+    for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items():
+        for key in reg_key:
+            tags = chapter_tag.find_all(re.compile(key))
+            for tag in tags:
+                tag.name = to_replace_value
+
+def _unwrap_tags(chapter_tag):
+    """Function unwrap tags and move id to span"""
+    for tag in LiveCartaConfig. TAGS_TO_UNWRAP:
+        for s in chapter_tag.find_all(tag):
+            _add_span_to_save_ids_for_links(s, chapter_tag)
+            s.unwrap()
 
 
 # todo remove