From 0f53caaffaff2c02efbe2a948ff1ff8f3ab23b69 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 18:20:21 +0300 Subject: [PATCH] Replace functions working to the 1 html processing --- src/epub_converter/html_epub_preprocessor.py | 72 +++----------------- 1 file changed, 9 insertions(+), 63 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index c3ce356..e46e46d 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -34,69 +34,6 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou class_=tag_to_be_removed.attrs.get("class")) -def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup: - """ - Main function that works with structure of html. Make changes inplace. - Parameters - ---------- - chapter_tag: Tag, soup object - - Steps - ---------- - 1. Extracts tags that are not needed - 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. - Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed. - This tag must have a chapter_tag as a parent. - Otherwise, it is wrapped with some tags. Like: -

- 3. Headings that are not supported by livecarta converts to <p>
- 4. Wrapping NavigableString - - Returns - ------- - chapter_tag: Tag, BeautifulSoup - adjusted chapter_tag - - """ - def _tags_to_correspond_livecarta_tag(chapter_tag): - """Function to replace all tags to correspond livecarta tags""" - for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_key: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - tag.name = to_replace_value - - def _unwrap_tags(chapter_tag): - """Function unwrap tags and move id to span""" - for tag in LiveCartaConfig. TAGS_TO_UNWRAP: - for s in chapter_tag.find_all(tag): - _add_span_to_save_ids_for_links(s, chapter_tag) - s.unwrap() - - def _mark_parent_is_body(chapter_tag): - # check marks for chapter starting are on the same level - 1st - marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) - - # fix marks to be on 1 level - for mark in marks: - while mark.parent != chapter_tag: - mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - - # 1. remove comments - _remove_comments(chapter_tag) - - # 2. wrap NavigableString with tag

- _wrap_strings_with_p(chapter_tag) - - _tags_to_correspond_livecarta_tag(chapter_tag) - - _unwrap_tags(chapter_tag) - - _mark_parent_is_body(chapter_tag) - - return chapter_tag - - def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: """ After processing on a first_id that corresponds to current chapter, @@ -156,6 +93,7 @@ def _remove_comments(chapter_tag): def _wrap_strings_with_p(chapter_tag): + # Headings that are not supported by livecarta converts to

# wrap NavigableString with <p>
for node in chapter_tag: if isinstance(node, NavigableString): @@ -408,7 +346,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro prepared content """ + # 1. remove comments + _remove_comments(content_tag) + # 2. wrap NavigableString with tag

+ _wrap_strings_with_p(content_tag) + + _tags_to_correspond_livecarta_tag(content_tag) + + _unwrap_tags(content_tag) # 3. heading removal if remove_title_from_chapter: