Move the structural-tag functions into a single HTML processing step

This commit is contained in:
Kiryl
2022-06-22 18:20:21 +03:00
parent de1246d890
commit 0f53caaffa

View File

@@ -34,69 +34,6 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
class_=tag_to_be_removed.attrs.get("class"))
def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup:
    """
    Main function that works with structure of html. Makes changes inplace.

    Parameters
    ----------
    chapter_tag: Tag, soup object
        root tag of the chapter; it is modified and also returned

    Steps
    ----------
    1. Removes comments (_remove_comments)
    2. Wraps NavigableString with <p> (_wrap_strings_with_p)
    3. Renames every tag whose name matches a pattern from
       LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
       to the corresponding livecarta tag
    4. Unwraps every tag listed in LiveCartaConfig.TAGS_TO_UNWRAP,
       passing it to _add_span_to_save_ids_for_links first so that
       its id stays available for links
    5. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
       Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
       This tag must have a chapter_tag as a parent.
       Otherwise, it is wrapped with some tags. Like:
       <p> <span id="123", class="converter-chapter-mark"> </span> </p>
       Such wrappers are unwrapped until the mark is a direct child of chapter_tag.

    Returns
    -------
    chapter_tag: Tag, BeautifulSoup
        adjusted chapter_tag

    """

    def _tags_to_correspond_livecarta_tag(root):
        """Function to replace all tags to correspond livecarta tags"""
        replacements = LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
        # each key is iterated as a collection of regex patterns;
        # the value is the tag name that the matched tags receive
        for patterns, livecarta_name in replacements.items():
            for pattern in patterns:
                for matched_tag in root.find_all(re.compile(pattern)):
                    matched_tag.name = livecarta_name

    def _unwrap_tags(root):
        """Function unwrap tags and move id to span"""
        for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
            for tag_to_unwrap in root.find_all(tag_name):
                # the helper must run before unwrap(): it needs the tag
                # while it is still attached to the tree
                _add_span_to_save_ids_for_links(tag_to_unwrap, root)
                tag_to_unwrap.unwrap()

    def _mark_parent_is_body(root):
        """Function makes every chapter mark a direct child of the root tag"""
        # check marks for chapter starting are on the same level - 1st
        marks = root.find_all(attrs={"class": "converter-chapter-mark"})
        # fix marks to be on 1 level
        for mark in marks:
            # identity check: "!=" on bs4 tags compares the markup they
            # represent, while the question here is whether the parent
            # is this very root object
            while mark.parent is not root:
                # todo warning! could reflect on formatting/internal links in some cases
                mark.parent.unwrap()

    # 1. remove comments
    _remove_comments(chapter_tag)
    # 2. wrap NavigableString with tag <p>
    _wrap_strings_with_p(chapter_tag)
    # 3. replace tags with the corresponding livecarta tags
    _tags_to_correspond_livecarta_tag(chapter_tag)
    # 4. unwrap tags, ids are saved for links
    _unwrap_tags(chapter_tag)
    # 5. put chapter marks on the first level of chapter_tag
    _mark_parent_is_body(chapter_tag)
    return chapter_tag
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""
After processing on a first_id that corresponds to current chapter,
@@ -156,6 +93,7 @@ def _remove_comments(chapter_tag):
def _wrap_strings_with_p(chapter_tag):
# Headings that are not supported by livecarta converts to <p>
# wrap NavigableString with <p>
for node in chapter_tag:
if isinstance(node, NavigableString):
@@ -408,7 +346,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
prepared content
"""
# 1. remove comments
_remove_comments(content_tag)
# 2. wrap NavigableString with tag <p>
_wrap_strings_with_p(content_tag)
_tags_to_correspond_livecarta_tag(content_tag)
_unwrap_tags(content_tag)
# 3. heading removal
if remove_title_from_chapter: