forked from LiveCarta/BookConverter
Move the functions' work into a single HTML-processing step
This commit is contained in:
@@ -34,69 +34,6 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
|
||||
class_=tag_to_be_removed.attrs.get("class"))
|
||||
|
||||
|
||||
def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup:
    """
    Main function that works with the structure of html. Makes changes in place.

    Parameters
    ----------
    chapter_tag: Tag, soup object
        html tree to adjust; it is modified in place

    Steps
    ----------
    1. Removes comments (delegated to _remove_comments)
    2. Wraps NavigableString with <p> (delegated to _wrap_strings_with_p)
    3. Renames every tag whose name matches one of the regexes in
       LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
       to the livecarta tag configured for that regex group
    4. Unwraps every tag listed in LiveCartaConfig.TAGS_TO_UNWRAP;
       _add_span_to_save_ids_for_links is called on the tag first
       (presumably so its id stays reachable for links - see that helper)
    5. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
       Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
       This tag must have a chapter_tag as a parent.
       Otherwise, it is wrapped with some tags. Like:
       <p> <span id="123", class="converter-chapter-mark"> </span> </p>
       Such wrappers are unwrapped until the mark is a direct child of chapter_tag.

    Returns
    -------
    chapter_tag: Tag, BeautifulSoup
        adjusted chapter_tag (the same object that was passed in)

    """
    def _tags_to_correspond_livecarta_tag(chapter_tag):
        """Function to replace all tags to correspond livecarta tags"""
        # each config key is iterated as a group of regex patterns sharing one replacement name
        for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items():
            for key in reg_keys:
                for tag in chapter_tag.find_all(re.compile(key)):
                    tag.name = to_replace_value

    def _unwrap_tags(chapter_tag):
        """Function unwrap tags and move id to span"""
        for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
            for tag in chapter_tag.find_all(tag_name):
                # must run before unwrap(): the tag itself disappears from the tree afterwards
                _add_span_to_save_ids_for_links(tag, chapter_tag)
                tag.unwrap()

    def _mark_parent_is_body(chapter_tag):
        """Function lifts chapter marks so that each one is a direct child of chapter_tag"""
        # check marks for chapter starting are on the same level - 1st
        marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})

        # fix marks to be on 1 level
        for mark in marks:
            # identity check on purpose: "!=" on bs4 tags compares name/attrs/contents,
            # while the question here is only "is the parent this very object"
            while mark.parent is not chapter_tag:
                mark.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases

    # 1. remove comments
    _remove_comments(chapter_tag)

    # 2. wrap NavigableString with tag <p>
    _wrap_strings_with_p(chapter_tag)

    # 3. rename tags to the ones livecarta supports
    _tags_to_correspond_livecarta_tag(chapter_tag)

    # 4. unwrap tags that must not stay in the tree
    _unwrap_tags(chapter_tag)

    # 5. put chapter marks on the first level
    _mark_parent_is_body(chapter_tag)

    return chapter_tag
|
||||
|
||||
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
"""
|
||||
After processing on a first_id that corresponds to current chapter,
|
||||
@@ -156,6 +93,7 @@ def _remove_comments(chapter_tag):
|
||||
|
||||
|
||||
def _wrap_strings_with_p(chapter_tag):
|
||||
# Headings that are not supported by livecarta converts to <p>
|
||||
# wrap NavigableString with <p>
|
||||
for node in chapter_tag:
|
||||
if isinstance(node, NavigableString):
|
||||
@@ -408,7 +346,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
prepared content
|
||||
|
||||
"""
|
||||
# 1. remove comments
|
||||
_remove_comments(content_tag)
|
||||
|
||||
# 2. wrap NavigableString with tag <p>
|
||||
_wrap_strings_with_p(content_tag)
|
||||
|
||||
_tags_to_correspond_livecarta_tag(content_tag)
|
||||
|
||||
_unwrap_tags(content_tag)
|
||||
|
||||
# 3. heading removal
|
||||
if remove_title_from_chapter:
|
||||
|
||||
Reference in New Issue
Block a user