forked from LiveCarta/BookConverter
Move the tag-processing functions into the single HTML processing step
This commit is contained in:
@@ -34,69 +34,6 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
|
|||||||
class_=tag_to_be_removed.attrs.get("class"))
|
class_=tag_to_be_removed.attrs.get("class"))
|
||||||
|
|
||||||
|
|
||||||
def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup:
    """
    Main function that works with structure of html. Makes changes in place.

    Parameters
    ----------
    chapter_tag: Tag, soup object

    Steps
    ----------
    1. Removes comments (delegated to ``_remove_comments``).
    2. Wraps NavigableString nodes with <p> (delegated to ``_wrap_strings_with_p``).
    3. Renames every tag whose name matches one of the regex keys of
       ``LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS``
       to the corresponding livecarta tag name.
    4. Unwraps every tag listed in ``LiveCartaConfig.TAGS_TO_UNWRAP``;
       ``_add_span_to_save_ids_for_links`` is called on each tag first
       (presumably so that its id stays reachable for internal links).
    5. Makes sure that marks pointing at the start of a chapter are direct
       children of chapter_tag.
       Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
       If a mark is wrapped with some tags, like:
       <p> <span id="123", class="converter-chapter-mark"> </span> </p>
       the wrapping tags are unwrapped until the mark's parent is chapter_tag.

    Returns
    -------
    chapter_tag: Tag, BeautifulSoup
        adjusted chapter_tag (the same object that was passed in)

    """
    def _tags_to_correspond_livecarta_tag(chapter_tag):
        """Function to replace all tags to correspond livecarta tags"""
        replacements = LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
        for reg_key, to_replace_value in replacements.items():
            # each dict key is itself an iterable of regex patterns
            for key in reg_key:
                for tag in chapter_tag.find_all(re.compile(key)):
                    tag.name = to_replace_value

    def _unwrap_tags(chapter_tag):
        """Function unwrap tags and move id to span"""
        for tag in LiveCartaConfig.TAGS_TO_UNWRAP:
            # find_all returns a materialised result list, so unwrapping
            # while looping over it is safe
            for s in chapter_tag.find_all(tag):
                _add_span_to_save_ids_for_links(s, chapter_tag)
                s.unwrap()

    def _mark_parent_is_body(chapter_tag):
        """Function lifts chapter marks to be direct children of chapter_tag"""
        # check marks for chapter starting are on the same level - 1st
        marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})

        # fix marks to be on 1 level
        for mark in marks:
            # Identity check on purpose: "!=" on soup tags compares the
            # markup recursively, while the question here is only whether
            # the parent *is* the chapter root object.
            while mark.parent is not chapter_tag:
                # todo warning! could reflect on formatting/internal links in some cases
                mark.parent.unwrap()

    # 1. remove comments
    _remove_comments(chapter_tag)

    # 2. wrap NavigableString with tag <p>
    _wrap_strings_with_p(chapter_tag)

    # 3. rename tags to the ones livecarta supports
    _tags_to_correspond_livecarta_tag(chapter_tag)

    # 4. unwrap tags that must not stay in the output
    _unwrap_tags(chapter_tag)

    # 5. put chapter marks on the first level of chapter_tag
    _mark_parent_is_body(chapter_tag)

    return chapter_tag
|
|
||||||
|
|
||||||
|
|
||||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||||
"""
|
"""
|
||||||
After processing on a first_id that corresponds to current chapter,
|
After processing on a first_id that corresponds to current chapter,
|
||||||
@@ -156,6 +93,7 @@ def _remove_comments(chapter_tag):
|
|||||||
|
|
||||||
|
|
||||||
def _wrap_strings_with_p(chapter_tag):
|
def _wrap_strings_with_p(chapter_tag):
|
||||||
|
# Headings that are not supported by livecarta converts to <p>
|
||||||
# wrap NavigableString with <p>
|
# wrap NavigableString with <p>
|
||||||
for node in chapter_tag:
|
for node in chapter_tag:
|
||||||
if isinstance(node, NavigableString):
|
if isinstance(node, NavigableString):
|
||||||
@@ -408,7 +346,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
prepared content
|
prepared content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
# 1. remove comments
|
||||||
|
_remove_comments(content_tag)
|
||||||
|
|
||||||
|
# 2. wrap NavigableString with tag <p>
|
||||||
|
_wrap_strings_with_p(content_tag)
|
||||||
|
|
||||||
|
_tags_to_correspond_livecarta_tag(content_tag)
|
||||||
|
|
||||||
|
_unwrap_tags(content_tag)
|
||||||
|
|
||||||
# 3. heading removal
|
# 3. heading removal
|
||||||
if remove_title_from_chapter:
|
if remove_title_from_chapter:
|
||||||
|
|||||||
Reference in New Issue
Block a user