Extract class removal into a function

This commit is contained in:
Kiryl
2022-06-23 18:27:23 +03:00
parent 7e380ef431
commit ffa6e90ad5

View File

@@ -96,13 +96,11 @@ def _wrap_strings_with_p(chapter_tag):
for node in chapter_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r"([\n\t\xa0])", " ", content)
# remove spaces at the beginning and at the end of the string:
content = content.strip()
content = re.sub(r"([\s\xa0])", " ", content).strip()
if content:
tag = chapter_tag.new_tag("p")
tag.append(str(node))
node.replace_with(tag)
p_tag = chapter_tag.new_tag("p")
p_tag.append(str(node))
node.replace_with(p_tag)
def _remove_headings_content(content_tag, title_of_chapter: str):
@@ -146,6 +144,7 @@ def _tags_to_correspond_livecarta_tag(chapter_tag):
for key in reg_key:
tags = chapter_tag.find_all(re.compile(key))
for tag in tags:
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
tag.name = to_replace_value
def _unwrap_tags(chapter_tag):
@@ -300,8 +299,6 @@ def _clean_wiley_block(block):
h.insert_before(BeautifulSoup(features="lxml").new_tag("br"))
def _preprocess_block_tags(chapter_tag: Tag):
"""Function preprocessing <block> tags"""
for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
@@ -323,6 +320,13 @@ def _preprocess_block_tags(chapter_tag: Tag):
_wrap_tag_with_table(chapter_tag, future_block, bg_color=color)
def _class_removing(chapter_tag):
    """Strip the ``class`` attribute from every descendant tag of *chapter_tag*.

    Tags carrying one of the preserved classes ("link-anchor" or
    "footnote-element") keep their ``class`` attribute untouched; every other
    tag with a non-empty ``class`` has the attribute deleted in place.

    Args:
        chapter_tag: parsed document/tag exposing ``find_all``; it is mutated.
    """
    preserved_classes = ("link-anchor", "footnote-element")
    for tag in chapter_tag.find_all(recursive=True):
        class_value = tag.attrs.get("class")
        if not class_value:
            # No class attribute (or an empty one): nothing to remove.
            continue
        # The class value may be a list of names (multi-valued attribute, as
        # BeautifulSoup's HTML parsers produce) or a plain string. Comparing
        # the raw value with the preserved names never matches for a list,
        # so normalise to a list of individual class names first.
        if isinstance(class_value, str):
            class_names = class_value.split()
        else:
            class_names = list(class_value)
        if not any(name in preserved_classes for name in class_names):
            del tag.attrs["class"]
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
"""
Function finalise processing/cleaning content
@@ -368,9 +372,6 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
_preprocess_div_tags(content_tag)
_preprocess_block_tags(content_tag)
# 5. remove classes that were created by converter
for tag in content_tag.find_all(recursive=True):
if hasattr(tag, "attrs") and tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
# 5. remove classes that weren't created by converter
_class_removing(content_tag)
return str(content_tag)