import re

from bs4 import BeautifulSoup, NavigableString, Tag, Comment

from src.livecarta_config import LiveCartaConfig


def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
    """
    Function adds span with id from tag_to_be_removed
    because this tag will be removed (unwrapped/extracted)

    Nothing is inserted when tag_to_be_removed has no (or an empty) id.

    Parameters
    ----------
    tag_to_be_removed: Soup object
        Tag whose id (and class) are copied onto the new span
    chapter_tag: BeautifulSoup
        Soup object used to create the new span

    Returns
    -------
    None
        The tree is updated in place: the span is inserted before tag_to_be_removed

    """
    def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
                                           tag_to_be_removed: Tag,
                                           id_: str,
                                           class_: list):
        """Function inserts a span with the given id/class before the tag"""
        new_tag = chapter_tag.new_tag("span")
        new_tag.attrs["id"] = id_ or ""
        new_tag.attrs["class"] = class_ or ""
        # the span gets a non-breaking space as its only content
        new_tag.string = "\xa0"
        tag_to_be_removed.insert_before(new_tag)

    if tag_to_be_removed.attrs.get("id"):
        _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
                                           tag_to_be_removed=tag_to_be_removed,
                                           id_=tag_to_be_removed.attrs["id"],
                                           class_=tag_to_be_removed.attrs.get("class"))


def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    Extract from html_soup all sibling nodes that follow the chapter mark
    with id first_id, up to (not including) the next chapter mark

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts.
        A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file (only used in the error message)
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags, already extracted from html_soup

    Raises
    ------
    AssertionError
        If no tag with id first_id and class "converter-chapter-mark" is found

    """
    chapter_mark_class = "converter-chapter-mark"
    marked_tag = html_soup.find(
        attrs={"id": first_id, "class": chapter_mark_class})
    if not marked_tag:
        # explicit raise instead of `assert 0`: asserts are stripped under -O,
        # which used to end in an UnboundLocalError on `tags`
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    next_tag = marked_tag.next_sibling
    # compare with None: an empty string node is falsy and must not end the scan
    while next_tag is not None:
        if isinstance(next_tag, Tag):
            # class may be a plain string (set by hand) or a list (set by the parser)
            classes = next_tag.attrs.get("class") or []
            if isinstance(classes, str):
                classes = classes.split()
            if chapter_mark_class in classes:
                break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags


def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title

    The title is parsed as HTML, its text is taken, every whitespace
    character is replaced by a plain space and the result is stripped.
    """
    title_soup = BeautifulSoup(title_of_chapter, features="lxml")
    title_str = title_soup.string
    if title_str is None:
        # .string is None for empty input or nested markup; fall back to all text
        title_str = title_soup.get_text()
    # clean extra whitespace characters ([\r\n\t\f\v ])
    title_str = re.sub(r"[\s\xa0]", " ", title_str).strip()
    return title_str


def _remove_comments(chapter_tag):
    """Function removes every HTML comment found anywhere inside chapter_tag"""
    # one search from the root also catches comments that are direct children
    for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()


def _wrap_strings_with_p(chapter_tag):
    """Function wraps every non-blank direct string child of chapter_tag with <p>"""
    # iterate over a snapshot because the children are replaced inside the loop
    for node in list(chapter_tag.contents):
        if not isinstance(node, NavigableString):
            continue
        content = re.sub(r"([\s\xa0])", " ", str(node)).strip()
        if content:
            p_tag = chapter_tag.new_tag("p")
            # the original (unstripped) text is kept inside the paragraph
            p_tag.append(str(node))
            node.replace_with(p_tag)
def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps the tags listed in LiveCartaConfig.WRAP_TAGS_WITH_TABLE
    with a single-cell table and then unwraps the original tag

    For every configured entry the value is either a tuple, whose first two
    items are an attribute name and a regex its value must match, or a
    collection of attribute names, any of which must be present on the tag.

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Soup object of the chapter, updated in place

    Returns
    -------
    None

    """
    # NOTE(review): the original docstring was corrupted and had absorbed what
    # look like calls from a separate content-preparation routine
    # (_wrap_strings_with_p, _wrap_tags_with_table,
    # _tags_to_correspond_livecarta_tag, _unwrap_tags, _remove_headings_content).
    # That routine is not visible here - restore it from version control.

    def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
        """Function builds table > tbody > tr > td around the tag, adds a br after the table"""
        table = chapter_tag.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"

        tbody = chapter_tag.new_tag("tbody")
        tr = chapter_tag.new_tag("tr")
        td = chapter_tag.new_tag("td")
        td.attrs["bgcolor"] = bg_color

        # wrap from the inside out: tag -> td -> tr -> tbody -> table
        inner = tag_to_be_wrapped
        for outer in (td, tr, tbody, table):
            inner.wrap(outer)
            inner = outer

        line_break = BeautifulSoup(features="lxml").new_tag("br")
        table.insert_after(line_break)
        return table

    def process_tag_using_table(tag_to_wrap):
        """Function wraps one tag, keeps its id on a span and unwraps the tag"""
        tag_attrs = tag_to_wrap.attrs
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=tag_to_wrap,
            width=tag_attrs.get("width") or "100",
            border=tag_attrs.get("border") or None,
            bg_color=tag_attrs.get("bgcolor") or None)
        _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
        tag_to_wrap.unwrap()

    for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if isinstance(attrs, tuple):
            attr, val = attrs[0], attrs[1]
            candidates = chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")})
        else:
            # lazy filter: each tag is checked right before it is processed
            candidates = (
                found for found in chapter_tag.find_all(tags_to_wrap)
                if any(attr_name in attrs for attr_name in found.attrs))
        for tag_to_wrap in candidates:
            process_tag_using_table(tag_to_wrap)
def _tags_to_correspond_livecarta_tag(chapter_tag):
"""Function to replace all tags to correspond livecarta tags"""
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
for key in reg_keys:
if isinstance(key, tuple):
replace = key[0]
parent, child = key[1], key[2]
for parent_tag in chapter_tag.select(parent):
if replace == "parent":
parent_tag.name = to_replace_value
elif replace == "child":
for child_tag in parent_tag.select(child):
child_tag.name = to_replace_value
if not child_tag.attrs.get("style"):
child_tag.attrs["style"] =\
"font-size: 14px; font-family: courier new,courier,monospace;"
else:
tags = chapter_tag.find_all(re.compile(key))
for tag in tags:
# todo can cause appearance of \n
,
, ,