import re

from bs4 import BeautifulSoup, NavigableString, Tag, Comment

from src.livecarta_config import LiveCartaConfig

# Class carried by the tags that delimit chapters inside a source file.
_CHAPTER_MARK_CLASS = "converter-chapter-mark"

# Any whitespace character, including the non-breaking space.
_WHITESPACE_RE = re.compile(r"[\s\xa0]")

# "width: <number>pt|px" inside a style attribute. The lookbehind rejects
# hyphenated properties such as "max-width" while still matching a
# declaration that is the very first thing in the attribute.
_STYLE_WIDTH_RE = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")


def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
    """
    Function adds span with id from tag_to_be_removed
    because this tag will be removed(unwrapped/extract)

    Parameters
    ----------
    tag_to_be_removed: Soup object
        Node that is about to be unwrapped/extracted. Text nodes are accepted
        and ignored because they carry no attributes.
    chapter_tag: BeautifulSoup
        Object used to create the span. A plain Tag is accepted as well.

    Returns
    -------
    None
        updated body tag

    """
    # Text nodes have no attrs, so there is no id that links could target.
    if not isinstance(tag_to_be_removed, Tag):
        return

    id_ = tag_to_be_removed.attrs.get("id")
    if not id_:
        return

    # new_tag() exists on BeautifulSoup only; on a plain Tag the attribute
    # access is treated as a child lookup and yields None.
    if isinstance(chapter_tag, BeautifulSoup):
        factory = chapter_tag
    else:
        factory = BeautifulSoup(features="lxml")

    span = factory.new_tag("span")
    span.attrs["id"] = id_
    span.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
    # A non-breaking space is used as the span content.
    span.string = "\xa0"
    tag_to_be_removed.insert_before(span)


def _is_chapter_mark(node) -> bool:
    """Tell whether node is a tag carrying the chapter mark class."""
    if isinstance(node, NavigableString):
        return False
    classes = node.attrs.get("class")
    # The class may be stored as a plain string (assigned by hand) or as a
    # list (produced by an HTML parser); accept both forms.
    if isinstance(classes, str):
        classes = classes.split()
    return _CHAPTER_MARK_CLASS in (classes or ())


def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts.
        A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags

    Raises
    ------
    AssertionError
        When no chapter mark with first_id exists in html_soup.

    """
    chapter_mark = html_soup.find(
        attrs={"id": first_id, "class": _CHAPTER_MARK_CLASS})
    if chapter_mark is None:
        # Raised explicitly: an assert statement is dropped under "python -O".
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    next_tag = chapter_mark.next_sibling
    # Compare with None: an empty text node is falsy but must not end the walk.
    while next_tag is not None:
        if _is_chapter_mark(next_tag):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags


def prepare_title(title_of_chapter: str) -> str:
    """Function finalise processing/cleaning title"""
    title_soup = BeautifulSoup(title_of_chapter, features="lxml")
    title_str = title_soup.string
    if title_str is None:
        # .string is None for an empty title or one with several child nodes;
        # fall back to the concatenated text instead of failing in re.sub.
        title_str = title_soup.get_text()
    # clean extra whitespace characters ([\r\n\t\f\v ])
    return _WHITESPACE_RE.sub(" ", title_str).strip()


def _remove_comments(chapter_tag):
    """Function removes every comment node found anywhere in chapter_tag"""
    # A single search over all descendant strings also covers comments that
    # are direct children of chapter_tag.
    for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()


def _wrap_strings_with_p(chapter_tag):
    """Function wraps every non-blank top-level string of chapter_tag with a p tag"""
    # Iterate over a snapshot because nodes are replaced while looping.
    for node in list(chapter_tag.contents):
        if not isinstance(node, NavigableString):
            continue
        if _WHITESPACE_RE.sub(" ", str(node)).strip():
            p_tag = chapter_tag.new_tag("p")
            p_tag.append(str(node))
            node.replace_with(p_tag)


def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps the tags configured in LiveCartaConfig.WRAP_TAGS_WITH_TABLE
    with a single-cell table and then unwraps the original tag
    """

    def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
        """Function builds table > tbody > tr > td around the tag and adds br after the table"""
        table = chapter_tag.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody = chapter_tag.new_tag("tbody")
        tr = chapter_tag.new_tag("tr")
        td = chapter_tag.new_tag("td")
        td.attrs["bgcolor"] = bg_color
        # Wrap from the inside out so that the nesting ends up as
        # table > tbody > tr > td > tag.
        tag_to_be_wrapped.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def process_tag_using_table(tag_to_wrap):
        """Function wraps the tag, keeps its id in a span and unwraps the tag itself"""
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=tag_to_wrap,
            width=tag_to_wrap.attrs.get("width") or "100",
            border=tag_to_wrap.attrs.get("border") or None,
            bg_color=tag_to_wrap.attrs.get("bgcolor") or None)
        _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
        tag_to_wrap.unwrap()

    for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if isinstance(attrs, tuple):
            # (attribute name, value pattern): wrap tags whose attribute matches.
            attr, val = attrs[0], attrs[1]
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
                process_tag_using_table(tag_to_wrap)
        else:
            # Otherwise wrap tags that carry at least one of the listed attributes.
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
                if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
                    process_tag_using_table(tag_to_wrap)


def _tags_to_correspond_livecarta_tag(chapter_tag):
    """Function to replace all tags to correspond livecarta tags"""
    for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
        for key in reg_keys:
            if isinstance(key, tuple):
                # (what to rename, parent selector, child selector)
                replace = key[0]
                parent, child = key[1], key[2]
                for parent_tag in chapter_tag.select(parent):
                    if replace == "parent":
                        parent_tag.name = to_replace_value
                    elif replace == "child":
                        for child_tag in parent_tag.select(child):
                            child_tag.name = to_replace_value
                            if not child_tag.attrs.get("style"):
                                child_tag.attrs["style"] = \
                                    "font-size: 14px; font-family: courier new,courier,monospace;"
            else:
                # A plain key is a regular expression matched against tag names.
                for tag in chapter_tag.find_all(re.compile(key)):
                    # todo: renaming can cause the appearance of extra "\n"
                    # around the renamed tag (the original note mentions
                    # "section"; its markup example was lost - confirm)
                    tag.name = to_replace_value


def _unwrap_tags(chapter_tag):
    """Function unwrap tags and move id to span"""
    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
        for tag in chapter_tag.select(tag_name):
            # if tag is a subtag, its attributes are copied to the parent
            if ">" in tag_name:
                tag.parent.attrs.update(tag.attrs)
            _add_span_to_save_ids_for_links(tag, chapter_tag)
            tag.unwrap()


def _remove_headings_content(content_tag, title_of_chapter: str):
    """
    Function clean/remove headings from chapter in order to avoid duplication of chapter titles in the content
    add span with id in order to keep the id of the removed node

    Parameters
    ----------
    content_tag: soup object
        Tag of the page
    title_of_chapter: str
        Chapter title

    Returns
    -------
    None
        clean/remove headings & add span with id

    """
    title_of_chapter = title_of_chapter.lower()
    for tag in content_tag.contents:
        is_string = isinstance(tag, NavigableString)
        text = tag if is_string else tag.text
        if not _WHITESPACE_RE.sub("", text):
            # Blank node: keep looking at the following siblings.
            continue

        text = _WHITESPACE_RE.sub(" ", text).lower().strip()
        # A text node has no name of its own, the enclosing tag is used then.
        node_name = getattr(tag, "name", None) or content_tag.name
        if title_of_chapter == text or \
                (title_of_chapter in text and re.findall(r"^h[1-3]$", node_name)):
            _add_span_to_save_ids_for_links(tag, content_tag)
            tag.extract()
            return

        if not is_string:
            # Only the first non-blank tag is searched; its siblings are
            # never examined, whatever the nested search finds.
            _remove_headings_content(tag, title_of_chapter)
            return


def _preprocess_table(chapter_tag: BeautifulSoup):
    """Function to preprocess tables and tags(td|th|tr): style"""
    cell_name_re = re.compile("td|th|tr")
    for table in chapter_tag.find_all("table"):
        for t_tag in table.find_all(cell_name_re):
            width = ""
            style = t_tag.get("style")
            if style:
                width_match = _STYLE_WIDTH_RE.search(style)
                if width_match:
                    # The number is kept as is and always labelled "px".
                    width = width_match.group(1) + "px"

            # An explicit width attribute wins over the one found in style.
            t_tag.attrs["width"] = t_tag.get("width") or width

            if t_tag.attrs.get("style"):
                t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                    "border:0;", "")
                # Drop the attribute when nothing but whitespace is left.
                if _WHITESPACE_RE.sub("", t_tag.attrs.get("style")) == "":
                    del t_tag.attrs["style"]

        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"


def _insert_tags_in_parents(chapter_tag):
    """
    Function moves the content of configured parent tags into a new child tag
    when the parent has no descendant matching the configured condition
    """
    # Keys are (parent selector, condition selector), values are the name of
    # the tag to insert.
    for (parent_tag_name, condition), tag_name in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.items():
        for parent_tag in chapter_tag.select(parent_tag_name):
            if parent_tag.select(condition):
                continue
            tag_to_insert = chapter_tag.new_tag(tag_name)
            # move all items of the parent into the new tag, keeping their order
            for content in list(parent_tag.contents):
                tag_to_insert.append(content.extract())
            # the new tag becomes the only child of the parent
            parent_tag.append(tag_to_insert)


def _class_removing(chapter_tag):
    """Function removes class attribute unless it equals a class reserved by converter"""
    # NOTE(review): the comparison keeps a class only when it is stored as
    # exactly one of these strings; a list of classes never compares equal
    # and is always removed - confirm this is the intended rule.
    for tag in chapter_tag.find_all(recursive=True):
        if tag.attrs.get("class") \
                and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
            del tag.attrs["class"]


def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content

    Parameters
    ----------
    title_str: str
        Chapter title, used to find the duplicated heading
    content_tag: Tag, soup object
        Chapter content, modified in place
    remove_title_from_chapter: bool
        Whether the heading repeating the title is removed from the content

    Steps
    ----------
    1. comments removal
    2. wrapping/replacing/unwrapping tags
    3. heading removal
    4. processing tables and parent tags
    5. class removal

    Returns
    -------
    content_tag: str
        prepared content

    """
    # 1. remove comments
    _remove_comments(content_tag)

    # 2. wrap NavigableString with p tag, then wrap, replace and unwrap tags
    _wrap_strings_with_p(content_tag)
    _wrap_tags_with_table(content_tag)
    _tags_to_correspond_livecarta_tag(content_tag)
    _unwrap_tags(content_tag)

    # 3. heading removal
    if remove_title_from_chapter:
        _remove_headings_content(content_tag, title_str)

    # 4. processing tags (tables and the tags listed in
    # LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG)
    _preprocess_table(content_tag)
    _insert_tags_in_parents(content_tag)

    # 5. remove classes that weren't created by converter
    _class_removing(content_tag)
    return str(content_tag)