Formatting

2022-06-01 16:23:53 +03:00
parent 5039417a0f
commit c0ef0b6d6e
13 changed files with 318 additions and 185 deletions
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -71,7 +71,7 @@ def update_images_src_links(body_tag: BeautifulSoup,
    return path2aws_path


-def preprocess_table(body_tag: BeautifulSoup):
+def _preprocess_table(body_tag: BeautifulSoup):
    """Function to preprocess tables and tags(td|th|tr): style"""
    tables = body_tag.find_all("table")
    for table in tables:
@@ -99,7 +99,7 @@ def preprocess_table(body_tag: BeautifulSoup):
            table.attrs['border'] = '1'


-def process_lists(body_tag: BeautifulSoup):
+def _process_lists(body_tag: BeautifulSoup):
    """
    Function
    - process tags <li>.
@@ -121,7 +121,7 @@ def process_lists(body_tag: BeautifulSoup):
            li_tag.p.unwrap()


-def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
+def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Function inserts a <span> before a tag that isn't supported by livecarta"""
    new_tag = main_tag.new_tag("span")
    new_tag.attrs['id'] = id_ or ''
@@ -130,21 +130,21 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    tag.insert_before(new_tag)


-def clean_headings_content(content: BeautifulSoup, title: str):
+def _clean_headings_content(content: BeautifulSoup, title: str):
    def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
        if tag_to_be_removed.attrs.get('id'):
-            insert_span_with_attrs_before_tag(body_tag,
-                                              tag_to_be_removed,
-                                              id_=tag_to_be_removed.attrs.get(
-                                                  'id'),
-                                              class_=tag_to_be_removed.attrs.get('class'))
+            _insert_span_with_attrs_before_tag(body_tag,
+                                               tag_to_be_removed,
+                                               id_=tag_to_be_removed.attrs.get(
+                                                   'id'),
+                                               class_=tag_to_be_removed.attrs.get('class'))

        for sub_tag in tag_to_be_removed.find_all():
            if sub_tag.attrs.get('id'):
-                insert_span_with_attrs_before_tag(body_tag,
-                                                  tag_to_be_removed,
-                                                  id_=sub_tag.attrs['id'],
-                                                  class_=sub_tag.attrs.get('class'))
+                _insert_span_with_attrs_before_tag(body_tag,
+                                                   tag_to_be_removed,
+                                                   id_=sub_tag.attrs['id'],
+                                                   class_=sub_tag.attrs.get('class'))

    title = title.lower()
    for child in content.contents:
@@ -165,7 +165,7 @@ def clean_headings_content(content: BeautifulSoup, title: str):
            break


-def heading_tag_to_p_tag(body_tag):
+def _heading_tag_to_p_tag(body_tag):
    """Function to convert all lower level headings to p tags"""
    pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
    header_tags = body_tag.find_all(re.compile(pattern))
@@ -173,7 +173,7 @@ def heading_tag_to_p_tag(body_tag):
        tag.name = 'p'


-def clean_title_from_numbering(title: str):
+def _clean_title_from_numbering(title: str):
    """Function removes numbering from titles"""
    title = re.sub(r'^(\s+)+', '', title)
    # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
@@ -182,7 +182,7 @@ def clean_title_from_numbering(title: str):
    return title


-def replace_with_livecarta_anchor_tag(anchor, i):
+def _replace_with_livecarta_anchor_tag(anchor, i):
    """Function replaces noteref_tag (anchor) with a new livecarta tag"""
    new_tag = BeautifulSoup(features='lxml').new_tag('sup')
    new_tag['class'] = 'footnote-element'
@@ -257,7 +257,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
        if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
            footnote_tag = footnote_tag.parent
        new_noterefs_tags.append(
-            replace_with_livecarta_anchor_tag(noteref_tag, i))
+            _replace_with_livecarta_anchor_tag(noteref_tag, i))
        content = footnote_tag.text
        # footnote_tag.decompose()
        footnotes.append(content)
@@ -292,7 +292,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):

    """

-    def preserve_class_in_aside_tag(tag_):
+    def _preserve_class_in_aside_tag(tag_):
        """to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
        # this is for Wiley books with boxes
        tag_class = tag_.attrs['class'] if not isinstance(
@@ -301,7 +301,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
            if not tag_.parent.attrs.get('class'):
                tag_.parent.attrs['class'] = tag_class

-    def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
+    def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
        """
        Function saves css style inherited from class, copies class to child <p>
        returns True, if <section> could be unwrapped
@@ -332,13 +332,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
        else:
            return True

-    def add_span_to_save_ids_for_links(tag_to_be_removed):
+    def _add_span_to_save_ids_for_links(tag_to_be_removed):
        if tag_to_be_removed.attrs.get('id'):
-            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
-                                              id_=tag_to_be_removed.attrs['id'],
-                                              class_=tag_to_be_removed.attrs.get('class'))
+            _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
+                                               id_=tag_to_be_removed.attrs['id'],
+                                               class_=tag_to_be_removed.attrs.get('class'))

-    def replace_div_tag_with_table():
+    def _replace_div_tag_with_table():
        """
        Function replaces <div> with <table>:
        1. Convert div with certain classes to tables
@@ -350,11 +350,11 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
                div_class = div.attrs['class'] if not isinstance(
                    div.attrs['class'], list) else div.attrs['class'][0]
                if div_class in ['C409', 'C409a']:
-                    wrap_block_tag_with_table(
+                    _wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')

                elif div_class in ['C441', 'C816']:
-                    wrap_block_tag_with_table(
+                    _wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')

            if div.attrs.get('style'):
@@ -363,7 +363,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
                        'background-color') + len('background-color')
                    start_index_of_color = end_index + 2
                    bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
-                    wrap_block_tag_with_table(
+                    _wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
            elif div.attrs.get('style') == '':
                del div.attrs['style']
@@ -379,7 +379,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
                if all(is_not_struct_tag):
                    div.name = 'p'
                    continue
-            add_span_to_save_ids_for_links(div)
+            _add_span_to_save_ids_for_links(div)
            div.unwrap()

    # comments removal
@@ -387,18 +387,18 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
        for element in tag(text=lambda text: isinstance(text, Comment)):
            element.extract()

-    replace_div_tag_with_table()
+    _replace_div_tag_with_table()

    for s in body_tag.find_all("section"):
        could_be_unwrapped = True
        if s.attrs.get('class'):
-            could_be_unwrapped = preserve_class_in_section_tag(s)
-        add_span_to_save_ids_for_links(s)
+            could_be_unwrapped = _preserve_class_in_section_tag(s)
+        _add_span_to_save_ids_for_links(s)
        if could_be_unwrapped:
            s.unwrap()

    for s in body_tag.find_all("article"):
-        add_span_to_save_ids_for_links(s)
+        _add_span_to_save_ids_for_links(s)
        s.unwrap()

    for s in body_tag.find_all("figure"):
@@ -407,22 +407,22 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
        s.attrs['style'] = "text-align: center;"

    for s in body_tag.find_all("figcaption"):
-        add_span_to_save_ids_for_links(s)
+        _add_span_to_save_ids_for_links(s)
        s.unwrap()

    for s in body_tag.find_all("aside"):
        s.name = 'blockquote'

    for s in body_tag.find_all("main"):
-        add_span_to_save_ids_for_links(s)
+        _add_span_to_save_ids_for_links(s)
        s.unwrap()

    for s in body_tag.find_all("body"):
-        add_span_to_save_ids_for_links(s)
+        _add_span_to_save_ids_for_links(s)
        s.unwrap()

    for s in body_tag.find_all("html"):
-        add_span_to_save_ids_for_links(s)
+        _add_span_to_save_ids_for_links(s)
        s.unwrap()

    for s in body_tag.find_all("header"):
@@ -442,7 +442,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
    assert all(
        parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'

-    heading_tag_to_p_tag(body_tag)
+    _heading_tag_to_p_tag(body_tag)

    # wrap NavigableString with <p>
    for node in body_tag:
@@ -500,7 +500,7 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
    return tags


-def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
+def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """Function wraps <block> with <table>"""
    table = main_tag.new_tag("table")
    table.attrs['border'] = border
@@ -520,7 +520,7 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co
    return table


-def clean_wiley_block(block):
+def _clean_wiley_block(block):
    hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
    for hr in hrs:
        hr.extract()
@@ -530,30 +530,30 @@ def clean_wiley_block(block):
        h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))


-def preprocess_block_tags(chapter_tag):
+def _preprocess_block_tags(chapter_tag):
    """Function to preprocess block tags: <blockquote> tags and <p> tags with a feature1-feature4 class"""
    for block in chapter_tag.find_all("blockquote"):
        if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
-            clean_wiley_block(block)
+            _clean_wiley_block(block)

            color = '#DDDDDD' if block.attrs.get(
                'class') == 'feature1' else None
            color = '#EEEEEE' if block.attrs.get(
                'class') == 'feature2' else color
-            wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
+            _wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
            block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
            block.unwrap()

    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
-        clean_wiley_block(future_block)
+        _clean_wiley_block(future_block)
        color = '#DDDDDD' if future_block.attrs.get(
            'class') == 'feature1' else None
        color = '#EEEEEE' if future_block.attrs.get(
            'class') == 'feature2' else color
-        wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
+        _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)


-def prepare_formatted(text: str) -> str:
+def _prepare_formatted(text: str) -> str:
    """Function replaces special symbols with their Unicode representation"""
    text = text.replace("<", "\x3C")
    text = text.replace(">", "\x3E")
@@ -563,7 +563,7 @@ def prepare_formatted(text: str) -> str:
    return text


-def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
+def _wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
    """Function wraps <span> with <table>"""
    table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
        "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
@@ -577,7 +577,7 @@ def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
    return table


-def preprocess_pre_tags(chapter_tag: BeautifulSoup):
+def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
    """
    Function preprocessing <pre> tags
    Parameters
@@ -601,7 +601,7 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
        for child in copy_contents:
            # Navigable String
            if isinstance(child, NavigableString):
-                cleaned_text = prepare_formatted(str(child))
+                cleaned_text = _prepare_formatted(str(child))
                sub_strings = re.split('\r\n|\n|\r', cleaned_text)
                for string in sub_strings[:-1]:
                    new_tag.append(NavigableString(string))
@@ -612,24 +612,24 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
            else:
                for sub_child in child.children:
                    if isinstance(sub_child, NavigableString):
-                        cleaned_text = prepare_formatted(str(sub_child))
+                        cleaned_text = _prepare_formatted(str(sub_child))
                        sub_child.replace_with(NavigableString(cleaned_text))
                    else:
-                        sub_child.string = prepare_formatted(sub_child.text)
+                        sub_child.string = _prepare_formatted(sub_child.text)
                cleaned_tag = child.extract()
                new_tag.append(cleaned_tag)
                if to_add_br:
                    new_tag.append(BeautifulSoup(
                        features='lxml').new_tag('br'))
        pre.replace_with(new_tag)
-        table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
+        table = _wrap_preformatted_span_with_table(chapter_tag, new_tag)
        # add <p> to save brs
        p_for_br = chapter_tag.new_tag("p")
        p_for_br.string = "\xa0"
        table.insert_after(p_for_br)


-def preprocess_code_tags(chapter_tag: BeautifulSoup):
+def _preprocess_code_tags(chapter_tag: BeautifulSoup):
    """
    Function
    - transform <code>, <kdb>, <var> tags into span
@@ -658,7 +658,7 @@ def prepare_title(title_of_chapter: str) -> str:
    title_str = BeautifulSoup(title_of_chapter, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
-    title_str = clean_title_from_numbering(title_str)
+    title_str = _clean_title_from_numbering(title_str)
    return title_str


@@ -696,18 +696,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro

    # 2. heading removal
    if remove_title_from_chapter:
-        clean_headings_content(content_tag, title_str)
+        _clean_headings_content(content_tag, title_str)

    # 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
-    process_lists(content_tag)
-    preprocess_table(content_tag)
-    preprocess_code_tags(content_tag)
-    preprocess_pre_tags(content_tag)
-    preprocess_block_tags(content_tag)
+    _process_lists(content_tag)
+    _preprocess_table(content_tag)
+    _preprocess_code_tags(content_tag)
+    _preprocess_pre_tags(content_tag)
+    _preprocess_block_tags(content_tag)

    # 4. class removal
    for tag in content_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                'footnote-element']):
            del tag.attrs['class']
-    return str(content_tag)
+    return str(content_tag)