Function annotations

2022-04-29 17:44:07 +03:00
parent 8de1d0d042
commit 37533e9b67
5 changed files with 187 additions and 130 deletions
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -9,7 +9,7 @@ from src.access import Access
 from src.livecarta_config import LiveCartaConfig


-def save_image_locally(img_file_path, img_content, book_id):
+def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images locally"""
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
@@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id):
    return new_img_path


-def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
+def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images to Amazon web service"""
    link_path = access.send_image(
        img_file_path, doc_id=book_id, img_content=img_content)
    return link_path


-def update_images_src_links(body_tag: Tag,
+def update_images_src_links(body_tag: BeautifulSoup,
                            href2img_content: dict,
-                            path_to_html,
+                            path_to_html: str,
                            access=None,
-                            path2aws_path=None,
-                            book_id=None):
+                            path2aws_path: dict = None,
+                            book_id: str = None) -> dict:
    """Function makes dictionary image_src_path -> Amazon web service_path"""
    img_tags = body_tag.find_all('img')

@@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup):
            table.attrs['border'] = '1'


-def process_lists(body_tag):
+def process_lists(body_tag: BeautifulSoup):
    """
-    Function to process tags <li>.
-    Unwrap <p> tags.
-    """
-    li_tags = body_tag.find_all("li")
+    Function
+    - process tags <li>.
+    - unwrap <p> tags.
+    Parameters
+    ----------
+    body_tag: Tag, soup object

+    Returns
+    -------
+    None
+
+    """
+
+    li_tags = body_tag.find_all("li")
    for li_tag in li_tags:
        if li_tag.p:
            li_tag.attrs.update(li_tag.p.attrs)
@@ -113,7 +122,7 @@ def process_lists(body_tag):


 def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
-    """Function inserts span before tag to be removed(aren't supported by livecarta)"""
+    """Function inserts a span before a tag that is not supported by livecarta"""
    new_tag = main_tag.new_tag("span")
    new_tag.attrs['id'] = id_ or ''
    new_tag.attrs['class'] = class_ or ''
@@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    tag.insert_before(new_tag)


-def clean_headings_content(content: Tag, title: str):
-    def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
+def clean_headings_content(content: BeautifulSoup, title: str):
+    def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
@@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note

    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
+
   """
    footnotes = []
    noterefs_tags = source_html_tag.find_all(
@@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
    return footnotes, new_noterefs_tags, new_footnotes_tags


-def unwrap_structural_tags(body_tag):
-    """Main function that works with structure of html. Make changes inplace.
+def unwrap_structural_tags(body_tag: BeautifulSoup):
+    """
+    Main function that works with structure of html. Make changes inplace.
+    Parameters
+    ----------
+    body_tag: Tag, soup object

+    Steps
+    ----------
    1. Extracts tags that are not needed
-
    2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
    Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
    This tag must have a body_tag as a parent.
    Otherwise, it is wrapped with some tags. Like:
        <p> <span id='123', class='converter-chapter-mark'> </span> </p>
-
    3. Headings that are not supported by livecarta converts to <p>
    4. Wrapping NavigableString
-    :param body_tag: Tag, soup object
-    :return: None
+
+    Returns
+    -------
+    None
+
    """

    def preserve_class_in_aside_tag(tag_):
@@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag):
            if not tag_.parent.attrs.get('class'):
                tag_.parent.attrs['class'] = tag_class

-    def preserve_class_in_section_tag(tag_) -> bool:
+    def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
        """
-        to save css style inherited from class, copy class to child <p>
+        Function saves css style inherited from class, copies class to child <p>
        returns True, if <section> could be unwrapped
+        Parameters
+        ----------
+        tag_: Tag, soup object
+
+        Returns
+        -------
+        bool: True if <section> could be unwrapped
+
        """
        # this is for Wiley books with boxes
        tag_class = tag_.attrs['class'] if not isinstance(
@@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag):
                                              class_=tag_to_be_removed.attrs.get('class'))

    def replace_div_tag_with_table():
-        """Function replace <div> with <table>:
+        """
+        Function replaces <div> with <table>:
        1. Convert div with certain classes to tables
        2. Add background color to div with background-color
+
        """
        for div in body_tag.find_all("div"):
            if div.attrs.get('class'):
@@ -431,22 +458,22 @@ def unwrap_structural_tags(body_tag):
    return body_tag


-def get_tags_between_chapter_marks(first_id, href, html_soup):
+def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted

    Parameters
    ----------
-    first_id :
+    first_id:
        Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
-    href :
+    href:
        Name of current chapter's file
-    html_soup :
+    html_soup: Tag, soup object
        Soup object of current  file

    Returns
    -------
-    tags : list [Tag, NavigableString]
+    tags: list [Tag, NavigableString]
        Chapter's tags

    """
@@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str:
    return text


-def wrap_preformatted_span_with_table(main_tag, old_tag):
+def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
    """Function wraps <span> with <table>"""
-    table = main_tag.new_tag("table")
-    table.attrs['border'] = '1px #ccc;'
-    table.attrs['style'] = 'width:100%;'
-    tbody = main_tag.new_tag("tbody")
-    tr = main_tag.new_tag("tr")
-    td = main_tag.new_tag("td")
+    table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
+        "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
+    table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;'
    td.attrs['bgcolor'] = '#f5f5f5'
    # td.attrs['border-radius'] = '4px'
-    old_tag.wrap(td)
+    span_tag.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    return table


-def preprocess_pre_tags(chapter_tag):
-    """Function preprocessing <pre> tags
+def preprocess_pre_tags(chapter_tag: BeautifulSoup):
+    """
+    Function preprocessing <pre> tags
    Parameters
    ----------
-    chapter_tag: BeautifulSoup
+    chapter_tag: Tag, soup object

    Steps
    ----------
-    1. cleaning \n
-    2. heading removal
-    3. processing tags
-    4. class removal
-    """
+    1. Process NavigableString
+    2. Process Tags and their children

+    """
    for pre in chapter_tag.find_all("pre"):
        new_tag = BeautifulSoup(features='lxml').new_tag("span")
        new_tag.attrs = pre.attrs.copy()
@@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
                                 "font-size: 14px; white-space: nowrap;"
        pre.replace_with(new_tag)
        table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
+        # add a <p> holding a non-breaking space after the table to preserve the line break
        p_for_br = chapter_tag.new_tag("p")
        p_for_br.string = "\xa0"
        table.insert_after(p_for_br)


-def preprocess_code_tags(chapter_tag: Tag):
-    """Function that
-     - transform <code>, <kdb>, <var> tags into span
-     - add code style to this tags
+def preprocess_code_tags(chapter_tag: BeautifulSoup):
    """
+    Function
+    - transforms <code>, <kbd>, <var> tags into <span>
+    - adds code style to these tags
+    Parameters
+    ----------
+    chapter_tag: Tag, soup object

+    Returns
+    -------
+    None
+
+    """
    for code in chapter_tag.find_all(re.compile("code|kbd|var")):
        code.name = "span"
        if code.parent.name == "pre":
@@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag):
            code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'


-
 def prepare_title(title_of_chapter: str) -> str:
    """Function finalise processing/cleaning title"""
    title_str = BeautifulSoup(title_of_chapter, features='lxml').string
@@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str:


 def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
-    """Function finalise processing/cleaning content
+    """
+    Function finalise processing/cleaning content
    Parameters
    ----------
    title_str: str

-    content_tag: BeautifulSoup
+    content_tag: Tag, soup object

    remove_title_from_chapter: bool

    Steps
    ----------
-    1. cleaning \n
+    1. find \n
    2. heading removal
    3. processing tags
    4. class removal
@@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
    -------
    content_tag: str
        prepared content
-    """

-    # 0. cleaning \n
+    """
+    # 1. find \n
    to_remove = []
    for child in content_tag.contents:
        if isinstance(child, NavigableString):
@@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
            if s == '':
                to_remove.append(child)

-    # 1. heading removal
+    # 2. heading removal
    if remove_title_from_chapter:
        clean_headings_content(content_tag, title_str)

-    # 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
+    # 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
    process_lists(content_tag)
    preprocess_table(content_tag)
    preprocess_code_tags(content_tag)
    preprocess_pre_tags(content_tag)
    preprocess_block_tags(content_tag)

-    # 3. class removal
+    # 4. class removal
    for tag in content_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                'footnote-element']):