Function annotations

2022-04-29 17:44:07 +03:00
parent 8de1d0d042
commit 37533e9b67
5 changed files with 187 additions and 130 deletions
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -1,7 +1,6 @@
 import re
 import json
 import codecs
-import logging
 import os
 from os.path import dirname, normpath, join
 from itertools import chain
@@ -51,7 +50,8 @@ class EpubConverter:
        # flag to be updated while ebooklib.toc is parsed
        self.id_anchor_exist_in_nav_points = False
        self.img_href2img_bytes = {}  # file path to bytes
-        self.book_image_src_path2aws_path = {}  # file path from <a> to generated aws path
+        # file path from <a> to generated aws path
+        self.book_image_src_path2aws_path = {}
        self.footnotes_contents: List[str] = []  # to be sent on server as is
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote
@@ -116,7 +116,6 @@ class EpubConverter:
        return nodes

    def get_css_content(self, css_href, html_href):
-
        path_to_css_from_html = css_href
        html_folder = dirname(html_href)
        path_to_css_from_root = normpath(
@@ -132,8 +131,8 @@ class EpubConverter:
        The first is css_href2css_content. It is created to connect href of css to content of css
        The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
        ...2... = key2value
-        """

+        """
        # dictionary: href of html to related css files
        html_href2css_href: defaultdict = defaultdict(list)
        css_href2css_content: dict = {}
@@ -165,6 +164,7 @@ class EpubConverter:
        """
        This function is designed to update html_href2html_body_soup
        And add to html_inline_style css_style_content
+
        """
        for html_href in self.html_href2html_body_soup:
            if self.html_href2css_href.get(html_href):
@@ -191,8 +191,8 @@ class EpubConverter:

        :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
        :param lvl: level of depth
-        """

+        """
        if isinstance(element, Link):
            nav_point = NavPoint(element)
            if nav_point.id:
@@ -215,7 +215,8 @@ class EpubConverter:
            sub_nodes = []
            for elem in second:
                if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
-                    self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl))
+                    self.offset_sub_nodes.append(
+                        self.build_adjacency_list_from_toc(elem, lvl))
                else:
                    sub_nodes.append(
                        self.build_adjacency_list_from_toc(elem, lvl + 1))
@@ -239,8 +240,8 @@ class EpubConverter:
        else:
            assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'

-    def is_toc_empty(self):
-        """ Function checks is toc empty """
+    def is_toc_empty(self) -> bool:
+        """Function checks whether toc is empty"""
        # there is no toc in ebook or no top chapters
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return True
@@ -258,7 +259,7 @@ class EpubConverter:
            self.hrefs_added_to_toc.add(nav_point.href)

    def add_not_added_files_to_adjacency_list(self, not_added):
-        """ Function add files that not added to adjacency list """
+        """Function adds files that were not added to adjacency list"""
        for i, file in enumerate(not_added):
            nav_point = NavPoint(
                Section(f'To check #{i}, filename: {file}', file))
@@ -295,19 +296,26 @@ class EpubConverter:
        new_anchor_span.string = "\xa0"
        return new_anchor_span

-    def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
+    def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
        """
+        Function used to find full path to file that is parsed from tag link
        TOC: a/b/c.xhtml
-
        b/c.xhtml -> a/b/c.xhtml
        c.xhtml -> a/b/c.xhtml
+        Parameters
+        ----------
+        cur_file_path: str
+            path to current file with tag link
+        href_in_link: str
+            filename got from tag link, like file1.xhtml
+        internal_link_tag: Tag
+            tag object that is parsed now

-        Used to find full path to file that is parsed from tag link
+        Returns
+        -------
+        full_path[0]: str
+            full path to the file that is parsed from tag link

-        :param cur_file_path: path to current file with tag link
-        :param href_in_link: filename got from tag link, like file1.xhtml
-        :param internal_link_tag: tag object that is parsed now
-        :return:
        """
        dir_name = os.path.dirname(cur_file_path)
        normed_path = os.path.normpath(os.path.join(
@@ -331,6 +339,12 @@ class EpubConverter:
        Function
        - processing internal links in a book
        - make ids unique
+        Steps
+        ----------
+        1. rebuild ids to be unique in all documents
+        2a. process anchor which is a whole xhtml file
+        2b. process anchor which is an element in xhtml file
+
        """
        # 1. rebuild ids to be unique in all documents
        for toc_href in self.hrefs_added_to_toc:
@@ -344,7 +358,7 @@ class EpubConverter:
                new_id = self.create_unique_id(toc_href, tag.attrs['id'])
                tag.attrs['id'] = new_id

-        # 2.a) process anchor which is a whole xhtml file
+        # 2a. process anchor which is a whole xhtml file
        internal_link_reg1 = re.compile(
            r'(^(?!https?://).+\.(htm|html|xhtml)$)')
        for toc_href in self.hrefs_added_to_toc:
@@ -367,7 +381,7 @@ class EpubConverter:

                del internal_link_tag.attrs['href']

-        # 2.b) process anchor which is an element in xhtml file
+        # 2b. process anchor which is an element in xhtml file
        internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
@@ -418,9 +432,9 @@ class EpubConverter:
                                    f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
                                    f' Old id={a_tag_id}')

-    def build_one_chapter(self, nav_point):
+    def build_one_chapter(self, nav_point: NavPoint):
        """
-        Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
+        Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)

        3 cases:
            id wraps all chapter content,
@@ -429,7 +443,13 @@ class EpubConverter:

        In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
        and id of the next chapter/subchapter
+        Parameters
+        ----------
+        nav_point: NavPoint

+        Returns
+        -------
+        None
        """
        if nav_point.id:
            soup = self.html_href2html_body_soup[nav_point.href]
@@ -446,7 +466,7 @@ class EpubConverter:
                self.build_one_chapter(sub_node)

    def define_chapters_content(self):
-        """ Function build chapters content starts from top level chapters """
+        """Function builds chapters content, starting from top level chapters"""
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in top_level_nav_points:
@@ -483,8 +503,8 @@ class EpubConverter:
            self.logger.log(f'{indent}Chapter: {title} is prepared.')
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

-    def convert_to_dict(self):
-        """ Function which convert list of html nodes to appropriate json structure. """
+    def convert_to_dict(self) -> dict:
+        """Function which converts list of html nodes to appropriate json structure"""
        top_level_nav_points = self.adjacency_list[-1]
        top_level_chapters = []

@@ -502,7 +522,7 @@ class EpubConverter:


 if __name__ == "__main__":
-    filename = '9781641051217'
+    filename = '9781614382264'
    logger_object = BookLogger(name='epub', book_id=filename)

    json_converter = EpubConverter(f'../../epub/{filename}.epub',