Wrote documentation for every function/class in the .py files

2021-12-10 10:53:40 +03:00
parent ef3502cd0a
commit 4b1109e6b4
13 changed files with 198 additions and 172 deletions
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -20,7 +20,7 @@ from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
 from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
-    update_src_links_in_images, preprocess_footnotes
+    update_images_src_links, preprocess_footnotes


 class EpubConverter:
@@ -48,7 +48,7 @@ class EpubConverter:
        # flag to be updated while ebooklib.toc is parsed
        self.id_anchor_exist_in_nav_points = False
        self.img_href2img_bytes = {}  # file path to bytes
-        self.old_image_path2aws_path = {}  # file path from <a> to generated aws path
+        self.book_image_src_path2aws_path = {}  # file path from <a> to generated aws path
        self.footnotes_contents: List[str] = []  # to be sent on server as is
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote
@@ -124,12 +124,12 @@ class EpubConverter:
        return css_content

    def build_html_and_css_relations(self):
-        '''
+        """
        This function is designed to get 2 dictionaries:
        The first is css_href2css_content. It is created to connect href of css to content of css
        The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
        ...2... = key2value
-        '''
+        """

        # dictionary: href of html to related css files
        html_href2css_href: defaultdict = defaultdict(list)
@@ -159,10 +159,10 @@ class EpubConverter:
        return html_href2css_href, css_href2css_content,

    def add_css_styles_to_html_soup(self):
-        '''
+        """
        This function is designed to update html_href2html_body_soup
        And add to html_inline_style css_style_content
-        '''
+        """
        for html_href in self.html_href2html_body_soup:
            if self.html_href2css_href.get(html_href):
                css = ''
@@ -179,6 +179,7 @@ class EpubConverter:

        return links

+    # t_nodes = []
    def build_adjacency_list_from_toc(self, element, lvl=0):
        """
        self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
@@ -211,25 +212,31 @@ class EpubConverter:

            sub_nodes = []
            for i in second:
+                # if 'chapter' in (i.title.lower() if isinstance(i, Link) else i[0].title.lower()):
+                #     self.t_nodes.append(self.build_adjacency_list_from_toc(i, lvl))
+                # else:
                sub_nodes.append(
                    self.build_adjacency_list_from_toc(i, lvl + 1))
-
            self.adjacency_list[nav_point] = sub_nodes
            self.hrefs_added_to_toc.add(nav_point.href)
            return nav_point

        elif isinstance(element, list) and (lvl == 0):
-            sub_nodes = []
+            nodes = []
            for i in element:
-                sub_nodes.append(
+                nodes.append(
                    self.build_adjacency_list_from_toc(i, lvl + 1))
-
-            self.adjacency_list[-1] = sub_nodes
+            #     for j in self.t_nodes:
+            #         nodes.append(j)
+            #     self.t_nodes = []
+            #
+            # self.adjacency_list[-1] = nodes

        else:
            assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'

    def is_toc_empty(self):
+        """ Function checks whether the toc is empty """
        # there is no toc in ebook or no top chapters
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return True
@@ -247,6 +254,7 @@ class EpubConverter:
            self.hrefs_added_to_toc.add(nav_point.href)

    def add_not_added_files_to_adjacency_list(self, not_added):
+        """ Function adds files that were not added to the adjacency list """
        for i, file in enumerate(not_added):
            nav_point = NavPoint(
                Section(f'To check #{i}, filename: {file}', file))
@@ -315,6 +323,11 @@ class EpubConverter:
        return full_path[0]

    def process_internal_links(self):
+        """
+        Function
+        - processes internal links in a book
+        - makes ids unique
+        """
        # 1. rebuild ids to be unique in all documents
        for toc_href in self.hrefs_added_to_toc:
            for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
@@ -429,6 +442,7 @@ class EpubConverter:
                self.build_one_chapter(sub_node)

    def define_chapters_content(self):
+        """ Function builds chapters content, starting from top-level chapters """
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in top_level_nav_points:
@@ -441,12 +455,12 @@ class EpubConverter:
                nav_point.href, nav_point.id)]
        else:
            content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
-        self.old_image_path2aws_path = update_src_links_in_images(content,
-                                                                  self.img_href2img_bytes,
-                                                                  path_to_html=nav_point.href,
-                                                                  access=self.access,
-                                                                  path2aws_path=self.old_image_path2aws_path,
-                                                                  book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
+        self.book_image_src_path2aws_path = update_images_src_links(content,
+                                                                    self.img_href2img_bytes,
+                                                                    path_to_html=nav_point.href,
+                                                                    access=self.access,
+                                                                    path2aws_path=self.book_image_src_path2aws_path,
+                                                                    book_id=self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')

        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
        title_preprocessed = prepare_title(title)
@@ -466,6 +480,7 @@ class EpubConverter:
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self):
+        """ Function which converts a list of html nodes to the appropriate json structure. """
        top_level_nav_points = self.adjacency_list[-1]
        top_level_chapters = []

@@ -491,7 +506,7 @@ if __name__ == "__main__":

    logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)

-    json_converter = EpubConverter('../../epub/9781641051217.epub',
+    json_converter = EpubConverter('../../epub/9781614382263.epub',
                                   logger=logger_object)
    tmp = json_converter.convert_to_dict()