epub converter: refactor epub_converter.py

2021-09-13 21:06:01 +03:00
parent 73fa84bf76
commit 3ba7ea6bf4
1 changed files with 37 additions and 23 deletions
--- a/src/epub_converter.py
+++ b/src/epub_converter.py
@@ -28,10 +28,28 @@ class EpubConverter:
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file)
+
+        self.href2soup_html: Dict[str, BeautifulSoup] = {}  # main container for all epub .xhtml files
+        self.href2subchapter_ids = defaultdict(list)  # enumerate all subchapter ids for each file
+        self.added_to_toc_hrefs = set()  # enumerate all file paths that were added to TOC
+
+        # toc tree structure stored as an adjacency list (NavPoint -> list of NavPoints)
+        # key = -1 for top level NavPoints
+        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
+
+        # container for all chapters soup objects
+        # here soup object is only part of the .xhtml file
+        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
+
        self.internal_anchors = set()
+        self.id_anchor_exist_in_nav_points = False  # flag to be updated while ebooklib.toc is parsed
+        self.href2img_bytes = {}  # file path to bytes
+        self.old_image_path2_aws_path = {}  # file path from <a> to generated aws path
+        self.footnotes_contents: List[str] = []  # to be sent to the server as is
+        self.noterefs: List[Tag] = []  # start of the footnote
+        self.footnotes: List[Tag] = []  # end of the footnote
+
        self.logger.log('Image processing.')
-        self.href2img_bytes = {}
-        self.old_image_path2_aws_path = {}
        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
@@ -39,8 +57,7 @@ class EpubConverter:
            self.href2img_bytes[file_name] = content

        self.logger.log('HTML files reading.')
-        self.id_anchor_exist_in_nav_points = False
-        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
+        self.href2soup_html = self.build_href2soup_content()

        self.logger.log('CSS files processing.')
        self.css_href2content, self.html_href2css_href = self.build_css_content()
@@ -48,9 +65,6 @@ class EpubConverter:
        self.add_css_styles2soup()

        self.logger.log('Footnotes processing.')
-        self.footnotes_contents: List[str] = []
-        self.noterefs = []
-        self.footnotes: List[Tag] = []
        for href in self.href2soup_html:
            content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
                                                                     self.href2soup_html)
@@ -65,19 +79,18 @@ class EpubConverter:

        self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
        self.logger.log('TOC processing.')
-        self.href2subchapter_ids = defaultdict(list)
-        self.added_to_toc_hrefs = set()
-        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}  # nav_point2nav_points
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
-        if not self.is_toc_valid():
+        if self.is_toc_empty():
            self.build_adjacency_list_from_spine()
        not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
        self.logger.log(f'Html documents not added to TOC: {not_added}.')
        self.add_not_added_files_to_adjacency_list(not_added)
+        self.logger.log(f'Html internal links and structure processing.')
+        self.label_chapters_ids_with_tmp_id()
        self.process_html_soup_structure_to_line()  # used only after parsed toc, ids from toc needed
        self.process_internal_links()
-        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
+        self.logger.log(f'Building chapters content.')
        self.define_chapters_content()

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
@@ -142,7 +155,7 @@ class EpubConverter:

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """
-        self.adjacency_list builds based on TOC nested structure
+        self.adjacency_list is built from the nested TOC structure taken from self.ebooklib_book.toc

        key = -1 if root, value = None if leaf

@@ -186,10 +199,10 @@ class EpubConverter:
        else:
            assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'

-    def is_toc_valid(self):
+    def is_toc_empty(self):
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
-            return False
-        return True
+            return True
+        return False

    def build_adjacency_list_from_spine(self):
        manifest_id2href = self.build_manifest_id2href()
@@ -207,8 +220,7 @@ class EpubConverter:
            self.adjacency_list[-1].append(nav_point)
            self.added_to_toc_hrefs.add(file)

-    def process_html_soup_structure_to_line(self):
-        # mark
+    def label_chapters_ids_with_tmp_id(self):
        for href in self.href2soup_html:
            ids = self.href2subchapter_ids[href]
            for i in ids:
@@ -219,6 +231,7 @@ class EpubConverter:
                new_h.attrs['id'] = i
                tag.insert_before(new_h)

+    def process_html_soup_structure_to_line(self):
        # go to line structure
        for href in self.href2soup_html:
            soup = self.href2soup_html[href]
@@ -236,7 +249,7 @@ class EpubConverter:
        new_anchor_span.string = "\xa0"
        return new_anchor_span

-    def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
+    def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
        """
        TOC: a/b/c.xhtml

@@ -285,7 +298,7 @@ class EpubConverter:
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
                a_tag_href = internal_link_tag.attrs['href']
                # find full path
-                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
+                a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
                if not a_tag_href_matched_to_toc:
                    continue
                new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
@@ -306,7 +319,8 @@ class EpubConverter:
                a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
                # find full path
                if a_tag_href:
-                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
+                    a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
+                                                                                  internal_link_tag)
                else:
                    a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
                if not a_tag_href_matched_to_toc:
@@ -367,9 +381,9 @@ class EpubConverter:
                self.build_one_chapter(sub_node)

    def define_chapters_content(self):
-        nav_points = self.adjacency_list[-1]
+        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
-            for point in nav_points:
+            for point in top_level_nav_points:
                self.build_one_chapter(point)

    def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: