From 178896d510b321267de825ecdb9ee5fd6e96edd4 Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Mon, 6 Sep 2021 21:01:09 +0300
Subject: [PATCH] epub converter: prettify epub_postprocessor.py

---
 src/epub_postprocessor.py | 65 +++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py
index 69c27aa..ef9b028 100644
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -26,7 +26,7 @@ class EpubPostprocessor:
         self.file = file
         self.access = access
         self.logger: BookLogger = logger
-        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
+        self.ebooklib_book = epub.read_epub(file)
         self.internal_anchors = set()
         self.logger.log('Image processing.')
         self.href2img_bytes = {}
@@ -43,7 +43,6 @@ class EpubPostprocessor:
 
         self.logger.log('CSS files processing.')
         self.css_href2content, self.html_href2css_href = self.build_css_content()
-        # add css
         self.logger.log('CSS styles adding.')
         self.add_css_styles2soup()
 
@@ -67,7 +66,7 @@ class EpubPostprocessor:
         self.logger.log('TOC processing.')
         self.href2subchapter_ids = defaultdict(list)
         self.added_to_toc_hrefs = set()
-        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}  # k = -1 if root, v = None if leaf
+        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}  # nav_point2nav_points
         self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
         # build simple toc from spine if needed
         if not self.is_toc_valid():
@@ -75,14 +74,10 @@ class EpubPostprocessor:
         not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
         self.logger.log(f'Html documents not added to TOC: {not_added}.')
         self.add_not_added_files_to_adjacency_list(not_added)
-        # read anchored blocks, split html into separate block
-        self.unwrap_all_html_soup()  # used only after parsed toc, ids from toc needed
+        self.process_html_soup_structure_to_line()  # used only after parsed toc, ids from toc needed
         self.process_internal_links()
-        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
-        self.build_anchor2soup()
-
-        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
-        #     self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine
+        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
+        self.define_chapters_content()
 
     def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
         # using EpubElements
@@ -145,7 +140,14 @@ class EpubPostprocessor:
         return links
 
     def build_adjacency_list_from_toc(self, element, lvl=0):
-        # use book.toc as a root
+        """
+        Build self.adjacency_list from the nested TOC structure.
+
+        The key is -1 for the root; the value is None for a leaf.
+
+        :param element: [Link, tuple, list] - element that appears in the TOC (usually parsed from nav.ncx)
+        :param lvl: depth level
+        """
 
         if isinstance(element, Link):
             # todo: check if link exists
@@ -204,7 +206,7 @@ class EpubPostprocessor:
             self.adjacency_list[-1].append(node)
             self.added_to_toc_hrefs.add(file)
 
-    def unwrap_all_html_soup(self):
+    def process_html_soup_structure_to_line(self):
         # mark
         for href in self.href2soup_html:
             ids = self.href2subchapter_ids[href]
@@ -250,7 +252,7 @@ class EpubPostprocessor:
         return full_path[0]
 
     def process_internal_links(self):
-        # rebuild ids to be unique in all documents
+        # 1. rebuild ids to be unique in all documents
         for toc_href in self.added_to_toc_hrefs:
             for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
                 if tag.attrs.get('class') == 'converter-chapter-mark':
@@ -262,8 +264,8 @@ class EpubPostprocessor:
                 new_id = self._create_unique_id(toc_href, tag.attrs['id'])
                 tag.attrs['id'] = new_id
 
-        # ---------------------------------------------------------------------------------
-        internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')  # anchor is a whole xhtml file
+        # 2.a) process anchor which is a whole xhtml file
+        internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
         for toc_href in self.added_to_toc_hrefs:
             soup = self.href2soup_html[toc_href]
             for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
@@ -282,8 +284,7 @@ class EpubPostprocessor:
 
                 del internal_link_tag.attrs['href']
 
-        # ------------------------------------------------------------------------
-        # add placeholder to all internal links
+        # 2.b) process anchor which is an element in xhtml file
         internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
         for toc_href in self.added_to_toc_hrefs:
             soup = self.href2soup_html[toc_href]
@@ -323,22 +324,18 @@ class EpubPostprocessor:
                                     f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
                                     f' Old id={a_tag_id}')
 
-    def build_one_anchored_section(self, node):
+    def build_one_chapter(self, node):
         """
-        к этому моементу html soup уже существует в линейном виде
-        - если не в линейном - то мы не виноваты
+        Update self.href_chapter_id2soup_html, a mapping from (href, id) to the chapter content (html soup object).
 
-        есть 3 случая:
-         id оборачивает весь контент,
-         id оборачивает контент чаптера и под-чаптера,
-         id только указывает на заголовок
+        3 cases:
+            id wraps all chapter content,
+            id wraps chapter's content + subchapters' content
+            id points to the start of title of a chapter
 
-        во всех 3х случаях мы знаем где начало заголовка. Поэтому
-        глава - это все теги от текущего заголовка - до какого угодно следущющего
+        In all cases we know where the chapter starts. Therefore a chapter is all tags between the chapter's id
+        and the id of the next chapter/subchapter.
 
-        заголовок принимается в расчет если в toc есть указание id,тогда заголовок -
-        это любой тег с id из toc
-        :return:
         """
         if node.id:
             soup = self.href2soup_html[node.href]
@@ -346,22 +343,22 @@ class EpubPostprocessor:
             new_tree = BeautifulSoup('', 'html.parser')
             for tag in chapter_tags:
                 new_tree.append(tag)
-            self.id_anchor2soup[(node.href, node.id)] = new_tree
+            self.href_chapter_id2soup_html[(node.href, node.id)] = new_tree
 
         if self.adjacency_list.get(node):
             for sub_node in self.adjacency_list[node]:
-                self.build_one_anchored_section(sub_node)
+                self.build_one_chapter(sub_node)
 
-    def build_anchor2soup(self):
+    def define_chapters_content(self):
         nav_points = self.adjacency_list[-1]
         if self.id_anchor_exist_in_nav_points:
             for point in nav_points:
-                self.build_one_anchored_section(point)
+                self.build_one_chapter(point)
 
     def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
         title = node.title
         if node.id:
-            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
+            content: BeautifulSoup = self.href_chapter_id2soup_html[(node.href, node.id)]
         else:
             content: BeautifulSoup = self.href2soup_html[node.href]