From 178896d510b321267de825ecdb9ee5fd6e96edd4 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Mon, 6 Sep 2021 21:01:09 +0300 Subject: [PATCH] epub converter: prettify epub_postprocessor.py --- src/epub_postprocessor.py | 65 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 69c27aa..ef9b028 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -26,7 +26,7 @@ class EpubPostprocessor: self.file = file self.access = access self.logger: BookLogger = logger - self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib + self.ebooklib_book = epub.read_epub(file) self.internal_anchors = set() self.logger.log('Image processing.') self.href2img_bytes = {} @@ -43,7 +43,6 @@ class EpubPostprocessor: self.logger.log('CSS files processing.') self.css_href2content, self.html_href2css_href = self.build_css_content() - # add css self.logger.log('CSS styles adding.') self.add_css_styles2soup() @@ -67,7 +66,7 @@ class EpubPostprocessor: self.logger.log('TOC processing.') self.href2subchapter_ids = defaultdict(list) self.added_to_toc_hrefs = set() - self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf + self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed if not self.is_toc_valid(): @@ -75,14 +74,10 @@ class EpubPostprocessor: not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] self.logger.log(f'Html documents not added to TOC: {not_added}.') self.add_not_added_files_to_adjacency_list(not_added) - # read anchored blocks, split html into separate block - self.unwrap_all_html_soup() # used only after parsed toc, ids from toc needed + self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed 
self.process_internal_links() - self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} - self.build_anchor2soup() - - # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list - # self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine + self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {} + self.define_chapters_content() def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: # using EpubElements @@ -145,7 +140,14 @@ class EpubPostprocessor: return links def build_adjacency_list_from_toc(self, element, lvl=0): - # use book.toc as a root + """ + self.adjacency_list builds based on TOC nested structure + + key = -1 if root, value = None if leaf + + :param element: [Link, tuple, list] - element that appears in TOC( usually parsed from nav.ncx) + :param lvl: level of depth + """ if isinstance(element, Link): # todo: check if link exists @@ -204,7 +206,7 @@ class EpubPostprocessor: self.adjacency_list[-1].append(node) self.added_to_toc_hrefs.add(file) - def unwrap_all_html_soup(self): + def process_html_soup_structure_to_line(self): # mark for href in self.href2soup_html: ids = self.href2subchapter_ids[href] @@ -250,7 +252,7 @@ class EpubPostprocessor: return full_path[0] def process_internal_links(self): - # rebuild ids to be unique in all documents + # 1. 
rebuild ids to be unique in all documents for toc_href in self.added_to_toc_hrefs: for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}): if tag.attrs.get('class') == 'converter-chapter-mark': @@ -262,8 +264,8 @@ class EpubPostprocessor: new_id = self._create_unique_id(toc_href, tag.attrs['id']) tag.attrs['id'] = new_id - # --------------------------------------------------------------------------------- - internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)') # anchor is a whole xhtml file + # 2.a) process anchor which is a whole xhtml file + internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)') for toc_href in self.added_to_toc_hrefs: soup = self.href2soup_html[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): @@ -282,8 +284,7 @@ class EpubPostprocessor: del internal_link_tag.attrs['href'] - # ------------------------------------------------------------------------ - # add placeholder to all internal links + # 2.b) process anchor which is an element in xhtml file internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') for toc_href in self.added_to_toc_hrefs: soup = self.href2soup_html[toc_href] @@ -323,22 +324,18 @@ class EpubPostprocessor: f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Old id={a_tag_id}') - def build_one_anchored_section(self, node): + def build_one_chapter(self, node): """ - к этому моементу html soup уже существует в линейном виде - - если не в линейном - то мы не виноваты + Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) - есть 3 случая: - id оборачивает весь контент, - id оборачивает контент чаптера и под-чаптера, - id только указывает на заголовок + 3 cases: + id wraps all chapter content, + id wraps chapter's content + subchapters' content + id points to the start of title of a chapter - во всех 3х случаях мы знаем где начало заголовка.
Поэтому - глава - это все теги от текущего заголовка - до какого угодно следущющего + In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id + and id of the next chapter/subchapter - заголовок принимается в расчет если в toc есть указание id,тогда заголовок - - это любой тег с id из toc - :return: """ if node.id: soup = self.href2soup_html[node.href] @@ -346,22 +343,22 @@ class EpubPostprocessor: new_tree = BeautifulSoup('', 'html.parser') for tag in chapter_tags: new_tree.append(tag) - self.id_anchor2soup[(node.href, node.id)] = new_tree + self.href_chapter_id2soup_html[(node.href, node.id)] = new_tree if self.adjacency_list.get(node): for sub_node in self.adjacency_list[node]: - self.build_one_anchored_section(sub_node) + self.build_one_chapter(sub_node) - def build_anchor2soup(self): + def define_chapters_content(self): nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: for point in nav_points: - self.build_one_anchored_section(point) + self.build_one_chapter(point) def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem: title = node.title if node.id: - content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)] + content: BeautifulSoup = self.href_chapter_id2soup_html[(node.href, node.id)] else: content: BeautifulSoup = self.href2soup_html[node.href]