From d31d4400e02838fa850a0730697568e79e3cb8e4 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Mon, 13 Sep 2021 18:54:59 +0300 Subject: [PATCH] epub converter: update epub_converter.py --- src/epub_converter.py | 87 +++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/src/epub_converter.py b/src/epub_converter.py index ba4ca0e..c50dc40 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -129,7 +129,7 @@ class EpubConverter: for href in self.href2soup_html: if self.html_href2css_href.get(href): css: str = self.css_href2content[self.html_href2css_href[href]] - content = self.href2soup_html[href] + content: BeautifulSoup = self.href2soup_html[href] content = add_inline_style_to_html_soup(content, css) self.href2soup_html[href] = content @@ -152,29 +152,29 @@ class EpubConverter: if isinstance(element, Link): # todo: check if link exists - node = NavPoint(element) - if node.id: + nav_point = NavPoint(element) + if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.href2subchapter_ids[node.href].append(node.id) - self.adjacency_list[node] = None - self.added_to_toc_hrefs.add(node.href) - return node + self.href2subchapter_ids[nav_point.href].append(nav_point.id) + self.adjacency_list[nav_point] = None + self.added_to_toc_hrefs.add(nav_point.href) + return nav_point elif isinstance(element, tuple): first, second = element assert isinstance(first, Section) - node = NavPoint(first) - if node.id: + nav_point = NavPoint(first) + if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.href2subchapter_ids[node.href].append(node.id) + self.href2subchapter_ids[nav_point.href].append(nav_point.id) sub_nodes = [] for i in second: sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) - self.adjacency_list[node] = sub_nodes - self.added_to_toc_hrefs.add(node.href) - return node + self.adjacency_list[nav_point] = sub_nodes + self.added_to_toc_hrefs.add(nav_point.href) + return nav_point elif isinstance(element, list) and (lvl == 0): sub_nodes = [] @@ -197,14 +197,14 @@ class EpubConverter: -1: [] } for id_, _ in self.ebooklib_book.spine: - node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) - self.adjacency_list[-1].append(node) - self.added_to_toc_hrefs.add(node.href) + nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) + self.adjacency_list[-1].append(nav_point) + self.added_to_toc_hrefs.add(nav_point.href) def add_not_added_files_to_adjacency_list(self, not_added): for i, file in enumerate(not_added): - node = NavPoint(Section(f'To check #{i}, filename: {file}', file)) - self.adjacency_list[-1].append(node) + nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file)) + self.adjacency_list[-1].append(nav_point) self.added_to_toc_hrefs.add(file) def process_html_soup_structure_to_line(self): @@ -236,18 +236,31 @@ class EpubConverter: new_anchor_span.string = "\xa0" return new_anchor_span - def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag): - dir_name = os.path.dirname(href) + def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): + """ + TOC: a/b/c.xhtml + + b/c.xhtml -> a/b/c.xhtml + c.xhtml -> a/b/c.xhtml + + Used to find full path to file that is parsed from tag link + + :param cur_file_path: path to current file with tag link + :param href_in_link: filename got from tag link, like file1.xhtml + :param internal_link_tag: tag object that is parsed now + :return: + """ + dir_name = os.path.dirname(cur_file_path) normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/') full_path = [path for path in self.added_to_toc_hrefs if normed_path in path] if not full_path: - self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. ' + self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' f'While processing href in {internal_link_tag}.') internal_link_tag.attrs['converter-mark'] = 'bad-link' return None if len(full_path) > 1: - self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}' + self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' f' while {internal_link_tag} processing. The first one will be chosen.') return full_path[0] @@ -326,7 +339,7 @@ class EpubConverter: f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Old id={a_tag_id}') - def build_one_chapter(self, node): + def build_one_chapter(self, nav_point): """ Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) @@ -339,16 +352,16 @@ class EpubConverter: and id of the next chapter/subchapter """ - if node.id: - soup = self.href2soup_html[node.href] - chapter_tags = get_tags_between_chapter_marks(first_id=node.id, href=node.href, html_soup=soup) + if nav_point.id: + soup = self.href2soup_html[nav_point.href] + chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup) new_tree = BeautifulSoup('', 'html.parser') for tag in chapter_tags: new_tree.append(tag) - self.href_chapter_id2soup_html[(node.href, node.id)] = new_tree + self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree - if self.adjacency_list.get(node): - for sub_node in self.adjacency_list[node]: + if self.adjacency_list.get(nav_point): + for sub_node in self.adjacency_list[nav_point]: self.build_one_chapter(sub_node) def define_chapters_content(self): @@ -357,16 +370,16 @@ class EpubConverter: for point in nav_points: self.build_one_chapter(point) - def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem: - title = node.title - if node.id: - content: BeautifulSoup = self.href_chapter_id2soup_html[(node.href, node.id)] + def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + title = nav_point.title + if nav_point.id: + content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] else: - content: BeautifulSoup = self.href2soup_html[node.href] + content: BeautifulSoup = self.href2soup_html[nav_point.href] self.old_image_path2_aws_path = update_src_links_in_images(content, self.href2img_bytes, - path_to_html=node.href, + path_to_html=nav_point.href, access=self.access, path2aws_path=self.old_image_path2_aws_path) @@ -376,8 +389,8 @@ class EpubConverter: sub_nodes = [] # warning! not EpubHtmlItems won;t be added to chapter - if self.adjacency_list.get(node): - for sub_node in self.adjacency_list[node]: + if self.adjacency_list.get(nav_point): + for sub_node in self.adjacency_list[nav_point]: sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1) sub_nodes.append(sub_chapter_item)