From ef3502cd0a68be0bbc7575e26210e6c6b8b2b81a Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 1 Dec 2021 16:08:19 +0300 Subject: [PATCH] Add processor (no stem in file) --- src/epub_converter/epub_converter.py | 24 ++++++++++++-------- src/epub_converter/html_epub_preprocessor.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 35ba909..4dd8dd1 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -30,9 +30,11 @@ class EpubConverter: self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file) - self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files - self.html_href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file - self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC + # main container for all epub .xhtml files + self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} + # enumerate all subchapter id for each file + self.html_href2subchapter_ids = defaultdict(list) + self.hrefs_added_to_toc = set() # enumerate all file paths that were added to TOC # toc tree structure stored as adj.list (NavPoint to list of NavPoints) # key = -1 for top level NavPoints @@ -43,7 +45,8 @@ class EpubConverter: self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {} self.internal_anchors = set() - self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed + # flag to be updated while ebooklib.toc is parsed + self.id_anchor_exist_in_nav_points = False self.img_href2img_bytes = {} # file path to bytes self.old_image_path2aws_path = {} # file path from to generated aws path self.footnotes_contents: List[str] = [] # to be sent on server as is @@ -191,7 +194,8 @@ class EpubConverter: nav_point = NavPoint(element) if nav_point.id: self.id_anchor_exist_in_nav_points = True - 
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id) + self.html_href2subchapter_ids[nav_point.href].append( + nav_point.id) self.adjacency_list[nav_point] = None self.hrefs_added_to_toc.add(nav_point.href) return nav_point @@ -202,7 +206,8 @@ class EpubConverter: nav_point = NavPoint(first) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.html_href2subchapter_ids[nav_point.href].append(nav_point.id) + self.html_href2subchapter_ids[nav_point.href].append( + nav_point.id) sub_nodes = [] for i in second: @@ -263,7 +268,8 @@ class EpubConverter: # go to line structure for html_href in self.html_href2html_body_soup: soup = self.html_href2html_body_soup[html_href] - self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) + self.html_href2html_body_soup[html_href] = unwrap_structural_tags( + soup) @staticmethod def create_unique_id(href, id_): @@ -440,12 +446,12 @@ class EpubConverter: path_to_html=nav_point.href, access=self.access, path2aws_path=self.old_image_path2aws_path, - book_id=self.file.stem or 'book_id') + book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id') is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed = prepare_title(title) content_preprocessed = prepare_content(title_preprocessed, content, - remove_title_from_chapter=is_chapter) + remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! 
not EpubHtmlItems won't be added to chapter if self.adjacency_list.get(nav_point): diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index bd83a59..15e026a 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -56,7 +56,7 @@ def update_src_links_in_images(body_tag: Tag, path2aws_path[path_to_img_from_root] = new_folder else: new_folder = save_image_locally( - path_to_img_from_root, img_content, book_id) + path_to_img_from_root, img_content, 'book_id') img.attrs['src'] = str(new_folder) if img.attrs.get('width'):