diff --git a/src/epub_converter.py b/src/epub_converter.py index 056deb1..375020e 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -28,10 +28,28 @@ class EpubConverter: self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file) + + self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files + self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter ids for each file + self.added_to_toc_hrefs = set() # enumerate all file paths that were added to TOC + + # toc tree structure stored as adj.list (NavPoint to list of NavPoints) + # key = -1 for top level NavPoints + self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} + + # container for all chapters soup objects + # here soup object is only part of the .xhtml file + self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {} + self.internal_anchors = set() + self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed + self.href2img_bytes = {} # file path to bytes + self.old_image_path2_aws_path = {} # original file path to generated aws path + self.footnotes_contents: List[str] = [] # to be sent on server as is + self.noterefs: List[Tag] = [] # start of the footnote + self.footnotes: List[Tag] = [] # end of the footnote + self.logger.log('Image processing.') - self.href2img_bytes = {} - self.old_image_path2_aws_path = {} for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): file_name = x.file_name @@ -39,8 +57,7 @@ class EpubConverter: self.href2img_bytes[file_name] = content self.logger.log('HTML files reading.') - self.id_anchor_exist_in_nav_points = False - self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() + self.href2soup_html = self.build_href2soup_content() self.logger.log('CSS files processing.') self.css_href2content, self.html_href2css_href = 
self.build_css_content() @@ -48,9 +65,6 @@ class EpubConverter: self.add_css_styles2soup() self.logger.log('Footnotes processing.') - self.footnotes_contents: List[str] = [] - self.noterefs = [] - self.footnotes: List[Tag] = [] for href in self.href2soup_html: content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href], self.href2soup_html) @@ -65,19 +79,18 @@ class EpubConverter: self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.') self.logger.log('TOC processing.') - self.href2subchapter_ids = defaultdict(list) - self.added_to_toc_hrefs = set() - self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed - if not self.is_toc_valid(): + if self.is_toc_empty(): self.build_adjacency_list_from_spine() not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] self.logger.log(f'Html documents not added to TOC: {not_added}.') self.add_not_added_files_to_adjacency_list(not_added) + self.logger.log(f'Html internal links and structure processing.') + self.label_chapters_ids_with_tmp_id() self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed self.process_internal_links() - self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {} + self.logger.log(f'Building chapters content.') self.define_chapters_content() def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: @@ -142,7 +155,7 @@ class EpubConverter: def build_adjacency_list_from_toc(self, element, lvl=0): """ - self.adjacency_list builds based on TOC nested structure + self.adjacency_list builds based on TOC nested structure, got from self.ebooklib_book.toc key = -1 if root, value = None if leaf @@ -186,10 +199,10 @@ class EpubConverter: else: assert 0, f'Error. 
Element is not tuple/Link instance: {type(element)}' - def is_toc_valid(self): + def is_toc_empty(self): if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): - return False - return True + return True + return False def build_adjacency_list_from_spine(self): manifest_id2href = self.build_manifest_id2href() @@ -207,8 +220,7 @@ class EpubConverter: self.adjacency_list[-1].append(nav_point) self.added_to_toc_hrefs.add(file) - def process_html_soup_structure_to_line(self): - # mark + def label_chapters_ids_with_tmp_id(self): for href in self.href2soup_html: ids = self.href2subchapter_ids[href] for i in ids: @@ -219,6 +231,7 @@ class EpubConverter: new_h.attrs['id'] = i tag.insert_before(new_h) + def process_html_soup_structure_to_line(self): # go to line structure for href in self.href2soup_html: soup = self.href2soup_html[href] @@ -236,7 +249,7 @@ class EpubConverter: new_anchor_span.string = "\xa0" return new_anchor_span - def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): + def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): """ TOC: a/b/c.xhtml @@ -285,7 +298,7 @@ class EpubConverter: for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): a_tag_href = internal_link_tag.attrs['href'] # find full path - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) + a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) if not a_tag_href_matched_to_toc: continue new_id = self._create_unique_id(a_tag_href_matched_to_toc, '') @@ -306,7 +319,8 @@ class EpubConverter: a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#') # find full path if a_tag_href: - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) + a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, + 
internal_link_tag) else: a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/') if not a_tag_href_matched_to_toc: @@ -367,9 +381,9 @@ class EpubConverter: self.build_one_chapter(sub_node) def define_chapters_content(self): - nav_points = self.adjacency_list[-1] + top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: - for point in nav_points: + for point in top_level_nav_points: self.build_one_chapter(point) def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: