diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index a301b5b..d3a623a 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -94,13 +94,15 @@ class EpubConverter: self.logger.log(f"Html documents not added to TOC: {not_added}.") self.logger.log(f"Add documents not added to TOC.") self.add_not_added_files_to_adjacency_list(not_added) - self.logger.log(f"Html internal links and structure processing.") - self.label_chapters_ids_with_lc_id() - self.chapter_marks_are_same_level() - # used only after parsed toc, ids from toc needed + self.logger.log(f"Label subchapters with converter tag.") + self.label_subchapters_with_lc_tag() + self.logger.log(f"Process html internal links.") self.process_internal_links() + self.logger.log( + f"Check if converter-chapter-marks are on the same level.") + self.chapter_marks_are_same_level() self.logger.log(f"Define chapters content.") - self.define_chapters_content() + self.define_chapters_with_content() self.logger.log(f"Converting html_nodes to LiveCarta chapter items.") def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: @@ -286,14 +288,14 @@ class EpubConverter: return True return False - def build_manifest_id2html_href(self) -> dict: - links = dict() - for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - links[item.id] = item.file_name - return links - def build_adjacency_list_from_spine(self): - manifest_id2html_href = self.build_manifest_id2html_href() + def build_manifest_id2html_href() -> dict: + links = dict() + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + links[item.id] = item.file_name + return links + + manifest_id2html_href = build_manifest_id2html_href() self.adjacency_list = { -1: [] } @@ -311,16 +313,16 @@ class EpubConverter: self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(file) - def label_chapters_ids_with_lc_id(self): + def label_subchapters_with_lc_tag(self): for html_href in self.html_href2html_body_soup: - ids = self.html_href2subchapter_ids[html_href] + ids, soup = self.html_href2subchapters_ids[html_href], \ + self.html_href2html_body_soup[html_href] for i in ids: - soup = self.html_href2html_body_soup[html_href] tag = soup.find(id=i) - new_h = soup.new_tag("tmp") - new_h.attrs["class"] = "converter-chapter-mark" - new_h.attrs["id"] = i - tag.insert_before(new_h) + tmp_tag = soup.new_tag("lc_tmp") + tmp_tag.attrs["class"] = "converter-chapter-mark" + tmp_tag.attrs["id"] = i + tag.insert_before(tmp_tag) def chapter_marks_are_same_level(self): """ @@ -401,8 +403,8 @@ class EpubConverter: Steps ---------- 1. rebuild ids to be unique in all documents - 2a. process anchor which is a whole xhtml file - 2b. process anchor which is an element in xhtml file + 2a. process anchor which is a whole htm|html|xhtml file + 2b. process anchor which is an element in htm|html|xhtml file Returns ------- @@ -410,91 +412,80 @@ class EpubConverter: process links in html """ - # 1. rebuild ids to be unique in all documents - for toc_href in self.hrefs_added_to_toc: - for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): - if tag.attrs.get("class") == "converter-chapter-mark": - continue + def make_ids_unique(): + for toc_href in self.hrefs_added_to_toc: + for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): + if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]: + new_id = self.create_unique_id(toc_href, tag.attrs["id"]) + tag.attrs["id"] = new_id - if tag.attrs.get("class") == "footnote-element": - continue + def process_file_anchor(): + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] + for internal_link_tag in soup.find_all("a", + {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}): + a_tag_href = internal_link_tag.attrs["href"] + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) + if a_tag_href_matched_to_toc: + new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + if new_id not in self.internal_anchors: + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] + new_anchor_span = self.create_new_anchor_span(soup, new_id) + # insert a new span to the beginning of the file + anchor_soup.insert(0, new_anchor_span) + self.internal_anchors.add(new_id) + del internal_link_tag.attrs["href"] - new_id = self.create_unique_id(toc_href, tag.attrs["id"]) - tag.attrs["id"] = new_id + def process_file_element_anchor(): + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] + # process_file_element_anchor + for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}): + a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#") + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) if a_tag_href \ + else os.path.normpath(toc_href).replace("\\", "/") + if a_tag_href_matched_to_toc: + new_id = self.create_unique_id( + a_tag_href_matched_to_toc, a_tag_id) - # 2a. process anchor which is a whole xhtml file - internal_link_reg1 = re.compile( - r"(^(?!https?://).+\.(htm|html|xhtml)$)") - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}): - a_tag_href = internal_link_tag.attrs["href"] - # find full path - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( - toc_href, a_tag_href, internal_link_tag) - if not a_tag_href_matched_to_toc: - continue - new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") - internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" - if new_id not in self.internal_anchors: - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - new_anchor_span = self.create_new_anchor_span(soup, new_id) - # insert a new span to the beginning of the file - anchor_soup.insert(0, new_anchor_span) - self.internal_anchors.add(new_id) + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] + anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \ + anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote + if anchor_tags: + if len(anchor_tags) > 1: + self.logger.log(f"Warning in {toc_href}: multiple anchors:" + f"{len(anchor_tags)} found.\n" + f"{anchor_tags}\n" + f"While processing {internal_link_tag}") - del internal_link_tag.attrs["href"] - - # 2b. process anchor which is an element in xhtml file - internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)") - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}): - a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split( - "#") - # find full path - if a_tag_href: - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, - internal_link_tag) - else: - a_tag_href_matched_to_toc = os.path.normpath( - toc_href).replace("\\", "/") - - if not a_tag_href_matched_to_toc: - continue - - new_id = self.create_unique_id( - a_tag_href_matched_to_toc, a_tag_id) - - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={"id": new_id, }) - anchor_tags = anchor_tags or anchor_soup.find_all( - attrs={"id": a_tag_id}) # if link is a footnote - - if anchor_tags: - if len(anchor_tags) > 1: - self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n" - f"{anchor_tags}\n" - f" While processing {internal_link_tag}") - - anchor_tag = anchor_tags[0] - assert anchor_tag.attrs["id"] in [new_id, a_tag_id] - # if anchor is found we could add placeholder for link creation on server side. - internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" - # create span to have cyclic links, link has 1 type of class, anchor another - if anchor_tag.attrs["id"] not in self.internal_anchors: - new_anchor_span = self.create_new_anchor_span( - soup, new_id) - anchor_tag.insert_before(new_anchor_span) - self.internal_anchors.add(new_id) - del anchor_tag.attrs["id"] - del internal_link_tag.attrs["href"] - - else: - internal_link_tag.attrs["converter-mark"] = "bad-link" - self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found." - f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." - f" Old id={a_tag_id}") + anchor_tag = anchor_tags[0] + assert anchor_tag.attrs["id"] in [new_id, a_tag_id] + # if anchor is found we could add placeholder for link creation on server side. + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + # create span to have cyclic links, link has 1 type of class, anchor another + if anchor_tag.attrs["id"] not in self.internal_anchors: + new_anchor_span = self.create_new_anchor_span( + soup, new_id) + anchor_tag.insert_before(new_anchor_span) + self.internal_anchors.add(new_id) + del anchor_tag.attrs["id"] + del internal_link_tag.attrs["href"] + else: + internal_link_tag.attrs["converter-mark"] = "bad-link" + self.logger.log(f"Error in {toc_href}." + f" While processing {internal_link_tag} no anchor found." + f" Should be anchor with new id={new_id} in" + f" {a_tag_href_matched_to_toc} file." + f" Old id={a_tag_id}") + # 1. make ids to be unique in all documents + make_ids_unique() + # 2a. process anchor which is a whole htm|html|xhtml file + process_file_anchor() + # 2b. process anchor which is an element in htm|html|xhtml file + process_file_element_anchor() @staticmethod def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: @@ -606,10 +597,14 @@ class EpubConverter: book_id=self.file_path.stem if hasattr(self.file_path, "stem") else "book_id") + indent = " " * lvl + self.logger.log(indent + f"Chapter: {title} is processing.") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS - title_preprocessed = self.html_preprocessor.prepare_title(title) - content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content, - remove_title_from_chapter=is_chapter) + self.logger.log(indent + "Process title.") + title_preprocessed = self.html_processor.prepare_title(title) + self.logger.log(indent + "Process content.") + content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content, + remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -618,10 +613,6 @@ class EpubConverter: sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) - - if self.logger: - indent = " " * lvl - self.logger.log(f"{indent}Chapter: {title} is prepared.") return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self) -> dict: @@ -644,17 +635,18 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/Modern_Java_in_Action.epub" + epub_file_path = "../../epub/9781641050234.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\ .get_preset_json() - css_preprocessor = CSSPreprocessor(logger=logger_object) - html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object) + css_processor = CSSPreprocessor() + html_processor = HtmlEpubPreprocessor( + preset=preset, logger=logger_object) json_converter = EpubConverter(epub_file_path, logger=logger_object, - css_preprocessor=css_preprocessor, html_processor=html_preprocessor) + css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: