rewrite process internal links

2022-07-08 18:37:17 +03:00
parent 7d5c1bfdf2
commit 5036445c05
1 changed file with 105 additions and 113 deletions
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -94,13 +94,15 @@ class EpubConverter:
        self.logger.log(f"Html documents not added to TOC: {not_added}.")
        self.logger.log(f"Add documents not added to TOC.")
        self.add_not_added_files_to_adjacency_list(not_added)
-        self.logger.log(f"Html internal links and structure processing.")
-        self.label_chapters_ids_with_lc_id()
-        self.chapter_marks_are_same_level()
-        # used only after parsed toc, ids from toc needed
+        self.logger.log(f"Label subchapters with converter tag.")
+        self.label_subchapters_with_lc_tag()
+        self.logger.log(f"Process html internal links.")
        self.process_internal_links()
+        self.logger.log(
+            f"Check if converter-chapter-marks are on the same level.")
+        self.chapter_marks_are_same_level()
        self.logger.log(f"Define chapters content.")
-        self.define_chapters_content()
+        self.define_chapters_with_content()
        self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
@@ -286,14 +288,14 @@ class EpubConverter:
            return True
        return False

-    def build_manifest_id2html_href(self) -> dict:
-        links = dict()
-        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
-            links[item.id] = item.file_name
-        return links
-
    def build_adjacency_list_from_spine(self):
-        manifest_id2html_href = self.build_manifest_id2html_href()
+        def build_manifest_id2html_href() -> dict:
+            """Map the id of every document item of the book to its file name."""
+            documents = self.ebooklib_book.get_items_of_type(
+                ebooklib.ITEM_DOCUMENT)
+            return {document.id: document.file_name for document in documents}
+
+        manifest_id2html_href = build_manifest_id2html_href()
        self.adjacency_list = {
            -1: []
        }
@@ -311,16 +313,16 @@ class EpubConverter:
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(file)

-    def label_chapters_ids_with_lc_id(self):
+    def label_subchapters_with_lc_tag(self):
+        """Insert an lc_tmp converter-chapter-mark tag, carrying the same id, before every subchapter tag."""
        for html_href in self.html_href2html_body_soup:
-            ids = self.html_href2subchapter_ids[html_href]
+            ids, soup = self.html_href2subchapters_ids[html_href], self.html_href2html_body_soup[html_href]
            for i in ids:
-                soup = self.html_href2html_body_soup[html_href]
                tag = soup.find(id=i)
-                new_h = soup.new_tag("tmp")
-                new_h.attrs["class"] = "converter-chapter-mark"
-                new_h.attrs["id"] = i
-                tag.insert_before(new_h)
+                chapter_mark = soup.new_tag("lc_tmp")
+                chapter_mark.attrs["class"] = "converter-chapter-mark"
+                chapter_mark.attrs["id"] = i
+                tag.insert_before(chapter_mark)

    def chapter_marks_are_same_level(self):
        """
@@ -401,8 +403,8 @@ class EpubConverter:
        Steps
        ----------
        1. rebuild ids to be unique in all documents
-        2a. process anchor which is a whole xhtml file
-        2b. process anchor which is an element in xhtml file
+        2a. process anchor which is a whole htm|html|xhtml file
+        2b. process anchor which is an element in htm|html|xhtml file

        Returns
        -------
@@ -410,91 +412,80 @@ class EpubConverter:
            process links in html

        """
-        # 1. rebuild ids to be unique in all documents
-        for toc_href in self.hrefs_added_to_toc:
-            for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
-                if tag.attrs.get("class") == "converter-chapter-mark":
-                    continue
+        def make_ids_unique():
+            """Replace the ids of tags in TOC documents with ids built by create_unique_id."""
+            for toc_href in self.hrefs_added_to_toc:
+                for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
+                    if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
+                        tag.attrs["id"] = self.create_unique_id(toc_href, tag.attrs["id"])

-                if tag.attrs.get("class") == "footnote-element":
-                    continue
+        def process_file_anchor():
+            """Swap links to a whole htm|html|xhtml file for placeholders pointing at a span put at the file start."""
+            file_link_pattern = re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")
+            for toc_href in self.hrefs_added_to_toc:
+                soup = self.html_href2html_body_soup[toc_href]
+                for link in soup.find_all("a", {"href": file_link_pattern}):
+                    target_href = self.match_href_to_path_from_toc(
+                        toc_href, link.attrs["href"], link)
+                    if not target_href:
+                        continue  # the link could not be matched to a document path
+                    new_id = self.create_unique_id(target_href, "")
+                    link.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
+                    if new_id not in self.internal_anchors:
+                        # the anchor span is inserted once, at the very beginning of the target document
+                        target_soup = self.html_href2html_body_soup[target_href]
+                        target_soup.insert(0, self.create_new_anchor_span(soup, new_id))
+                        self.internal_anchors.add(new_id)
+                    del link.attrs["href"]

-                new_id = self.create_unique_id(toc_href, tag.attrs["id"])
-                tag.attrs["id"] = new_id
+        def process_file_element_anchor():
+            """Give links to an element of an htm|html|xhtml file a placeholder, or mark them as bad-link."""
+            for toc_href in self.hrefs_added_to_toc:
+                soup = self.html_href2html_body_soup[toc_href]
+                for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
+                    # split on the first "#" only: an href holding several "#" must not break the unpacking
+                    a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#", 1)
+                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
+                        toc_href, a_tag_href, internal_link_tag) if a_tag_href \
+                        else os.path.normpath(toc_href).replace("\\", "/")
+                    if a_tag_href_matched_to_toc:
+                        new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)

-        # 2a. process anchor which is a whole xhtml file
-        internal_link_reg1 = re.compile(
-            r"(^(?!https?://).+\.(htm|html|xhtml)$)")
-        for toc_href in self.hrefs_added_to_toc:
-            soup = self.html_href2html_body_soup[toc_href]
-            for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}):
-                a_tag_href = internal_link_tag.attrs["href"]
-                # find full path
-                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
-                    toc_href, a_tag_href, internal_link_tag)
-                if not a_tag_href_matched_to_toc:
-                    continue
-                new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
-                internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
-                if new_id not in self.internal_anchors:
-                    anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
-                    new_anchor_span = self.create_new_anchor_span(soup, new_id)
-                    # insert a new span to the beginning of the file
-                    anchor_soup.insert(0, new_anchor_span)
-                    self.internal_anchors.add(new_id)
+                        anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
+                        anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
+                                      anchor_soup.find_all(attrs={"id": a_tag_id})  # if link is a footnote
+                        if anchor_tags:
+                            if len(anchor_tags) > 1:
+                                self.logger.log(f"Warning in {toc_href}: multiple anchors: "
+                                                f"{len(anchor_tags)} found.\n"
+                                                f"{anchor_tags}\n"
+                                                f"While processing {internal_link_tag}")

-                del internal_link_tag.attrs["href"]
-
-        # 2b. process anchor which is an element in xhtml file
-        internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")
-        for toc_href in self.hrefs_added_to_toc:
-            soup = self.html_href2html_body_soup[toc_href]
-            for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}):
-                a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
-                    "#")
-                # find full path
-                if a_tag_href:
-                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
-                                                                                 internal_link_tag)
-                else:
-                    a_tag_href_matched_to_toc = os.path.normpath(
-                        toc_href).replace("\\", "/")
-
-                if not a_tag_href_matched_to_toc:
-                    continue
-
-                new_id = self.create_unique_id(
-                    a_tag_href_matched_to_toc, a_tag_id)
-
-                anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
-                anchor_tags = anchor_soup.find_all(attrs={"id": new_id, })
-                anchor_tags = anchor_tags or anchor_soup.find_all(
-                    attrs={"id": a_tag_id})  # if link is a footnote
-
-                if anchor_tags:
-                    if len(anchor_tags) > 1:
-                        self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n"
-                                        f"{anchor_tags}\n"
-                                        f" While processing {internal_link_tag}")
-
-                    anchor_tag = anchor_tags[0]
-                    assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
-                    # if anchor is found we could add placeholder for link creation on server side.
-                    internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
-                    # create span to have cyclic links, link has 1 type of class, anchor another
-                    if anchor_tag.attrs["id"] not in self.internal_anchors:
-                        new_anchor_span = self.create_new_anchor_span(
-                            soup, new_id)
-                        anchor_tag.insert_before(new_anchor_span)
-                        self.internal_anchors.add(new_id)
-                        del anchor_tag.attrs["id"]
-                    del internal_link_tag.attrs["href"]
-
-                else:
-                    internal_link_tag.attrs["converter-mark"] = "bad-link"
-                    self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found."
-                                    f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
-                                    f" Old id={a_tag_id}")
+                            anchor_tag = anchor_tags[0]
+                            assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
+                            # if anchor is found we could add placeholder for link creation on server side.
+                            internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
+                            # create span to have cyclic links, link has 1 type of class, anchor another
+                            if anchor_tag.attrs["id"] not in self.internal_anchors:
+                                new_anchor_span = self.create_new_anchor_span(
+                                    soup, new_id)
+                                anchor_tag.insert_before(new_anchor_span)
+                                self.internal_anchors.add(new_id)
+                                del anchor_tag.attrs["id"]
+                            del internal_link_tag.attrs["href"]
+                        else:
+                            internal_link_tag.attrs["converter-mark"] = "bad-link"
+                            self.logger.log(f"Error in {toc_href}."
+                                            f" While processing {internal_link_tag} no anchor found."
+                                            f" Should be anchor with new id={new_id} in"
+                                            f" {a_tag_href_matched_to_toc} file."
+                                            f" Old id={a_tag_id}")
+        # 1. make ids to be unique in all documents
+        make_ids_unique()
+        # 2a. process anchor which is a whole htm|html|xhtml file
+        process_file_anchor()
+        # 2b. process anchor which is an element in htm|html|xhtml file
+        process_file_element_anchor()

    @staticmethod
    def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
@@ -606,10 +597,14 @@ class EpubConverter:
                                                                    book_id=self.file_path.stem
                                                                    if hasattr(self.file_path, "stem") else "book_id")

+        indent = " " * lvl
+        self.logger.log(indent + f"Chapter: {title} is processing.")
        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
-        title_preprocessed = self.html_preprocessor.prepare_title(title)
-        content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
-                                                                      remove_title_from_chapter=is_chapter)
+        self.logger.log(indent + "Process title.")
+        title_preprocessed = self.html_processor.prepare_title(title)
+        self.logger.log(indent + "Process content.")
+        content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
+                                                                   remove_title_from_chapter=is_chapter)
        sub_nodes = []
        # warning! not EpubHtmlItems won't be added to chapter
        # if it doesn't have subchapters
@@ -618,10 +613,6 @@ class EpubConverter:
                sub_chapter_item = self.html_node_to_livecarta_chapter_item(
                    sub_node, lvl + 1)
                sub_nodes.append(sub_chapter_item)
-
-        if self.logger:
-            indent = " " * lvl
-            self.logger.log(f"{indent}Chapter: {title} is prepared.")
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self) -> dict:
@@ -644,17 +635,18 @@ class EpubConverter:


 if __name__ == "__main__":
-    epub_file_path = "../../epub/Modern_Java_in_Action.epub"
+    epub_file_path = "../../epub/9781641050234.epub"
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])

    preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
        .get_preset_json()
-    css_preprocessor = CSSPreprocessor(logger=logger_object)
-    html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
+    css_processor = CSSPreprocessor()
+    html_processor = HtmlEpubPreprocessor(
+        preset=preset, logger=logger_object)

    json_converter = EpubConverter(epub_file_path, logger=logger_object,
-                                   css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
+                                   css_processor=css_processor, html_processor=html_processor)
    content_dict = json_converter.convert_to_dict()

    with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: