Annotations in Epub converter

This commit is contained in:
Kiryl
2022-08-03 14:39:13 +03:00
parent 7453029295
commit 78e3ad8911
16 changed files with 259 additions and 192 deletions

View File

@@ -9,8 +9,8 @@ from pathlib import Path
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
from bs4 import BeautifulSoup, NavigableString, Tag
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.epub_converter.css_processor import CSSPreprocessor
@@ -39,7 +39,8 @@ class EpubConverter:
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
self.adjacency_list: Dict[Union[NavPoint, -1],
Union[List[NavPoint], None]] = {}
# list to offset Chapter_i on 1st level
self.offset_sub_nodes = []
@@ -70,7 +71,8 @@ class EpubConverter:
BeautifulSoup] = self.build_href2soup_content()
self.logger.log("CSS inline style processing.")
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
self.css_processor.process_inline_styles_in_html_soup(
self.html_href2html_body_soup)
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles fusion(inline+file).")
@@ -107,7 +109,6 @@ class EpubConverter:
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
# for now just for HTML objects, as it is the simplest chapter
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content()
@@ -116,7 +117,7 @@ class EpubConverter:
nodes[item.file_name] = soup
return nodes
def build_html_and_css_relations(self) -> tuple[dict, dict]:
def build_html_and_css_relations(self) -> Tuple[Dict[str, List[str]], Dict[str, str]]:
"""
Function is designed to get 2 dictionaries:
The first is html_href2css_href. It is created to connect the href of an html file to css files (hrefs of them
@@ -130,8 +131,8 @@ class EpubConverter:
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
html_href2css_href: Dict[str, List[str]] = defaultdict(list)
css_href2css_content: Dict[str, str] = {}
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
@@ -213,7 +214,9 @@ class EpubConverter:
html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
def build_adjacency_list_from_toc(self,
element: Union[Link, Tuple[Section, List], List[Union[Link, Tuple]]],
lvl: int = 0) -> NavPoint:
"""
Function
self.adjacency_list is built based on the nested TOC structure obtained from self.ebooklib.toc
@@ -304,7 +307,7 @@ class EpubConverter:
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added: list):
def add_not_added_files_to_adjacency_list(self, not_added: List[str]):
"""Function adds files that were not added to the adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
@@ -315,7 +318,7 @@ class EpubConverter:
def label_subchapters_with_lc_tag(self):
for html_href in self.html_href2html_body_soup:
ids, soup = self.html_href2subchapters_ids[html_href], \
self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href]
for i in ids:
tag = soup.find(id=i)
tmp_tag = soup.new_tag("lc_tmp")
@@ -345,10 +348,13 @@ class EpubConverter:
mark.parent.unwrap()
@staticmethod
def create_unique_id(href, id_):
def create_unique_id(href: str, id_: str) -> str:
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
def match_href_to_path_from_toc(self,
cur_file_path: str,
href_in_link: str,
internal_link_tag: Tag) -> Union[None, str]:
"""
Function used to find the full path to the file that is parsed from the link tag
TOC: a/b/c.xhtml
@@ -387,7 +393,7 @@ class EpubConverter:
return full_path[0]
@staticmethod
def create_new_anchor_span(soup, id_):
def create_new_anchor_span(soup: BeautifulSoup, id_: str) -> Tag:
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
@@ -415,7 +421,8 @@ class EpubConverter:
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
new_id = self.create_unique_id(
toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
def process_file_anchor():
@@ -427,11 +434,13 @@ class EpubConverter:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
@@ -442,7 +451,8 @@ class EpubConverter:
soup = self.html_href2html_body_soup[toc_href]
# process_file_element_anchor
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
"#")
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
else path.normpath(toc_href).replace("\\", "/")
@@ -452,7 +462,8 @@ class EpubConverter:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
anchor_soup.find_all(
attrs={"id": a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
@@ -487,7 +498,9 @@ class EpubConverter:
process_file_element_anchor()
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
def get_tags_between_chapter_marks(first_id: str,
href: str,
html_soup: BeautifulSoup) -> List[Union[Tag, NavigableString]]:
"""
Get tags between LiveCarta chapter marks
Parameters
@@ -568,7 +581,7 @@ class EpubConverter:
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl: int = 1) -> ChapterItem:
"""
Function prepare style, tags to json structure
Parameters
@@ -584,18 +597,18 @@ class EpubConverter:
built chapter
"""
title = nav_point.title
title: str = nav_point.title
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
indent = " " * lvl
indent: str = " " * lvl
self.logger.log(indent + f"Chapter: {title} is processing.")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
self.logger.log(indent + "Process title.")
title_preprocessed = self.html_processor.prepare_title(title)
title_preprocessed: str = self.html_processor.prepare_title(title)
self.logger.log(indent + "Process content.")
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content(
title_preprocessed, content, remove_title_from_chapter=is_chapter)
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
self.img_href2img_bytes,
@@ -613,7 +626,7 @@ class EpubConverter:
sub_nodes.append(sub_chapter_item)
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
def convert_to_dict(self) -> dict:
def convert_to_dict(self) -> Dict[str, List[Dict[str, Union[List, str]]]]:
"""Function which converts the list of html nodes to the appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -633,7 +646,7 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../books/epub/9780763774134.epub"
epub_file_path = "../../books/epub/9781119646044.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])