epub converter: add css processing

2021-04-22 17:26:17 +03:00
parent 8f284651c4
commit e0e64a0c38
3 changed files with 229 additions and 31 deletions
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section

 from src.data_objects import ChapterItem, NavPoint
 from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
-    preprocess_image, preprocess_footnotes
-
+    update_src_links_in_images, preprocess_footnotes

 # epub3 examples:
 # https://github.com/IDPF/epub3-samples
@@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
 # how ebooklib looks up the toc:
 # if the spine in content.opf has a toc attribute -> the ncx file can be found -> navMap is read from it
 # if it is not there, it tries to find the nav tag in the manifest -> EpubNav.
+from src.util.css_reader import clean_css, add_inline_style_to_html_soup
+

 class EpubPostprocessor:
    def __init__(self, file, access=None):
        self.file = file
        self.access = access
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
+        # read images
        self.href2img_bytes = {}
        for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
            file_name = x.file_name
@@ -42,21 +44,26 @@ class EpubPostprocessor:
        # read html
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
+        # read css
+        self.html_href2css_href = {}
+        self.css_href2content = {}
+        self.build_css_content()
+        # add css
+        self.add_css_styles2soup()
+        # read footnotes
        self.footnotes = []
        for href in self.href2soup_html:
-            self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
-                                                       noteref_attr_name='data-type'))
-        # если в content.opf есть в spine toc атрибут  -> можно найти ncx файл -> из него достать navMap
-        # если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
+            self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
+        # read toc
        self.href2ids = defaultdict(list)
        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}  # k = -1 if root, v = None if leaf
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
-        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
-        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
-
+        # build simple toc from spine if needed
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()
-
+        # read anchored blocks, split html into separate block
+        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
+        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        self.build_anchor2soup()

        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
@@ -68,12 +75,37 @@ class EpubPostprocessor:
        # todo: check if other chapters exist
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
-            html_text = item.get_body_content()
-            soup = BeautifulSoup(html_text, features='lxml')
+            html_body_text = item.get_body_content()
+            soup = BeautifulSoup(html_body_text, features='lxml')
            nodes[item.file_name] = soup

        return nodes

+    def build_css_content(self):
+        """Collect the css used by every html document of the book.
+
+        Fills two mappings on the instance:
+
+        * ``self.css_href2content``: css key -> css text passed through ``clean_css``
+        * ``self.html_href2css_href``: html file name -> css key of that document
+
+        Linked stylesheets are keyed by the href under which the css item was found in
+        the book; inline ``<style>`` blocks are keyed per document
+        (``<html file name>#style<i>``) so that blocks of different documents never
+        overwrite each other.
+
+        NOTE: the mapping keeps a single css key per document, so when a document has
+        several stylesheets the last one processed wins (inline ``<style>`` blocks are
+        processed after ``<link>`` tags).
+        """
+        # local import: only this method needs it, keeps the module import block untouched
+        import posixpath
+
+        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+            html_href = item.file_name
+            # full document is parsed here (not only the body): <link>/<style> live in <head>
+            soup = BeautifulSoup(item.content, features='lxml')
+
+            for tag in soup.find_all('link', attrs={"type": "text/css"}):
+                raw_href = tag.attrs.get('href')
+                if not raw_href:
+                    continue  # <link> without href: nothing to load
+                # href as written first, then resolved against the directory of the html file
+                # (covers relative links such as "../css/style.css")
+                resolved_href = posixpath.normpath(posixpath.join(posixpath.dirname(html_href), raw_href))
+                for css_href in (raw_href, resolved_href):
+                    if css_href not in self.css_href2content:
+                        css_item = self.ebooklib_book.get_item_with_href(css_href)
+                        if css_item is None:
+                            continue  # not a book item under this name, try the next candidate
+                        css_content: str = css_item.get_content().decode()
+                        self.css_href2content[css_href] = clean_css(css_content)
+                    # register the mapping only once the content is known to be available
+                    self.html_href2css_href[html_href] = css_href
+                    break
+
+            for i, tag in enumerate(soup.find_all('style')):
+                css_content = tag.string
+                if css_content is None:
+                    continue  # empty <style> or one with several children: no single string
+                css_key = f'{html_href}#style{i}'
+                self.css_href2content[css_key] = clean_css(css_content)
+                self.html_href2css_href[html_href] = css_key
+
+    def add_css_styles2soup(self):
+        """Apply the collected css to each html soup that has a css entry.
+
+        For every html href in ``self.href2soup_html`` with a non-empty entry in
+        ``self.html_href2css_href``, the matching css text is taken from
+        ``self.css_href2content`` and handed, together with the soup, to
+        ``add_inline_style_to_html_soup``; the returned value replaces the stored soup.
+        Documents without a css entry are left untouched.
+        """
+        # only values of existing keys are reassigned, so iterating items() is safe
+        for html_href, soup in self.href2soup_html.items():
+            css_key = self.html_href2css_href.get(html_href)
+            if not css_key:
+                continue
+            css_text: str = self.css_href2content[css_key]
+            self.href2soup_html[html_href] = add_inline_style_to_html_soup(soup, css_text)
+
    def build_manifest_id2href(self):
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
@@ -193,7 +225,7 @@ class EpubPostprocessor:
        else:
            content: BeautifulSoup = self.href2soup_html[node.href]

-        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
+        update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)

        sub_nodes = []