epub converter: add headers and image processing

2021-04-15 14:45:41 +03:00
parent 880b045de0
commit 5e58cb3d92
2 changed files with 104 additions and 30 deletions
--- a/src/epub_converter.py
+++ b/src/epub_converter.py
@@ -1,6 +1,7 @@
 import codecs
 import json
 import re
+import os
 from collections import defaultdict
 from typing import Dict, Union

@@ -8,17 +9,31 @@ import ebooklib
 from bs4 import BeautifulSoup
 from ebooklib import epub
 from ebooklib.epub import Link, Section
+from ebooklib.utils import debug

 from src.data_objects import ChapterItem, NavPoint
-from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
+from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
+    preprocess_image
+
+
+# todo: https://docs.python.org/3/howto/unicode.html


 class EpubBookAdapter:
    def __init__(self, file):
        self.file = file
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
+        self.href2img_bytes = {}
+
+        for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
+            debug(x)
+            file_name = x.file_name
+            content = x.content
+            # todo: check how the file path is computed in the lib
+            self.href2img_bytes[file_name] = content
+
        self.id_anchor_exist_in_nav_points = False
-        self.href2soup_html = self.build_href2soup_content()
+        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
        # if the spine in content.opf has a toc attribute -> the ncx file can be found -> navMap is taken from it
        # if it is not there, a nav tag is looked up in the manifest -> EpubNav. this is the epub3 case (todo: not tested)
        self.href2ids = defaultdict(list)
@@ -164,11 +179,13 @@ class EpubBookAdapter:
    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
        title = node.title
        if node.id:
-            content = self.id_anchor2soup[(node.href, node.id)]
+            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
        else:
-            content = self.href2soup_html[node.href]
-        content_preprocessed = str(content)  # todo self.preprocess_html(content, node.id)
-        content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
+            content: BeautifulSoup = self.href2soup_html[node.href]
+
+        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
+        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
+
        sub_nodes = []
        # warning! items that are not EpubHtmlItems won't be added to the chapter
        if self.adjacency_list.get(node):
@@ -196,9 +213,6 @@ if __name__ == "__main__":
        "content": l
    }

-    output_file = open('output.out', 'w')
-    output_file.write(str(tmp))
-
    with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)