epub converter: add access object for image processing

- update headings cleaning - add h tag removal
2021-04-21 17:27:50 +03:00
parent dce0f871a8
commit ea0814fb4c
2 changed files with 42 additions and 17 deletions
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -1,7 +1,5 @@
 import codecs
 import json
-import re
-import os
 from collections import defaultdict
 from typing import Dict, Union

@@ -9,7 +7,6 @@ import ebooklib
 from bs4 import BeautifulSoup
 from ebooklib import epub
 from ebooklib.epub import Link, Section
-from ebooklib.utils import debug

 from src.data_objects import ChapterItem, NavPoint
 from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
@@ -27,19 +24,22 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
 # todo: https://docs.python.org/3/howto/unicode.html


+# TOC lookup in ebooklib:
+# if the spine element in content.opf has a toc attribute -> the ncx file can be located -> navMap is read from it
+# if the attribute is absent, the library tries to find a nav tag in the manifest -> EpubNav.
+
 class EpubPostprocessor:
-    def __init__(self, file):
+    def __init__(self, file, access=None):
        self.file = file
+        self.access = access
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
        self.href2img_bytes = {}
-
        for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
-            debug(x)
            file_name = x.file_name
            content = x.content
            # todo: check how file path is count in lib
            self.href2img_bytes[file_name] = content
-
+        # read html
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
        self.footnotes = []
@@ -193,7 +193,7 @@ class EpubPostprocessor:
        else:
            content: BeautifulSoup = self.href2soup_html[node.href]

-        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
+        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)

        sub_nodes = []