diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 6d32940..dc83b62 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -1,7 +1,5 @@ import codecs import json -import re -import os from collections import defaultdict from typing import Dict, Union @@ -9,7 +7,6 @@ import ebooklib from bs4 import BeautifulSoup from ebooklib import epub from ebooklib.epub import Link, Section -from ebooklib.utils import debug from src.data_objects import ChapterItem, NavPoint from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ @@ -27,19 +24,22 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ # todo: https://docs.python.org/3/howto/unicode.html +# toc lookup in epublib: +# if the spine in content.opf has a toc attribute -> the ncx file can be found -> navMap can be extracted from it +# if it is not there, they try to find the nav tag in the manifest -> EpubNav. + class EpubPostprocessor: - def __init__(self, file): + def __init__(self, file, access=None): self.file = file + self.access = access self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.href2img_bytes = {} - for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): - debug(x) file_name = x.file_name content = x.content # todo: check how file path is count in lib self.href2img_bytes[file_name] = content - + # read html self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.footnotes = [] @@ -193,7 +193,7 @@ class EpubPostprocessor: else: content: BeautifulSoup = self.href2soup_html[node.href] - preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None) + preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access) title_preprocessed, content_preprocessed = prepare_title_and_content(title, content) sub_nodes = [] diff --git a/src/html_epub_preprocessor.py 
b/src/html_epub_preprocessor.py index c615f35..ef372c3 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -6,6 +6,7 @@ from typing import List from bs4 import BeautifulSoup, NavigableString, Tag from src.access import Access +from src.config import LawCartaConfig def save_image_locally(img_file_path, img_content, book_id): @@ -54,10 +55,6 @@ def preprocess_table(): pass -def preprocess_quote(): - pass - - def _process_lists(body_tag): """ Function to process tags