forked from LiveCarta/BookConverter
epub converter: add headers and image processing
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import codecs
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Union
|
||||
|
||||
@@ -8,17 +9,31 @@ import ebooklib
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import epub
|
||||
from ebooklib.epub import Link, Section
|
||||
from ebooklib.utils import debug
|
||||
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
|
||||
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
||||
preprocess_image
|
||||
|
||||
|
||||
# todo: https://docs.python.org/3/howto/unicode.html
|
||||
|
||||
|
||||
class EpubBookAdapter:
|
||||
def __init__(self, file):
|
||||
self.file = file
|
||||
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
||||
self.href2img_bytes = {}
|
||||
|
||||
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
|
||||
debug(x)
|
||||
file_name = x.file_name
|
||||
content = x.content
|
||||
# todo: check how the file path is computed in the lib
|
||||
self.href2img_bytes[file_name] = content
|
||||
|
||||
self.id_anchor_exist_in_nav_points = False
|
||||
self.href2soup_html = self.build_href2soup_content()
|
||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||
# if content.opf has a toc attribute in spine -> the ncx file can be found -> navMap can be extracted from it
|
||||
# if it is not there, a nav tag is searched for in the manifest -> EpubNav. this is epub3 behavior (not tested, todo)
|
||||
self.href2ids = defaultdict(list)
|
||||
@@ -164,11 +179,13 @@ class EpubBookAdapter:
|
||||
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
|
||||
title = node.title
|
||||
if node.id:
|
||||
content = self.id_anchor2soup[(node.href, node.id)]
|
||||
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
|
||||
else:
|
||||
content = self.href2soup_html[node.href]
|
||||
content_preprocessed = str(content) # todo self.preprocess_html(content, node.id)
|
||||
content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
|
||||
content: BeautifulSoup = self.href2soup_html[node.href]
|
||||
|
||||
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
|
||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
|
||||
|
||||
sub_nodes = []
|
||||
# warning! non-EpubHtmlItems won't be added to the chapter
|
||||
if self.adjacency_list.get(node):
|
||||
@@ -196,9 +213,6 @@ if __name__ == "__main__":
|
||||
"content": l
|
||||
}
|
||||
|
||||
output_file = open('output.out', 'w')
|
||||
output_file.write(str(tmp))
|
||||
|
||||
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(tmp, f, ensure_ascii=False)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user