forked from LiveCarta/BookConverter
epub converter: add css processing
This commit is contained in:
@@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section
|
||||
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
||||
preprocess_image, preprocess_footnotes
|
||||
|
||||
update_src_links_in_images, preprocess_footnotes
|
||||
|
||||
# epub3 examples:
|
||||
# https://github.com/IDPF/epub3-samples
|
||||
@@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
|
||||
# поиск toc в epublib:
|
||||
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
|
||||
# если его там нет, пробуют искать nav tag в manifest -> EpubNav.
|
||||
from src.util.css_reader import clean_css, add_inline_style_to_html_soup
|
||||
|
||||
|
||||
class EpubPostprocessor:
|
||||
def __init__(self, file, access=None):
|
||||
self.file = file
|
||||
self.access = access
|
||||
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
||||
# read images
|
||||
self.href2img_bytes = {}
|
||||
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
|
||||
file_name = x.file_name
|
||||
@@ -42,21 +44,26 @@ class EpubPostprocessor:
|
||||
# read html
|
||||
self.id_anchor_exist_in_nav_points = False
|
||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||
# read css
|
||||
self.html_href2css_href = {}
|
||||
self.css_href2content = {}
|
||||
self.build_css_content()
|
||||
# add css
|
||||
self.add_css_styles2soup()
|
||||
# read footnotes
|
||||
self.footnotes = []
|
||||
for href in self.href2soup_html:
|
||||
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
|
||||
noteref_attr_name='data-type'))
|
||||
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
|
||||
# если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
|
||||
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
|
||||
# read toc
|
||||
self.href2ids = defaultdict(list)
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf
|
||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
|
||||
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
|
||||
|
||||
# build simple toc from spine if needed
|
||||
if not self.is_toc_valid():
|
||||
self.build_adjacency_list_from_spine()
|
||||
|
||||
# read anchored blocks, split html into separate block
|
||||
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
|
||||
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
|
||||
self.build_anchor2soup()
|
||||
|
||||
# if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
|
||||
@@ -68,12 +75,37 @@ class EpubPostprocessor:
|
||||
# todo: check if other chapters exist
|
||||
nodes = dict()
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
html_text = item.get_body_content()
|
||||
soup = BeautifulSoup(html_text, features='lxml')
|
||||
html_body_text = item.get_body_content()
|
||||
soup = BeautifulSoup(html_body_text, features='lxml')
|
||||
nodes[item.file_name] = soup
|
||||
|
||||
return nodes
|
||||
|
||||
def build_css_content(self):
    """Collect the CSS that belongs to every HTML document of the book.

    Two instance dictionaries are filled:

    * ``self.html_href2css_href`` maps a document's ``file_name`` to the key
      of the stylesheet chosen for it;
    * ``self.css_href2content`` maps that key to the stylesheet text after
      it has been passed through ``clean_css``.

    Only one stylesheet is remembered per document: every later ``<link>``
    overrides an earlier one, and an inline ``<style>`` overrides any
    ``<link>``. Stylesheets that cannot be located in the book, ``<link>``
    tags without ``href`` and ``<style>`` tags without text are skipped.
    """
    # Local import: hrefs inside an epub always use forward slashes.
    import posixpath

    for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        html_href = item.file_name
        soup = BeautifulSoup(item.content, features='lxml')

        for tag in soup.find_all('link', attrs={"type": "text/css"}):
            css_href = tag.attrs.get('href')
            if not css_href:
                continue

            # The href is written relative to the HTML document, whereas the
            # book looks items up by their manifest name. Try the resolved
            # path first and fall back to the raw value.
            resolved_href = posixpath.normpath(
                posixpath.join(posixpath.dirname(html_href), css_href))
            for css_key in dict.fromkeys((resolved_href, css_href)):
                if css_key not in self.css_href2content:
                    css_item = self.ebooklib_book.get_item_with_href(css_key)
                    if css_item is None:
                        continue
                    css_content: str = css_item.get_content().decode()
                    self.css_href2content[css_key] = clean_css(css_content)
                self.html_href2css_href[html_href] = css_key
                break

        for i, tag in enumerate(soup.find_all('style')):
            css_content = tag.string
            if not css_content:
                # Empty tag, or a tag without a single string child.
                continue
            # The key includes the document name so inline styles of
            # different documents never overwrite each other.
            style_key = f'{html_href}#style{i}'
            self.css_href2content[style_key] = clean_css(css_content)
            self.html_href2css_href[html_href] = style_key
|
||||
|
||||
def add_css_styles2soup(self):
    """Apply the collected stylesheet of every HTML document to its soup.

    For each entry of ``self.href2soup_html`` that has a stylesheet key in
    ``self.html_href2css_href``, the soup is replaced by the result of
    ``add_inline_style_to_html_soup(soup, css_text)``. Documents without a
    registered stylesheet are left untouched.
    """
    # Only values of existing keys are reassigned, so the dictionary size
    # does not change while it is being iterated.
    for html_href, soup in self.href2soup_html.items():
        css_key = self.html_href2css_href.get(html_href)
        if not css_key:
            continue
        css_text: str = self.css_href2content[css_key]
        self.href2soup_html[html_href] = add_inline_style_to_html_soup(soup, css_text)
|
||||
|
||||
def build_manifest_id2href(self):
|
||||
links = dict()
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
@@ -193,7 +225,7 @@ class EpubPostprocessor:
|
||||
else:
|
||||
content: BeautifulSoup = self.href2soup_html[node.href]
|
||||
|
||||
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
|
||||
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
|
||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
|
||||
|
||||
sub_nodes = []
|
||||
|
||||
Reference in New Issue
Block a user