epub converter: add css processing

This commit is contained in:
shirshasa
2021-04-22 17:26:17 +03:00
parent 8f284651c4
commit e0e64a0c38
3 changed files with 229 additions and 31 deletions

View File

@@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
preprocess_image, preprocess_footnotes
update_src_links_in_images, preprocess_footnotes
# epub3 examples:
# https://github.com/IDPF/epub3-samples
@@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
# how ebooklib locates the table of contents (toc):
# if the spine element in content.opf has a toc attribute -> the ncx file can be found -> navMap is read from it
# if that attribute is absent, ebooklib tries to find a nav tag in the manifest -> EpubNav.
from src.util.css_reader import clean_css, add_inline_style_to_html_soup
class EpubPostprocessor:
def __init__(self, file, access=None):
self.file = file
self.access = access
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
# read images
self.href2img_bytes = {}
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
file_name = x.file_name
@@ -42,21 +44,26 @@ class EpubPostprocessor:
# read html
self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
# read css
self.html_href2css_href = {}
self.css_href2content = {}
self.build_css_content()
# add css
self.add_css_styles2soup()
# read footnotes
self.footnotes = []
for href in self.href2soup_html:
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
noteref_attr_name='data-type'))
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
# если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
# read toc
self.href2ids = defaultdict(list)
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
# build simple toc from spine if needed
if not self.is_toc_valid():
self.build_adjacency_list_from_spine()
# read anchored blocks, split html into separate block
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
self.build_anchor2soup()
# if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
@@ -68,12 +75,37 @@ class EpubPostprocessor:
# todo: check if other chapters exist
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_text = item.get_body_content()
soup = BeautifulSoup(html_text, features='lxml')
html_body_text = item.get_body_content()
soup = BeautifulSoup(html_body_text, features='lxml')
nodes[item.file_name] = soup
return nodes
def build_css_content(self):
    """Collect the CSS that belongs to every HTML document of the book.

    Each ITEM_DOCUMENT item is parsed from its full markup (``item.content``)
    and two sources of CSS are read:

    * ``<link type="text/css" href=...>`` tags: the referenced stylesheet is
      loaded from the book, passed through ``clean_css`` and cached in
      ``self.css_href2content`` under its href, so a stylesheet shared by
      several documents is processed only once;
    * ``<style>`` tags: their text is passed through ``clean_css`` and cached
      under a key that is unique per document and per tag.

    ``self.html_href2css_href[item.file_name]`` holds a single key, so it ends
    up pointing at the last stylesheet found for the document (``<style>`` tags
    are processed after ``<link>`` tags and therefore take precedence).
    Stylesheets that cannot be located and empty ``<style>`` tags are skipped,
    leaving any previously recorded mapping for the document untouched.
    """
    import posixpath  # local import: only needed to resolve document-relative hrefs

    for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.content, features='lxml')

        for tag in soup.find_all('link', attrs={"type": "text/css"}):
            css_href = tag.attrs.get('href')
            if not css_href:
                # a <link> without href has nothing to load
                continue

            if css_href not in self.css_href2content:
                css_item = self.ebooklib_book.get_item_with_href(css_href)
                if css_item is None:
                    # The href is written relative to the HTML document, while book
                    # items are looked up by their own file name; retry with the
                    # href resolved against the document's directory.
                    resolved_href = posixpath.normpath(
                        posixpath.join(posixpath.dirname(item.file_name), css_href))
                    css_item = self.ebooklib_book.get_item_with_href(resolved_href)
                if css_item is None:
                    # stylesheet is referenced but not present in the book
                    continue
                css_content: str = css_item.get_content().decode()
                self.css_href2content[css_href] = clean_css(css_content)

            # record the mapping only once the content is known to be cached
            self.html_href2css_href[item.file_name] = css_href

        for i, tag in enumerate(soup.find_all('style')):
            css_content = tag.string
            if not css_content:
                # .string is None when the tag has no single text child
                continue
            # The key includes the document name: a bare counter would be reused
            # by every document and let one document's CSS overwrite another's.
            style_key = f'{item.file_name}#style{i}'
            self.css_href2content[style_key] = clean_css(css_content)
            self.html_href2css_href[item.file_name] = style_key
def add_css_styles2soup(self):
    """Apply the collected CSS to every parsed HTML document.

    For each entry of ``self.href2soup_html`` that has a truthy stylesheet key
    in ``self.html_href2css_href``, the stylesheet text is taken from
    ``self.css_href2content`` and the stored soup is replaced by the value
    returned from ``add_inline_style_to_html_soup(soup, css)``.
    Documents without a recorded stylesheet are left as they are.
    """
    for html_href, soup in self.href2soup_html.items():
        stylesheet_key = self.html_href2css_href.get(html_href)
        if not stylesheet_key:
            # no stylesheet recorded for this document
            continue
        stylesheet: str = self.css_href2content[stylesheet_key]
        # only the value of an existing key is replaced, so iteration stays valid
        self.href2soup_html[html_href] = add_inline_style_to_html_soup(soup, stylesheet)
def build_manifest_id2href(self):
links = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
@@ -193,7 +225,7 @@ class EpubPostprocessor:
else:
content: BeautifulSoup = self.href2soup_html[node.href]
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
sub_nodes = []