epub converter: prettify epub_postprocessor.py

This commit is contained in:
shirshasa
2021-09-06 21:01:09 +03:00
parent 82d51f93df
commit 178896d510

View File

@@ -26,7 +26,7 @@ class EpubPostprocessor:
self.file = file self.file = file
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.ebooklib_book = epub.read_epub(file)
self.internal_anchors = set() self.internal_anchors = set()
self.logger.log('Image processing.') self.logger.log('Image processing.')
self.href2img_bytes = {} self.href2img_bytes = {}
@@ -43,7 +43,6 @@ class EpubPostprocessor:
self.logger.log('CSS files processing.') self.logger.log('CSS files processing.')
self.css_href2content, self.html_href2css_href = self.build_css_content() self.css_href2content, self.html_href2css_href = self.build_css_content()
# add css
self.logger.log('CSS styles adding.') self.logger.log('CSS styles adding.')
self.add_css_styles2soup() self.add_css_styles2soup()
@@ -67,7 +66,7 @@ class EpubPostprocessor:
self.logger.log('TOC processing.') self.logger.log('TOC processing.')
self.href2subchapter_ids = defaultdict(list) self.href2subchapter_ids = defaultdict(list)
self.added_to_toc_hrefs = set() self.added_to_toc_hrefs = set()
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
if not self.is_toc_valid(): if not self.is_toc_valid():
@@ -75,14 +74,10 @@ class EpubPostprocessor:
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
self.logger.log(f'Html documents not added to TOC: {not_added}.') self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.add_not_added_files_to_adjacency_list(not_added) self.add_not_added_files_to_adjacency_list(not_added)
# read anchored blocks, split html into separate block self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
self.unwrap_all_html_soup() # used only after parsed toc, ids from toc needed
self.process_internal_links() self.process_internal_links()
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.build_anchor2soup() self.define_chapters_content()
# if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
# self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements # using EpubElements
@@ -145,7 +140,14 @@ class EpubPostprocessor:
return links return links
def build_adjacency_list_from_toc(self, element, lvl=0): def build_adjacency_list_from_toc(self, element, lvl=0):
# use book.toc as a root """
self.adjacency_list is built from the nested TOC structure
key = -1 if root, value = None if leaf
:param element: [Link, tuple, list] - element that appears in TOC (usually parsed from nav.ncx)
:param lvl: level of depth
"""
if isinstance(element, Link): if isinstance(element, Link):
# todo: check if link exists # todo: check if link exists
@@ -204,7 +206,7 @@ class EpubPostprocessor:
self.adjacency_list[-1].append(node) self.adjacency_list[-1].append(node)
self.added_to_toc_hrefs.add(file) self.added_to_toc_hrefs.add(file)
def unwrap_all_html_soup(self): def process_html_soup_structure_to_line(self):
# mark # mark
for href in self.href2soup_html: for href in self.href2soup_html:
ids = self.href2subchapter_ids[href] ids = self.href2subchapter_ids[href]
@@ -250,7 +252,7 @@ class EpubPostprocessor:
return full_path[0] return full_path[0]
def process_internal_links(self): def process_internal_links(self):
# rebuild ids to be unique in all documents # 1. rebuild ids to be unique in all documents
for toc_href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}): for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
if tag.attrs.get('class') == 'converter-chapter-mark': if tag.attrs.get('class') == 'converter-chapter-mark':
@@ -262,8 +264,8 @@ class EpubPostprocessor:
new_id = self._create_unique_id(toc_href, tag.attrs['id']) new_id = self._create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id tag.attrs['id'] = new_id
# --------------------------------------------------------------------------------- # 2.a) process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)') # anchor is a whole xhtml file internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
for toc_href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
soup = self.href2soup_html[toc_href] soup = self.href2soup_html[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
@@ -282,8 +284,7 @@ class EpubPostprocessor:
del internal_link_tag.attrs['href'] del internal_link_tag.attrs['href']
# ------------------------------------------------------------------------ # 2.b) process anchor which is an element in xhtml file
# add placeholder to all internal links
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
for toc_href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
soup = self.href2soup_html[toc_href] soup = self.href2soup_html[toc_href]
@@ -323,22 +324,18 @@ class EpubPostprocessor:
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}') f' Old id={a_tag_id}')
def build_one_anchored_section(self, node): def build_one_chapter(self, node):
""" """
к этому моементу html soup уже существует в линейном виде Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
- если не в линейном - то мы не виноваты
есть 3 случая: 3 cases:
id оборачивает весь контент, id wraps all chapter content,
id оборачивает контент чаптера и под-чаптера, id wraps chapter's content + subchapters' content
id только указывает на заголовок id points to the start of title of a chapter
во всех 3х случаях мы знаем где начало заголовка. Поэтому In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
глава - это все теги от текущего заголовка - до какого угодно следущющего and id of the next chapter/subchapter
заголовок принимается в расчет если в toc есть указание id,тогда заголовок -
это любой тег с id из toc
:return:
""" """
if node.id: if node.id:
soup = self.href2soup_html[node.href] soup = self.href2soup_html[node.href]
@@ -346,22 +343,22 @@ class EpubPostprocessor:
new_tree = BeautifulSoup('', 'html.parser') new_tree = BeautifulSoup('', 'html.parser')
for tag in chapter_tags: for tag in chapter_tags:
new_tree.append(tag) new_tree.append(tag)
self.id_anchor2soup[(node.href, node.id)] = new_tree self.href_chapter_id2soup_html[(node.href, node.id)] = new_tree
if self.adjacency_list.get(node): if self.adjacency_list.get(node):
for sub_node in self.adjacency_list[node]: for sub_node in self.adjacency_list[node]:
self.build_one_anchored_section(sub_node) self.build_one_chapter(sub_node)
def build_anchor2soup(self): def define_chapters_content(self):
nav_points = self.adjacency_list[-1] nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in nav_points: for point in nav_points:
self.build_one_anchored_section(point) self.build_one_chapter(point)
def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem: def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
title = node.title title = node.title
if node.id: if node.id:
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)] content: BeautifulSoup = self.href_chapter_id2soup_html[(node.href, node.id)]
else: else:
content: BeautifulSoup = self.href2soup_html[node.href] content: BeautifulSoup = self.href2soup_html[node.href]