epub converter: refactor epub_converter.py

This commit is contained in:
shirshasa
2021-09-13 21:06:01 +03:00
parent 73fa84bf76
commit 3ba7ea6bf4

View File

@@ -28,10 +28,28 @@ class EpubConverter:
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file)
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
self.added_to_toc_hrefs = set() # enumerate all file paths that were added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
# container for all chapters soup objects
# here soup object is only part of the .xhtml file
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.internal_anchors = set()
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.href2img_bytes = {} # file path to bytes
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent to the server as is
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log('Image processing.')
self.href2img_bytes = {}
self.old_image_path2_aws_path = {}
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
@@ -39,8 +57,7 @@ class EpubConverter:
self.href2img_bytes[file_name] = content
self.logger.log('HTML files reading.')
self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.href2soup_html = self.build_href2soup_content()
self.logger.log('CSS files processing.')
self.css_href2content, self.html_href2css_href = self.build_css_content()
@@ -48,9 +65,6 @@ class EpubConverter:
self.add_css_styles2soup()
self.logger.log('Footnotes processing.')
self.footnotes_contents: List[str] = []
self.noterefs = []
self.footnotes: List[Tag] = []
for href in self.href2soup_html:
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
self.href2soup_html)
@@ -65,19 +79,18 @@ class EpubConverter:
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
self.logger.log('TOC processing.')
self.href2subchapter_ids = defaultdict(list)
self.added_to_toc_hrefs = set()
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
if not self.is_toc_valid():
if self.is_toc_empty():
self.build_adjacency_list_from_spine()
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.')
self.label_chapters_ids_with_tmp_id()
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
self.process_internal_links()
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.logger.log(f'Building chapters content.')
self.define_chapters_content()
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
@@ -142,7 +155,7 @@ class EpubConverter:
def build_adjacency_list_from_toc(self, element, lvl=0):
"""
self.adjacency_list builds based on TOC nested structure
self.adjacency_list is built from the nested TOC structure taken from self.ebooklib_book.toc
key = -1 if root, value = None if leaf
@@ -186,10 +199,10 @@ class EpubConverter:
else:
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
def is_toc_valid(self):
    """
    Backward-compatible inverse of is_toc_empty().

    Kept so that call sites still using the pre-rename name keep working.
    Returns True when is_toc_empty() is False, and False otherwise.
    """
    return not self.is_toc_empty()

def is_toc_empty(self):
    """
    Tell whether no usable TOC is available.

    Returns True when self.ebooklib_book.toc is None or when
    self.adjacency_list has no entry (or a None entry) under the key -1;
    returns False otherwise.
    """
    # .get() is used so a missing -1 key counts as "empty" instead of raising KeyError
    return (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None)
def build_adjacency_list_from_spine(self):
manifest_id2href = self.build_manifest_id2href()
@@ -207,8 +220,7 @@ class EpubConverter:
self.adjacency_list[-1].append(nav_point)
self.added_to_toc_hrefs.add(file)
def process_html_soup_structure_to_line(self):
# mark
def label_chapters_ids_with_tmp_id(self):
for href in self.href2soup_html:
ids = self.href2subchapter_ids[href]
for i in ids:
@@ -219,6 +231,7 @@ class EpubConverter:
new_h.attrs['id'] = i
tag.insert_before(new_h)
def process_html_soup_structure_to_line(self):
# go to line structure
for href in self.href2soup_html:
soup = self.href2soup_html[href]
@@ -236,7 +249,7 @@ class EpubConverter:
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
"""
TOC: a/b/c.xhtml
@@ -285,7 +298,7 @@ class EpubConverter:
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href']
# find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc:
continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
@@ -306,7 +319,8 @@ class EpubConverter:
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
# find full path
if a_tag_href:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag)
else:
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
if not a_tag_href_matched_to_toc:
@@ -367,9 +381,9 @@ class EpubConverter:
self.build_one_chapter(sub_node)
def define_chapters_content(self):
    """
    Build chapter content for every top-level TOC entry.

    Reads the list stored under key -1 of self.adjacency_list and, only when
    self.id_anchor_exist_in_nav_points is truthy, calls
    self.build_one_chapter() once per entry, in list order.
    """
    # looked up unconditionally, so a missing -1 key raises KeyError even when the flag is off
    top_level_nav_points = self.adjacency_list[-1]
    if self.id_anchor_exist_in_nav_points:
        for point in top_level_nav_points:
            self.build_one_chapter(point)
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: