forked from LiveCarta/BookConverter
epub converter: refactor epub_converter.py
This commit is contained in:
@@ -28,10 +28,28 @@ class EpubConverter:
|
||||
self.access = access
|
||||
self.logger: BookLogger = logger
|
||||
self.ebooklib_book = epub.read_epub(file)
|
||||
|
||||
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
|
||||
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter ids for each file
|
||||
self.added_to_toc_hrefs = set() # enumerate all file paths that were added to TOC
|
||||
|
||||
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||
# key = -1 for top level NavPoints
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
||||
|
||||
# container for all chapters soup objects
|
||||
# here soup object is only part of the .xhtml file
|
||||
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
|
||||
|
||||
self.internal_anchors = set()
|
||||
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
|
||||
self.href2img_bytes = {} # file path to bytes
|
||||
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path
|
||||
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||
self.noterefs: List[Tag] = [] # start of the footnote
|
||||
self.footnotes: List[Tag] = [] # end of the footnote
|
||||
|
||||
self.logger.log('Image processing.')
|
||||
self.href2img_bytes = {}
|
||||
self.old_image_path2_aws_path = {}
|
||||
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
||||
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
||||
file_name = x.file_name
|
||||
@@ -39,8 +57,7 @@ class EpubConverter:
|
||||
self.href2img_bytes[file_name] = content
|
||||
|
||||
self.logger.log('HTML files reading.')
|
||||
self.id_anchor_exist_in_nav_points = False
|
||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||
self.href2soup_html = self.build_href2soup_content()
|
||||
|
||||
self.logger.log('CSS files processing.')
|
||||
self.css_href2content, self.html_href2css_href = self.build_css_content()
|
||||
@@ -48,9 +65,6 @@ class EpubConverter:
|
||||
self.add_css_styles2soup()
|
||||
|
||||
self.logger.log('Footnotes processing.')
|
||||
self.footnotes_contents: List[str] = []
|
||||
self.noterefs = []
|
||||
self.footnotes: List[Tag] = []
|
||||
for href in self.href2soup_html:
|
||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
|
||||
self.href2soup_html)
|
||||
@@ -65,19 +79,18 @@ class EpubConverter:
|
||||
|
||||
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
|
||||
self.logger.log('TOC processing.')
|
||||
self.href2subchapter_ids = defaultdict(list)
|
||||
self.added_to_toc_hrefs = set()
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points
|
||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||
# build simple toc from spine if needed
|
||||
if not self.is_toc_valid():
|
||||
if self.is_toc_empty():
|
||||
self.build_adjacency_list_from_spine()
|
||||
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
|
||||
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
||||
self.add_not_added_files_to_adjacency_list(not_added)
|
||||
self.logger.log(f'Html internal links and structure processing.')
|
||||
self.label_chapters_ids_with_tmp_id()
|
||||
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
|
||||
self.process_internal_links()
|
||||
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
|
||||
self.logger.log(f'Building chapters content.')
|
||||
self.define_chapters_content()
|
||||
|
||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||
@@ -142,7 +155,7 @@ class EpubConverter:
|
||||
|
||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
||||
"""
|
||||
self.adjacency_list builds based on TOC nested structure
|
||||
self.adjacency_list is built from the nested TOC structure obtained from self.ebooklib_book.toc
|
||||
|
||||
key = -1 if root, value = None if leaf
|
||||
|
||||
@@ -186,10 +199,10 @@ class EpubConverter:
|
||||
else:
|
||||
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
|
||||
|
||||
def is_toc_valid(self):
|
||||
def is_toc_empty(self):
|
||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||
return False
|
||||
return True
|
||||
return True
|
||||
return False
|
||||
|
||||
def build_adjacency_list_from_spine(self):
|
||||
manifest_id2href = self.build_manifest_id2href()
|
||||
@@ -207,8 +220,7 @@ class EpubConverter:
|
||||
self.adjacency_list[-1].append(nav_point)
|
||||
self.added_to_toc_hrefs.add(file)
|
||||
|
||||
def process_html_soup_structure_to_line(self):
|
||||
# mark
|
||||
def label_chapters_ids_with_tmp_id(self):
|
||||
for href in self.href2soup_html:
|
||||
ids = self.href2subchapter_ids[href]
|
||||
for i in ids:
|
||||
@@ -219,6 +231,7 @@ class EpubConverter:
|
||||
new_h.attrs['id'] = i
|
||||
tag.insert_before(new_h)
|
||||
|
||||
def process_html_soup_structure_to_line(self):
|
||||
# go to line structure
|
||||
for href in self.href2soup_html:
|
||||
soup = self.href2soup_html[href]
|
||||
@@ -236,7 +249,7 @@ class EpubConverter:
|
||||
new_anchor_span.string = "\xa0"
|
||||
return new_anchor_span
|
||||
|
||||
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||
"""
|
||||
TOC: a/b/c.xhtml
|
||||
|
||||
@@ -285,7 +298,7 @@ class EpubConverter:
|
||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||
a_tag_href = internal_link_tag.attrs['href']
|
||||
# find full path
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||
if not a_tag_href_matched_to_toc:
|
||||
continue
|
||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
||||
@@ -306,7 +319,8 @@ class EpubConverter:
|
||||
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
||||
# find full path
|
||||
if a_tag_href:
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
|
||||
internal_link_tag)
|
||||
else:
|
||||
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
||||
if not a_tag_href_matched_to_toc:
|
||||
@@ -367,9 +381,9 @@ class EpubConverter:
|
||||
self.build_one_chapter(sub_node)
|
||||
|
||||
def define_chapters_content(self):
|
||||
nav_points = self.adjacency_list[-1]
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
for point in nav_points:
|
||||
for point in top_level_nav_points:
|
||||
self.build_one_chapter(point)
|
||||
|
||||
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
|
||||
Reference in New Issue
Block a user