forked from LiveCarta/BookConverter
epub converter: refactor epub_converter.py
This commit is contained in:
@@ -28,10 +28,28 @@ class EpubConverter:
|
|||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(file)
|
self.ebooklib_book = epub.read_epub(file)
|
||||||
|
|
||||||
|
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
|
||||||
|
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter ids for each file
|
||||||
|
self.added_to_toc_hrefs = set() # enumerate all file paths that were added to TOC
|
||||||
|
|
||||||
|
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||||
|
# key = -1 for top level NavPoints
|
||||||
|
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
||||||
|
|
||||||
|
# container for all chapters soup objects
|
||||||
|
# here soup object is only part of the .xhtml file
|
||||||
|
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
|
||||||
|
|
||||||
self.internal_anchors = set()
|
self.internal_anchors = set()
|
||||||
|
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
|
||||||
|
self.href2img_bytes = {} # file path to bytes
|
||||||
|
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path
|
||||||
|
self.footnotes_contents: List[str] = [] # to be sent to the server as is
|
||||||
|
self.noterefs: List[Tag] = [] # start of the footnote
|
||||||
|
self.footnotes: List[Tag] = [] # end of the footnote
|
||||||
|
|
||||||
self.logger.log('Image processing.')
|
self.logger.log('Image processing.')
|
||||||
self.href2img_bytes = {}
|
|
||||||
self.old_image_path2_aws_path = {}
|
|
||||||
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
||||||
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
||||||
file_name = x.file_name
|
file_name = x.file_name
|
||||||
@@ -39,8 +57,7 @@ class EpubConverter:
|
|||||||
self.href2img_bytes[file_name] = content
|
self.href2img_bytes[file_name] = content
|
||||||
|
|
||||||
self.logger.log('HTML files reading.')
|
self.logger.log('HTML files reading.')
|
||||||
self.id_anchor_exist_in_nav_points = False
|
self.href2soup_html = self.build_href2soup_content()
|
||||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
|
||||||
|
|
||||||
self.logger.log('CSS files processing.')
|
self.logger.log('CSS files processing.')
|
||||||
self.css_href2content, self.html_href2css_href = self.build_css_content()
|
self.css_href2content, self.html_href2css_href = self.build_css_content()
|
||||||
@@ -48,9 +65,6 @@ class EpubConverter:
|
|||||||
self.add_css_styles2soup()
|
self.add_css_styles2soup()
|
||||||
|
|
||||||
self.logger.log('Footnotes processing.')
|
self.logger.log('Footnotes processing.')
|
||||||
self.footnotes_contents: List[str] = []
|
|
||||||
self.noterefs = []
|
|
||||||
self.footnotes: List[Tag] = []
|
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
|
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
|
||||||
self.href2soup_html)
|
self.href2soup_html)
|
||||||
@@ -65,19 +79,18 @@ class EpubConverter:
|
|||||||
|
|
||||||
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
|
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
|
||||||
self.logger.log('TOC processing.')
|
self.logger.log('TOC processing.')
|
||||||
self.href2subchapter_ids = defaultdict(list)
|
|
||||||
self.added_to_toc_hrefs = set()
|
|
||||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points
|
|
||||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||||
# build simple toc from spine if needed
|
# build simple toc from spine if needed
|
||||||
if not self.is_toc_valid():
|
if self.is_toc_empty():
|
||||||
self.build_adjacency_list_from_spine()
|
self.build_adjacency_list_from_spine()
|
||||||
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
|
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
|
||||||
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
||||||
self.add_not_added_files_to_adjacency_list(not_added)
|
self.add_not_added_files_to_adjacency_list(not_added)
|
||||||
|
self.logger.log(f'Html internal links and structure processing.')
|
||||||
|
self.label_chapters_ids_with_tmp_id()
|
||||||
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
|
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
|
||||||
self.process_internal_links()
|
self.process_internal_links()
|
||||||
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
|
self.logger.log(f'Building chapters content.')
|
||||||
self.define_chapters_content()
|
self.define_chapters_content()
|
||||||
|
|
||||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||||
@@ -142,7 +155,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
def build_adjacency_list_from_toc(self, element, lvl=0):
|
||||||
"""
|
"""
|
||||||
self.adjacency_list builds based on TOC nested structure
|
self.adjacency_list is built from the nested TOC structure obtained from self.ebooklib_book.toc
|
||||||
|
|
||||||
key = -1 if root, value = None if leaf
|
key = -1 if root, value = None if leaf
|
||||||
|
|
||||||
@@ -186,10 +199,10 @@ class EpubConverter:
|
|||||||
else:
|
else:
|
||||||
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
|
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
|
||||||
|
|
||||||
def is_toc_valid(self):
|
def is_toc_empty(self):
|
||||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||||
return False
|
return True
|
||||||
return True
|
return False
|
||||||
|
|
||||||
def build_adjacency_list_from_spine(self):
|
def build_adjacency_list_from_spine(self):
|
||||||
manifest_id2href = self.build_manifest_id2href()
|
manifest_id2href = self.build_manifest_id2href()
|
||||||
@@ -207,8 +220,7 @@ class EpubConverter:
|
|||||||
self.adjacency_list[-1].append(nav_point)
|
self.adjacency_list[-1].append(nav_point)
|
||||||
self.added_to_toc_hrefs.add(file)
|
self.added_to_toc_hrefs.add(file)
|
||||||
|
|
||||||
def process_html_soup_structure_to_line(self):
|
def label_chapters_ids_with_tmp_id(self):
|
||||||
# mark
|
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
ids = self.href2subchapter_ids[href]
|
ids = self.href2subchapter_ids[href]
|
||||||
for i in ids:
|
for i in ids:
|
||||||
@@ -219,6 +231,7 @@ class EpubConverter:
|
|||||||
new_h.attrs['id'] = i
|
new_h.attrs['id'] = i
|
||||||
tag.insert_before(new_h)
|
tag.insert_before(new_h)
|
||||||
|
|
||||||
|
def process_html_soup_structure_to_line(self):
|
||||||
# go to line structure
|
# go to line structure
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
soup = self.href2soup_html[href]
|
soup = self.href2soup_html[href]
|
||||||
@@ -236,7 +249,7 @@ class EpubConverter:
|
|||||||
new_anchor_span.string = "\xa0"
|
new_anchor_span.string = "\xa0"
|
||||||
return new_anchor_span
|
return new_anchor_span
|
||||||
|
|
||||||
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||||
"""
|
"""
|
||||||
TOC: a/b/c.xhtml
|
TOC: a/b/c.xhtml
|
||||||
|
|
||||||
@@ -285,7 +298,7 @@ class EpubConverter:
|
|||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||||
a_tag_href = internal_link_tag.attrs['href']
|
a_tag_href = internal_link_tag.attrs['href']
|
||||||
# find full path
|
# find full path
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||||
if not a_tag_href_matched_to_toc:
|
if not a_tag_href_matched_to_toc:
|
||||||
continue
|
continue
|
||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
||||||
@@ -306,7 +319,8 @@ class EpubConverter:
|
|||||||
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
||||||
# find full path
|
# find full path
|
||||||
if a_tag_href:
|
if a_tag_href:
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
|
||||||
|
internal_link_tag)
|
||||||
else:
|
else:
|
||||||
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
||||||
if not a_tag_href_matched_to_toc:
|
if not a_tag_href_matched_to_toc:
|
||||||
@@ -367,9 +381,9 @@ class EpubConverter:
|
|||||||
self.build_one_chapter(sub_node)
|
self.build_one_chapter(sub_node)
|
||||||
|
|
||||||
def define_chapters_content(self):
|
def define_chapters_content(self):
|
||||||
nav_points = self.adjacency_list[-1]
|
top_level_nav_points = self.adjacency_list[-1]
|
||||||
if self.id_anchor_exist_in_nav_points:
|
if self.id_anchor_exist_in_nav_points:
|
||||||
for point in nav_points:
|
for point in top_level_nav_points:
|
||||||
self.build_one_chapter(point)
|
self.build_one_chapter(point)
|
||||||
|
|
||||||
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||||
|
|||||||
Reference in New Issue
Block a user