forked from LiveCarta/BookConverter
Add htm support in processing anchors
This commit is contained in:
@@ -254,18 +254,18 @@ class EpubConverter:
|
||||
self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
|
||||
|
||||
@staticmethod
|
||||
def _create_unique_id(href, id_):
|
||||
def create_unique_id(href, id_):
|
||||
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
|
||||
|
||||
@staticmethod
|
||||
def _create_new_anchor_span(soup, id_):
|
||||
def create_new_anchor_span(soup, id_):
|
||||
new_anchor_span = soup.new_tag("span")
|
||||
new_anchor_span.attrs['id'] = id_
|
||||
new_anchor_span.attrs['class'] = 'link-anchor'
|
||||
new_anchor_span.string = "\xa0"
|
||||
return new_anchor_span
|
||||
|
||||
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||
"""
|
||||
TOC: a/b/c.xhtml
|
||||
|
||||
@@ -304,44 +304,44 @@ class EpubConverter:
|
||||
if tag.attrs.get('class') == 'footnote-element':
|
||||
continue
|
||||
|
||||
new_id = self._create_unique_id(toc_href, tag.attrs['id'])
|
||||
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
|
||||
tag.attrs['id'] = new_id
|
||||
|
||||
# 2.a) process anchor which is a whole xhtml file
|
||||
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
|
||||
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)')
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
soup = self.html_href2html_body_soup[toc_href]
|
||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||
a_tag_href = internal_link_tag.attrs['href']
|
||||
# find full path
|
||||
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||
if not a_tag_href_matched_to_toc:
|
||||
continue
|
||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
||||
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
|
||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||
if new_id not in self.internal_anchors:
|
||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||
new_anchor_span = self._create_new_anchor_span(soup, new_id)
|
||||
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
|
||||
self.internal_anchors.add(new_id)
|
||||
|
||||
del internal_link_tag.attrs['href']
|
||||
|
||||
# 2.b) process anchor which is a an element in xhtml file
|
||||
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
|
||||
# 2.b) process anchor which is an element in xhtml file
|
||||
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
soup = self.html_href2html_body_soup[toc_href]
|
||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
||||
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
||||
# find full path
|
||||
if a_tag_href:
|
||||
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
|
||||
internal_link_tag)
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
|
||||
internal_link_tag)
|
||||
else:
|
||||
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
||||
if not a_tag_href_matched_to_toc:
|
||||
continue
|
||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
||||
new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
||||
|
||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
||||
@@ -359,7 +359,7 @@ class EpubConverter:
|
||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||
# create span to have cyclic links, link has 1 type of class, anchor another
|
||||
if anchor_tag.attrs['id'] not in self.internal_anchors:
|
||||
new_anchor_span = self._create_new_anchor_span(soup, new_id)
|
||||
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||
anchor_tag.insert_before(new_anchor_span)
|
||||
self.internal_anchors.add(new_id)
|
||||
del anchor_tag.attrs['id']
|
||||
@@ -402,7 +402,7 @@ class EpubConverter:
|
||||
for point in top_level_nav_points:
|
||||
self.build_one_chapter(point)
|
||||
|
||||
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
title = nav_point.title
|
||||
if nav_point.id:
|
||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
|
||||
@@ -423,7 +423,7 @@ class EpubConverter:
|
||||
# warning! items that are not EpubHtmlItems won't be added to the chapter
|
||||
if self.adjacency_list.get(nav_point):
|
||||
for sub_node in self.adjacency_list[nav_point]:
|
||||
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1)
|
||||
sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
|
||||
sub_nodes.append(sub_chapter_item)
|
||||
|
||||
if self.logger:
|
||||
@@ -436,7 +436,7 @@ class EpubConverter:
|
||||
top_level_chapters = []
|
||||
|
||||
for nav_point in top_level_nav_points:
|
||||
chapter = self.node2livecarta_chapter_item(nav_point)
|
||||
chapter = self.node_to_livecarta_chapter_item(nav_point)
|
||||
top_level_chapters.append(chapter)
|
||||
|
||||
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
|
||||
@@ -458,7 +458,7 @@ if __name__ == "__main__":
|
||||
|
||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||
|
||||
json_converter = EpubConverter('../../epub/9781634252221.epub',
|
||||
json_converter = EpubConverter('../../epub/Cook.epub',
|
||||
logger=logger_object)
|
||||
tmp = json_converter.convert_to_dict()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user