epub converter: add internal links processing2

This commit is contained in:
shirshasa
2021-05-26 08:15:56 +03:00
parent 3eac136e07
commit 836e2dbc11
3 changed files with 35 additions and 12 deletions

View File

@@ -25,7 +25,7 @@ class EpubPostprocessor:
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
self.internal_links_found = 0
self.internal_anchors = set()
self.logger.log('Image processing.')
self.href2img_bytes = {}
self.old_image_path2_aws_path = {}
@@ -196,7 +196,7 @@ class EpubPostprocessor:
@staticmethod
def _create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + id_
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
def process_internal_links(self):
# rebuild ids to be unique in all documents
@@ -235,15 +235,29 @@ class EpubPostprocessor:
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f'Warning in {href}: multiple anchors: {anchor_tags} found.'
self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.'
f' While processing {internal_link_tag}')
anchor_tag = anchor_tags[0]
# If the anchor is found, add a placeholder so the link can be created on the server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
anchor_tag.attrs['class'] = 'link-anchor'
# Create a span so that links can be cyclic: the link has one type of class, the anchor another.
if anchor_tag.name == 'a':
new_anchor_already_created = soup.find_all('span',
attrs={'class': 'link-anchor',
'id': anchor_tag.attrs['id']})
if not new_anchor_already_created:
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
new_anchor_span.attrs['class'] = 'link-anchor'
anchor_tag.insert(0, new_anchor_span)
del anchor_tag.attrs['id']
self.internal_anchors.add(new_anchor_span)
else:
self.internal_anchors.add(anchor_tag)
anchor_tag.attrs['class'] = 'link-anchor'
del internal_link_tag.attrs['href']
self.internal_links_found += 1
else:
internal_link_tag.attrs['converter-mark'] = 'bad-link'
@@ -325,7 +339,7 @@ class EpubPostprocessor:
top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
self.logger.log(f'Internal links found: {self.internal_links_found}.')
self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
self.logger.log('End conversion.')
return {