forked from LiveCarta/BookConverter
epub converter: add internal links processing2
This commit is contained in:
@@ -113,9 +113,16 @@ class EpubBook:
|
|||||||
self.book_api_wrapper.set_process_status()
|
self.book_api_wrapper.set_process_status()
|
||||||
self.logger_object.log('Beginning of processing json output.')
|
self.logger_object.log('Beginning of processing json output.')
|
||||||
|
|
||||||
json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object)
|
try:
|
||||||
content_dict = json_converter.convert_to_dict()
|
json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object)
|
||||||
self.book_api_wrapper.set_generate_status()
|
content_dict = json_converter.convert_to_dict()
|
||||||
self.write_to_json(content_dict)
|
self.book_api_wrapper.set_generate_status()
|
||||||
self.send_json_content(content_dict)
|
self.write_to_json(content_dict)
|
||||||
self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
self.send_json_content(content_dict)
|
||||||
|
self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
|
||||||
|
self.logger_object.log_error_to_main_log(str(exc))
|
||||||
|
self.book_api_wrapper.set_error_status()
|
||||||
|
raise exc
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ class EpubPostprocessor:
|
|||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
||||||
self.internal_links_found = 0
|
self.internal_anchors = set()
|
||||||
self.logger.log('Image processing.')
|
self.logger.log('Image processing.')
|
||||||
self.href2img_bytes = {}
|
self.href2img_bytes = {}
|
||||||
self.old_image_path2_aws_path = {}
|
self.old_image_path2_aws_path = {}
|
||||||
@@ -196,7 +196,7 @@ class EpubPostprocessor:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_unique_id(href, id_):
|
def _create_unique_id(href, id_):
|
||||||
return re.sub(r'([^\w\s])|_|-', '', href) + id_
|
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
|
||||||
|
|
||||||
def process_internal_links(self):
|
def process_internal_links(self):
|
||||||
# rebuild ids to be unique in all documents
|
# rebuild ids to be unique in all documents
|
||||||
@@ -235,15 +235,29 @@ class EpubPostprocessor:
|
|||||||
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
||||||
if anchor_tags:
|
if anchor_tags:
|
||||||
if len(anchor_tags) > 1:
|
if len(anchor_tags) > 1:
|
||||||
self.logger.log(f'Warning in {href}: multiple anchors: {anchor_tags} found.'
|
self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.'
|
||||||
f' While processing {internal_link_tag}')
|
f' While processing {internal_link_tag}')
|
||||||
|
|
||||||
anchor_tag = anchor_tags[0]
|
anchor_tag = anchor_tags[0]
|
||||||
# if anchor is found we could add placeholder for link creation on server side.
|
# if anchor is found we could add placeholder for link creation on server side.
|
||||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||||
anchor_tag.attrs['class'] = 'link-anchor'
|
# create span to have cyclic links, link has 1 type of class, anchor another
|
||||||
|
if anchor_tag.name == 'a':
|
||||||
|
new_anchor_already_created = soup.find_all('span',
|
||||||
|
attrs={'class': 'link-anchor',
|
||||||
|
'id': anchor_tag.attrs['id']})
|
||||||
|
if not new_anchor_already_created:
|
||||||
|
new_anchor_span = soup.new_tag("span")
|
||||||
|
new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
|
||||||
|
new_anchor_span.attrs['class'] = 'link-anchor'
|
||||||
|
anchor_tag.insert(0, new_anchor_span)
|
||||||
|
del anchor_tag.attrs['id']
|
||||||
|
self.internal_anchors.add(new_anchor_span)
|
||||||
|
else:
|
||||||
|
self.internal_anchors.add(anchor_tag)
|
||||||
|
anchor_tag.attrs['class'] = 'link-anchor'
|
||||||
|
|
||||||
del internal_link_tag.attrs['href']
|
del internal_link_tag.attrs['href']
|
||||||
self.internal_links_found += 1
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
||||||
@@ -325,7 +339,7 @@ class EpubPostprocessor:
|
|||||||
top_level_chapters.append(chapter)
|
top_level_chapters.append(chapter)
|
||||||
|
|
||||||
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
|
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
|
||||||
self.logger.log(f'Internal links found: {self.internal_links_found}.')
|
self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
|
||||||
self.logger.log('End conversion.')
|
self.logger.log('End conversion.')
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -241,6 +241,8 @@ def unwrap_structural_tags(body_tag):
|
|||||||
# should be before other tags processing, not to remove converter empty tags with id
|
# should be before other tags processing, not to remove converter empty tags with id
|
||||||
# not all cases, if span has <p>s and NavigableString, it won't unwrap
|
# not all cases, if span has <p>s and NavigableString, it won't unwrap
|
||||||
for s in body_tag.find_all("span"):
|
for s in body_tag.find_all("span"):
|
||||||
|
if s.attrs.get('epub:type') == 'pagebreak':
|
||||||
|
continue
|
||||||
if s.contents:
|
if s.contents:
|
||||||
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
|
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
|
||||||
if all(is_not_struct_tag):
|
if all(is_not_struct_tag):
|
||||||
|
|||||||
Reference in New Issue
Block a user