diff --git a/src/epub_converter.py b/src/epub_converter.py index 3b63394..e591138 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -113,9 +113,16 @@ class EpubBook: self.book_api_wrapper.set_process_status() self.logger_object.log('Beginning of processing json output.') - json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object) - content_dict = json_converter.convert_to_dict() - self.book_api_wrapper.set_generate_status() - self.write_to_json(content_dict) - self.send_json_content(content_dict) - self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') + try: + json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object) + content_dict = json_converter.convert_to_dict() + self.book_api_wrapper.set_generate_status() + self.write_to_json(content_dict) + self.send_json_content(content_dict) + self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') + + except Exception as exc: + self.logger_object.log('Error has occurred while conversion.', logging.ERROR) + self.logger_object.log_error_to_main_log(str(exc)) + self.book_api_wrapper.set_error_status() + raise exc diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index c0d720c..27fed25 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -25,7 +25,7 @@ class EpubPostprocessor: self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib - self.internal_links_found = 0 + self.internal_anchors = set() self.logger.log('Image processing.') self.href2img_bytes = {} self.old_image_path2_aws_path = {} @@ -196,7 +196,7 @@ class EpubPostprocessor: @staticmethod def _create_unique_id(href, id_): - return re.sub(r'([^\w\s])|_|-', '', href) + id_ + return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) def process_internal_links(self): # rebuild ids to be unique in all documents @@ -235,15 +235,29 @@ class EpubPostprocessor: anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) if anchor_tags: if len(anchor_tags) > 1: - self.logger.log(f'Warning in {href}: multiple anchors: {anchor_tags} found.' + self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.' f' While processing {internal_link_tag}') anchor_tag = anchor_tags[0] # if anchor is found we could add placeholder for link creation on server side. internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' - anchor_tag.attrs['class'] = 'link-anchor' + # create span to have cyclic links, link has 1 type of class, anchor another + if anchor_tag.name == 'a': + new_anchor_already_created = soup.find_all('span', + attrs={'class': 'link-anchor', + 'id': anchor_tag.attrs['id']}) + if not new_anchor_already_created: + new_anchor_span = soup.new_tag("span") + new_anchor_span.attrs['id'] = anchor_tag.attrs['id'] + new_anchor_span.attrs['class'] = 'link-anchor' + anchor_tag.insert(0, new_anchor_span) + del anchor_tag.attrs['id'] + self.internal_anchors.add(new_anchor_span) + else: + self.internal_anchors.add(anchor_tag) + anchor_tag.attrs['class'] = 'link-anchor' + del internal_link_tag.attrs['href'] - self.internal_links_found += 1 else: internal_link_tag.attrs['converter-mark'] = 'bad-link' @@ -325,7 +339,7 @@ class EpubPostprocessor: top_level_chapters.append(chapter) top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] - self.logger.log(f'Internal links found: {self.internal_links_found}.') + self.logger.log(f'Anchors found: {len(self.internal_anchors)}.') self.logger.log('End conversion.') return { diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 9b813b4..d226fe5 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -241,6 +241,8 @@ def unwrap_structural_tags(body_tag): # should be before other tags processing, not to remove converter empty tags with id # not all cases, if span has

s and NavigableString, it won't unwrap for s in body_tag.find_all("span"): + if s.attrs.get('epub:type') == 'pagebreak': + continue if s.contents: is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents] if all(is_not_struct_tag):