From 240e1e30fffcdf8cc9f5ad219e71ff1c81e3a5f6 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 30 Jul 2021 16:07:38 +0300 Subject: [PATCH] epub converter: fix path in internal links processing --- src/epub_postprocessor.py | 81 ++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 409ceb9..f4bce6c 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -1,6 +1,7 @@ import codecs import json import logging +import os import re from os.path import dirname, normpath, join from collections import defaultdict @@ -224,41 +225,54 @@ class EpubPostprocessor: def _create_unique_id(href, id_): return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) + def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag): + dir_name = os.path.dirname(href) + normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)) + full_path = [path for path in self.added_to_toc_hrefs if normed_path in path] + if not full_path: + self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. ' + f'While processing href in {internal_link_tag}.') + internal_link_tag.attrs['converter-mark'] = 'bad-link' + return None + + if len(full_path) > 1: + self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}' + f' while {internal_link_tag} processing. The first one will be chosen.') + + return full_path[0] + def process_internal_links(self): # rebuild ids to be unique in all documents - for href in self.added_to_toc_hrefs: - for tag in self.href2soup_html[href].find_all(attrs={'id': re.compile(r'.+')}): + for toc_href in self.added_to_toc_hrefs: + for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}): if tag.attrs.get('class') == 'converter-chapter-mark': continue if tag.attrs.get('class') == 'footnote-element': continue - new_id = self._create_unique_id(href, tag.attrs['id']) + new_id = self._create_unique_id(toc_href, tag.attrs['id']) tag.attrs['id'] = new_id # --------------------------------------------------------------------------------- - internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)') - for href in self.added_to_toc_hrefs: - soup = self.href2soup_html[href] + internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)') # anchor is a whole xhtml file + for toc_href in self.added_to_toc_hrefs: + soup = self.href2soup_html[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): - href_in_link = internal_link_tag.attrs['href'] - full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path] - if not full_path: - self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. ' - f'While processing href in {internal_link_tag}.') + a_tag_href = internal_link_tag.attrs['href'] + # find full path + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) + if not a_tag_href_matched_to_toc: continue - - href_in_link = full_path[0] - new_id = self._create_unique_id(href_in_link, '') + new_id = self._create_unique_id(a_tag_href_matched_to_toc, '') internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' if new_id not in self.internal_anchors: - anchor_soup = self.href2soup_html[href_in_link] + anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] new_anchor_span = soup.new_tag("span") new_anchor_span.attrs['id'] = new_id new_anchor_span.attrs['class'] = 'link-anchor' new_anchor_span.string = "\xa0" - anchor_soup.insert(0, new_anchor_span) + anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file self.internal_anchors.add(new_id) del internal_link_tag.attrs['href'] @@ -266,43 +280,30 @@ class EpubPostprocessor: # ------------------------------------------------------------------------ # write placeholder to all internal links internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') - for href in self.added_to_toc_hrefs: - soup = self.href2soup_html[href] + for toc_href in self.added_to_toc_hrefs: + soup = self.href2soup_html[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): - href_in_link, id_in_link = internal_link_tag.attrs['href'].split('#') - if not href_in_link: - href_in_link = href + a_tag_href, id_in_link = internal_link_tag.attrs['href'].split('#') + a_tag_href = a_tag_href or toc_href # find full path - full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path] - if not full_path: - self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. ' - f'While processing href in {internal_link_tag}.') - internal_link_tag.attrs['converter-mark'] = 'bad-link' + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) + if not a_tag_href_matched_to_toc: continue + new_id = self._create_unique_id(a_tag_href_matched_to_toc, id_in_link) - if len(full_path) > 1: - self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}' - f' while {internal_link_tag} processing. The first one will be chosen.') - - href_in_link = full_path[0] - new_id = self._create_unique_id(href_in_link, id_in_link) - - anchor_soup = self.href2soup_html[href_in_link] + anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link}) if anchor_tags: if len(anchor_tags) > 1: - self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.' + self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.' f' While processing {internal_link_tag}') anchor_tag = anchor_tags[0] # if anchor is found we could add placeholder for link creation on server side. internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' # create span to have cyclic links, link has 1 type of class, anchor another - new_anchor_already_created = soup.find_all('span', - attrs={'class': 'link-anchor', - 'id': anchor_tag.attrs['id']}) if anchor_tag.attrs['id'] not in self.internal_anchors: new_anchor_span = soup.new_tag("span") new_anchor_span.attrs['id'] = anchor_tag.attrs['id'] @@ -315,8 +316,8 @@ class EpubPostprocessor: else: internal_link_tag.attrs['converter-mark'] = 'bad-link' - self.logger.log(f'Error in {href}. While processing {internal_link_tag} no anchor found.' - f' Should be anchor with new id={new_id} in {href_in_link} file.' + self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.' + f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Old id={id_in_link}') def build_one_anchored_section(self, node):