epub converter: fix path in internal links processing

This commit is contained in:
shirshasa
2021-07-30 16:07:38 +03:00
parent 12baa07c0a
commit 240e1e30ff

View File

@@ -1,6 +1,7 @@
import codecs import codecs
import json import json
import logging import logging
import os
import re import re
from os.path import dirname, normpath, join from os.path import dirname, normpath, join
from collections import defaultdict from collections import defaultdict
@@ -224,41 +225,54 @@ class EpubPostprocessor:
def _create_unique_id(href, id_): def _create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag):
    """Resolve the target of an internal link to a path from ``self.added_to_toc_hrefs``.

    :param href: path of the document that contains the link.
    :param href_in_link: file part of the link's ``href`` attribute, relative to
        the directory of ``href``. Callers pass ``href`` itself for links that
        consist only of ``#id`` (same-document anchors).
    :param internal_link_tag: the ``<a>`` tag being processed; on failure it is
        marked with ``converter-mark="bad-link"``.
    :return: the matched entry of ``self.added_to_toc_hrefs``, or ``None`` when
        nothing matches (an error is logged in that case).
    """
    if href_in_link == href:
        # Same-document link: ``href`` is already a complete path. Joining it
        # with its own directory would duplicate the directory part
        # ("Text/ch1.xhtml" -> "Text/Text/ch1.xhtml") and never match.
        candidate = href
    else:
        candidate = os.path.join(os.path.dirname(href), href_in_link)
    normed_path = os.path.normpath(candidate)

    # Prefer an exact match so that "ch1.xhtml" is never resolved to an
    # unrelated entry such as "xch1.xhtml" that merely contains it.
    full_path = [path for path in self.added_to_toc_hrefs
                 if os.path.normpath(path) == normed_path]
    if not full_path:
        # Fall back to substring search for TOC entries that carry an extra
        # leading prefix the link does not know about.
        full_path = [path for path in self.added_to_toc_hrefs if normed_path in path]

    if not full_path:
        self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. '
                        f'While processing href in {internal_link_tag}.')
        internal_link_tag.attrs['converter-mark'] = 'bad-link'
        return None

    if len(full_path) > 1:
        self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
                        f' while {internal_link_tag} processing. The first one will be chosen.')

    return full_path[0]
def process_internal_links(self): def process_internal_links(self):
# rebuild ids to be unique in all documents # rebuild ids to be unique in all documents
for href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
for tag in self.href2soup_html[href].find_all(attrs={'id': re.compile(r'.+')}): for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
if tag.attrs.get('class') == 'converter-chapter-mark': if tag.attrs.get('class') == 'converter-chapter-mark':
continue continue
if tag.attrs.get('class') == 'footnote-element': if tag.attrs.get('class') == 'footnote-element':
continue continue
new_id = self._create_unique_id(href, tag.attrs['id']) new_id = self._create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id tag.attrs['id'] = new_id
# --------------------------------------------------------------------------------- # ---------------------------------------------------------------------------------
internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)') internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)') # anchor is a whole xhtml file
for href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
soup = self.href2soup_html[href] soup = self.href2soup_html[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
href_in_link = internal_link_tag.attrs['href'] a_tag_href = internal_link_tag.attrs['href']
full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path] # find full path
if not full_path: a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. ' if not a_tag_href_matched_to_toc:
f'While processing href in {internal_link_tag}.')
continue continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
href_in_link = full_path[0]
new_id = self._create_unique_id(href_in_link, '')
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
if new_id not in self.internal_anchors: if new_id not in self.internal_anchors:
anchor_soup = self.href2soup_html[href_in_link] anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
new_anchor_span = soup.new_tag("span") new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs['id'] = new_id new_anchor_span.attrs['id'] = new_id
new_anchor_span.attrs['class'] = 'link-anchor' new_anchor_span.attrs['class'] = 'link-anchor'
new_anchor_span.string = "\xa0" new_anchor_span.string = "\xa0"
anchor_soup.insert(0, new_anchor_span) anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
self.internal_anchors.add(new_id) self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href'] del internal_link_tag.attrs['href']
@@ -266,43 +280,30 @@ class EpubPostprocessor:
# ------------------------------------------------------------------------ # ------------------------------------------------------------------------
# write placeholder to all internal links # write placeholder to all internal links
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
for href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
soup = self.href2soup_html[href] soup = self.href2soup_html[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
href_in_link, id_in_link = internal_link_tag.attrs['href'].split('#') a_tag_href, id_in_link = internal_link_tag.attrs['href'].split('#')
if not href_in_link: a_tag_href = a_tag_href or toc_href
href_in_link = href
# find full path # find full path
full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path] a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
if not full_path: if not a_tag_href_matched_to_toc:
self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.')
internal_link_tag.attrs['converter-mark'] = 'bad-link'
continue continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, id_in_link)
if len(full_path) > 1: anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.')
href_in_link = full_path[0]
new_id = self._create_unique_id(href_in_link, id_in_link)
anchor_soup = self.href2soup_html[href_in_link]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link}) anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link})
if anchor_tags: if anchor_tags:
if len(anchor_tags) > 1: if len(anchor_tags) > 1:
self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.' self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.'
f' While processing {internal_link_tag}') f' While processing {internal_link_tag}')
anchor_tag = anchor_tags[0] anchor_tag = anchor_tags[0]
# if anchor is found we could add placeholder for link creation on server side. # if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
# create span to have cyclic links, link has 1 type of class, anchor another # create span to have cyclic links, link has 1 type of class, anchor another
new_anchor_already_created = soup.find_all('span',
attrs={'class': 'link-anchor',
'id': anchor_tag.attrs['id']})
if anchor_tag.attrs['id'] not in self.internal_anchors: if anchor_tag.attrs['id'] not in self.internal_anchors:
new_anchor_span = soup.new_tag("span") new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs['id'] = anchor_tag.attrs['id'] new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
@@ -315,8 +316,8 @@ class EpubPostprocessor:
else: else:
internal_link_tag.attrs['converter-mark'] = 'bad-link' internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f'Error in {href}. While processing {internal_link_tag} no anchor found.' self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
f' Should be anchor with new id={new_id} in {href_in_link} file.' f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={id_in_link}') f' Old id={id_in_link}')
def build_one_anchored_section(self, node): def build_one_anchored_section(self, node):