forked from LiveCarta/BookConverter
epub converter: fix path in internal links processing
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
from os.path import dirname, normpath, join
|
from os.path import dirname, normpath, join
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -224,41 +225,54 @@ class EpubPostprocessor:
|
|||||||
def _create_unique_id(href, id_):
|
def _create_unique_id(href, id_):
|
||||||
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
|
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
|
||||||
|
|
||||||
|
def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag):
    """
    Resolve the file part of an internal link to a path from ``self.added_to_toc_hrefs``.

    The link is resolved relative to the directory of the document that
    contains it, then matched against the TOC paths:

        TOC: a/b/c.xhtml
        (in a/b/x.xhtml)  c.xhtml     -> a/b/c.xhtml
        (in b/x.xhtml)    c.xhtml     -> a/b/c.xhtml   (TOC path has an extra prefix)
        (in a/b/d/x.xhtml) ../c.xhtml -> a/b/c.xhtml

    A TOC path matches when it equals the resolved path or ends with it on a
    path-segment boundary, so ``1.xhtml`` does not match ``ch1.xhtml``.
    An exact match always wins over a longer path that merely ends with it.

    :param href: path of the document currently being processed (holds the link)
    :param href_in_link: file part of the link's ``href`` attribute, without fragment
    :param internal_link_tag: the link tag being processed; on failure its
                              ``attrs['converter-mark']`` is set to ``'bad-link'``
    :return: matched path from ``self.added_to_toc_hrefs`` or ``None`` if nothing matched
    """
    # EPUB hrefs are URL-style and always use '/', so resolve them with
    # posixpath: os.path would produce '\\' on Windows and never match.
    import posixpath

    # Same-document link: callers pass the current document path itself when
    # the href is only '#fragment'. Joining it onto its own directory would
    # double the directory part and wrongly mark the link as bad.
    if href_in_link == href and href in self.added_to_toc_hrefs:
        return href

    dir_name = posixpath.dirname(href)
    normed_path = posixpath.normpath(posixpath.join(dir_name, href_in_link))

    # Match whole path segments only (exact path or '/'-bounded suffix).
    suffix = '/' + normed_path
    full_path = [path for path in self.added_to_toc_hrefs
                 if path == normed_path or path.endswith(suffix)]
    if not full_path:
        self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. '
                        f'While processing href in {internal_link_tag}.')
        internal_link_tag.attrs['converter-mark'] = 'bad-link'
        return None

    # An exact hit is unambiguous even if longer paths share the same suffix.
    if normed_path in full_path:
        return normed_path

    if len(full_path) > 1:
        self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
                        f' while {internal_link_tag} processing. The first one will be chosen.')

    return full_path[0]
|
||||||
|
|
||||||
def process_internal_links(self):
|
def process_internal_links(self):
|
||||||
# rebuild ids to be unique in all documents
|
# rebuild ids to be unique in all documents
|
||||||
for href in self.added_to_toc_hrefs:
|
for toc_href in self.added_to_toc_hrefs:
|
||||||
for tag in self.href2soup_html[href].find_all(attrs={'id': re.compile(r'.+')}):
|
for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
|
||||||
if tag.attrs.get('class') == 'converter-chapter-mark':
|
if tag.attrs.get('class') == 'converter-chapter-mark':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if tag.attrs.get('class') == 'footnote-element':
|
if tag.attrs.get('class') == 'footnote-element':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_id = self._create_unique_id(href, tag.attrs['id'])
|
new_id = self._create_unique_id(toc_href, tag.attrs['id'])
|
||||||
tag.attrs['id'] = new_id
|
tag.attrs['id'] = new_id
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------------
|
||||||
internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)')
|
internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)') # anchor is a whole xhtml file
|
||||||
for href in self.added_to_toc_hrefs:
|
for toc_href in self.added_to_toc_hrefs:
|
||||||
soup = self.href2soup_html[href]
|
soup = self.href2soup_html[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||||
href_in_link = internal_link_tag.attrs['href']
|
a_tag_href = internal_link_tag.attrs['href']
|
||||||
full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path]
|
# find full path
|
||||||
if not full_path:
|
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||||
self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
|
if not a_tag_href_matched_to_toc:
|
||||||
f'While processing href in {internal_link_tag}.')
|
|
||||||
continue
|
continue
|
||||||
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
||||||
href_in_link = full_path[0]
|
|
||||||
new_id = self._create_unique_id(href_in_link, '')
|
|
||||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||||
if new_id not in self.internal_anchors:
|
if new_id not in self.internal_anchors:
|
||||||
anchor_soup = self.href2soup_html[href_in_link]
|
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
|
||||||
new_anchor_span = soup.new_tag("span")
|
new_anchor_span = soup.new_tag("span")
|
||||||
new_anchor_span.attrs['id'] = new_id
|
new_anchor_span.attrs['id'] = new_id
|
||||||
new_anchor_span.attrs['class'] = 'link-anchor'
|
new_anchor_span.attrs['class'] = 'link-anchor'
|
||||||
new_anchor_span.string = "\xa0"
|
new_anchor_span.string = "\xa0"
|
||||||
anchor_soup.insert(0, new_anchor_span)
|
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
|
||||||
self.internal_anchors.add(new_id)
|
self.internal_anchors.add(new_id)
|
||||||
|
|
||||||
del internal_link_tag.attrs['href']
|
del internal_link_tag.attrs['href']
|
||||||
@@ -266,43 +280,30 @@ class EpubPostprocessor:
|
|||||||
# ------------------------------------------------------------------------
|
# ------------------------------------------------------------------------
|
||||||
# write placeholder to all internal links
|
# write placeholder to all internal links
|
||||||
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
|
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
|
||||||
for href in self.added_to_toc_hrefs:
|
for toc_href in self.added_to_toc_hrefs:
|
||||||
soup = self.href2soup_html[href]
|
soup = self.href2soup_html[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
||||||
href_in_link, id_in_link = internal_link_tag.attrs['href'].split('#')
|
a_tag_href, id_in_link = internal_link_tag.attrs['href'].split('#')
|
||||||
if not href_in_link:
|
a_tag_href = a_tag_href or toc_href
|
||||||
href_in_link = href
|
|
||||||
# find full path
|
# find full path
|
||||||
full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path]
|
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||||
if not full_path:
|
if not a_tag_href_matched_to_toc:
|
||||||
self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
|
|
||||||
f'While processing href in {internal_link_tag}.')
|
|
||||||
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
|
||||||
continue
|
continue
|
||||||
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, id_in_link)
|
||||||
|
|
||||||
if len(full_path) > 1:
|
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
|
||||||
self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
|
|
||||||
f' while {internal_link_tag} processing. The first one will be chosen.')
|
|
||||||
|
|
||||||
href_in_link = full_path[0]
|
|
||||||
new_id = self._create_unique_id(href_in_link, id_in_link)
|
|
||||||
|
|
||||||
anchor_soup = self.href2soup_html[href_in_link]
|
|
||||||
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
||||||
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link})
|
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link})
|
||||||
|
|
||||||
if anchor_tags:
|
if anchor_tags:
|
||||||
if len(anchor_tags) > 1:
|
if len(anchor_tags) > 1:
|
||||||
self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.'
|
self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.'
|
||||||
f' While processing {internal_link_tag}')
|
f' While processing {internal_link_tag}')
|
||||||
|
|
||||||
anchor_tag = anchor_tags[0]
|
anchor_tag = anchor_tags[0]
|
||||||
# if anchor is found we could add placeholder for link creation on server side.
|
# if anchor is found we could add placeholder for link creation on server side.
|
||||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||||
# create span to have cyclic links, link has 1 type of class, anchor another
|
# create span to have cyclic links, link has 1 type of class, anchor another
|
||||||
new_anchor_already_created = soup.find_all('span',
|
|
||||||
attrs={'class': 'link-anchor',
|
|
||||||
'id': anchor_tag.attrs['id']})
|
|
||||||
if anchor_tag.attrs['id'] not in self.internal_anchors:
|
if anchor_tag.attrs['id'] not in self.internal_anchors:
|
||||||
new_anchor_span = soup.new_tag("span")
|
new_anchor_span = soup.new_tag("span")
|
||||||
new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
|
new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
|
||||||
@@ -315,8 +316,8 @@ class EpubPostprocessor:
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
||||||
self.logger.log(f'Error in {href}. While processing {internal_link_tag} no anchor found.'
|
self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
|
||||||
f' Should be anchor with new id={new_id} in {href_in_link} file.'
|
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
||||||
f' Old id={id_in_link}')
|
f' Old id={id_in_link}')
|
||||||
|
|
||||||
def build_one_anchored_section(self, node):
|
def build_one_anchored_section(self, node):
|
||||||
|
|||||||
Reference in New Issue
Block a user