From 240e1e30fffcdf8cc9f5ad219e71ff1c81e3a5f6 Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Fri, 30 Jul 2021 16:07:38 +0300
Subject: [PATCH] epub converter: fix path in internal links processing

---
 src/epub_postprocessor.py | 81 ++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py
index 409ceb9..f4bce6c 100644
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -1,6 +1,7 @@
 import codecs
 import json
 import logging
+import os
 import re
 from os.path import dirname, normpath, join
 from collections import defaultdict
@@ -224,41 +225,54 @@ class EpubPostprocessor:
     def _create_unique_id(href, id_):
         return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
 
+    def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag):
+        dir_name = os.path.dirname(href)
+        normed_path = os.path.normpath(os.path.join(dir_name, href_in_link))
+        full_path = [path for path in self.added_to_toc_hrefs if normed_path in path]
+        if not full_path:
+            self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. '
+                            f'While processing href in {internal_link_tag}.')
+            internal_link_tag.attrs['converter-mark'] = 'bad-link'
+            return None
+
+        if len(full_path) > 1:
+            self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
+                            f' while {internal_link_tag} processing. The first one will be chosen.')
+            
+        return full_path[0]
+
     def process_internal_links(self):
         # rebuild ids to be unique in all documents
-        for href in self.added_to_toc_hrefs:
-            for tag in self.href2soup_html[href].find_all(attrs={'id': re.compile(r'.+')}):
+        for toc_href in self.added_to_toc_hrefs:
+            for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
                 if tag.attrs.get('class') == 'converter-chapter-mark':
                     continue
 
                 if tag.attrs.get('class') == 'footnote-element':
                     continue
 
-                new_id = self._create_unique_id(href, tag.attrs['id'])
+                new_id = self._create_unique_id(toc_href, tag.attrs['id'])
                 tag.attrs['id'] = new_id
 
         # ---------------------------------------------------------------------------------
-        internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)')
-        for href in self.added_to_toc_hrefs:
-            soup = self.href2soup_html[href]
+        internal_link_reg1 = re.compile(r'(^.+\.(html|xhtml)$)')  # link targets a whole (x)html file, no #fragment
+        for toc_href in self.added_to_toc_hrefs:
+            soup = self.href2soup_html[toc_href]
             for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
-                href_in_link = internal_link_tag.attrs['href']
-                full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path]
-                if not full_path:
-                    self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
-                                    f'While processing href in {internal_link_tag}.')
+                a_tag_href = internal_link_tag.attrs['href']
+                # find full path
+                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
+                if not a_tag_href_matched_to_toc:
                     continue
-
-                href_in_link = full_path[0]
-                new_id = self._create_unique_id(href_in_link, '')
+                new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
                 internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                 if new_id not in self.internal_anchors:
-                    anchor_soup = self.href2soup_html[href_in_link]
+                    anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
                     new_anchor_span = soup.new_tag("span")
                     new_anchor_span.attrs['id'] = new_id
                     new_anchor_span.attrs['class'] = 'link-anchor'
                     new_anchor_span.string = "\xa0"
-                    anchor_soup.insert(0, new_anchor_span)
+                    anchor_soup.insert(0, new_anchor_span)  # insert the new span at the beginning of the target document
                     self.internal_anchors.add(new_id)
 
                 del internal_link_tag.attrs['href']
@@ -266,43 +280,30 @@ class EpubPostprocessor:
         # ------------------------------------------------------------------------
         # write placeholder to all internal links
         internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
-        for href in self.added_to_toc_hrefs:
-            soup = self.href2soup_html[href]
+        for toc_href in self.added_to_toc_hrefs:
+            soup = self.href2soup_html[toc_href]
             for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
-                href_in_link, id_in_link = internal_link_tag.attrs['href'].split('#')
-                if not href_in_link:
-                    href_in_link = href
+                a_tag_href, id_in_link = internal_link_tag.attrs['href'].split('#')
+                a_tag_href = a_tag_href or toc_href
                 # find full path
-                full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path]
-                if not full_path:
-                    self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
-                                    f'While processing href in {internal_link_tag}.')
-                    internal_link_tag.attrs['converter-mark'] = 'bad-link'
+                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
+                if not a_tag_href_matched_to_toc:
                     continue
+                new_id = self._create_unique_id(a_tag_href_matched_to_toc, id_in_link)
 
-                if len(full_path) > 1:
-                    self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
-                                    f' while {internal_link_tag} processing. The first one will be chosen.')
-
-                href_in_link = full_path[0]
-                new_id = self._create_unique_id(href_in_link, id_in_link)
-
-                anchor_soup = self.href2soup_html[href_in_link]
+                anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
                 anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
                 anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link})
 
                 if anchor_tags:
                     if len(anchor_tags) > 1:
-                        self.logger.log(f'Warning in {href}: multiple anchors: {len(anchor_tags)} found.'
+                        self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.'
                                         f' While processing {internal_link_tag}')
 
                     anchor_tag = anchor_tags[0]
                     # if anchor is found we could add placeholder for link creation on server side.
                     internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                     # create span to have cyclic links, link has 1 type of class, anchor another
-                    new_anchor_already_created = soup.find_all('span',
-                                                               attrs={'class': 'link-anchor',
-                                                                      'id': anchor_tag.attrs['id']})
                     if anchor_tag.attrs['id'] not in self.internal_anchors:
                         new_anchor_span = soup.new_tag("span")
                         new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
@@ -315,8 +316,8 @@ class EpubPostprocessor:
 
                 else:
                     internal_link_tag.attrs['converter-mark'] = 'bad-link'
-                    self.logger.log(f'Error in {href}. While processing {internal_link_tag} no anchor found.'
-                                    f' Should be anchor with new id={new_id} in {href_in_link} file.'
+                    self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
+                                    f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
                                     f' Old id={id_in_link}')
 
     def build_one_anchored_section(self, node):