epub converter: add internal links processing 7

- figure - links without id - img width, height
2021-05-28 15:41:33 +03:00
parent 515f1ef8bd
commit cbc163d973
2 changed files with 40 additions and 0 deletions
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -208,6 +208,34 @@ class EpubPostprocessor:
                new_id = self._create_unique_id(href, tag.attrs['id'])
                tag.attrs['id'] = new_id

+        # ---------------------------------------------------------------------------------
+        internal_link_reg = re.compile(r'(^.+\.(html|xhtml)$)')
+        for href in self.added_to_toc_hrefs:
+            soup = self.href2soup_html[href]
+            tags = soup.find_all('a', {'href': internal_link_reg})
+            for t in tags:
+                href_in_link = t.attrs['href']
+                full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path]
+                if not full_path:
+                    self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
+                                    f'While processing href in {t}.')
+                    continue
+
+                href_in_link = full_path[0]
+                new_id = self._create_unique_id(href_in_link, '')
+                t.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
+                if new_id not in self.internal_anchors:
+                    anchor_soup = self.href2soup_html[href_in_link]
+                    new_anchor_span = soup.new_tag("span")
+                    new_anchor_span.attrs['id'] = new_id
+                    new_anchor_span.attrs['class'] = 'link-anchor'
+                    new_anchor_span.string = "\xa0"
+                    anchor_soup.insert(0, new_anchor_span)
+                    self.internal_anchors.add(new_id)
+
+                del t.attrs['href']
+
+        # ------------------------------------------------------------------------
        # write placeholder to all internal links
        internal_link_reg = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
        for href in self.added_to_toc_hrefs:
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -53,6 +53,10 @@ def update_src_links_in_images(body_tag: Tag,
            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')

        img.attrs['src'] = str(new_folder)
+        if img.attrs.get('width'):
+            del img.attrs['width']
+        if img.attrs.get('height'):
+            del img.attrs['height']

    return path2aws_path

@@ -269,6 +273,14 @@ def unwrap_structural_tags(body_tag):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()

+    for s in body_tag.find_all("figure"):
+        s.name = 'p'
+        s.attrs['style'] = "text-align: center;"
+
+    for s in body_tag.find_all("figcaption"):
+        _add_span_to_save_ids_for_links(s)
+        s.unwrap()
+
    for s in body_tag.find_all("aside"):
        s.name = 'blockquote'