diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index b57bddb..36578ed 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -208,6 +208,34 @@ class EpubPostprocessor: new_id = self._create_unique_id(href, tag.attrs['id']) tag.attrs['id'] = new_id + # --------------------------------------------------------------------------------- + internal_link_reg = re.compile(r'(^.+\.(html|xhtml)$)') + for href in self.added_to_toc_hrefs: + soup = self.href2soup_html[href] + tags = soup.find_all('a', {'href': internal_link_reg}) + for t in tags: + href_in_link = t.attrs['href'] + full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path] + if not full_path: + self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. ' + f'While processing href in {t}.') + continue + + href_in_link = full_path[0] + new_id = self._create_unique_id(href_in_link, '') + t.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' + if new_id not in self.internal_anchors: + anchor_soup = self.href2soup_html[href_in_link] + new_anchor_span = soup.new_tag("span") + new_anchor_span.attrs['id'] = new_id + new_anchor_span.attrs['class'] = 'link-anchor' + new_anchor_span.string = "\xa0" + anchor_soup.insert(0, new_anchor_span) + self.internal_anchors.add(new_id) + + del t.attrs['href'] + + # ------------------------------------------------------------------------ # write placeholder to all internal links internal_link_reg = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') for href in self.added_to_toc_hrefs: diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 744ed08..1338ee8 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -53,6 +53,10 @@ def update_src_links_in_images(body_tag: Tag, new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id') img.attrs['src'] = str(new_folder) + if img.attrs.get('width'): + del img.attrs['width'] + if img.attrs.get('height'): + del img.attrs['height'] return path2aws_path @@ -269,6 +273,14 @@ def unwrap_structural_tags(body_tag): _add_span_to_save_ids_for_links(s) s.unwrap() + for s in body_tag.find_all("figure"): + s.name = 'p' + s.attrs['style'] = "text-align: center;" + + for s in body_tag.find_all("figcaption"): + _add_span_to_save_ids_for_links(s) + s.unwrap() + for s in body_tag.find_all("aside"): s.name = 'blockquote'