epub converter: add logging, fix image processing

2021-05-20 19:03:05 +03:00
parent b472c5b9f7
commit 0ac20999b5
3 changed files with 62 additions and 18 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -27,7 +27,11 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
    return link


-def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
+def update_src_links_in_images(body_tag: Tag,
+                               href2img_content: dict,
+                               path_to_html,
+                               access=None,
+                               path2aws_path=None):
    img_tags = body_tag.find_all('img')

    for img in img_tags:
@@ -40,12 +44,18 @@ def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_ht

        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
-            new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
+            if path_to_img_from_root in path2aws_path:
+                new_folder = path2aws_path[path_to_img_from_root]
+            else:
+                new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
+                path2aws_path[path_to_img_from_root] = new_folder
        else:
            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')

        img.attrs['src'] = str(new_folder)

+    return path2aws_path
+

 def preprocess_figure():
    pass
@@ -196,7 +206,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
        if not file:
            target_html_tag = source_html_tag
        else:
-            target_html_tag = href2soup_html[file]
+            target_html_tag = href2soup_html.get(file)
+            if not target_html_tag:
+                print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.')
+                continue

        possible_footnote = 'note|footnote|endnote|rearenote'
        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
@@ -250,6 +263,9 @@ def unwrap_structural_tags(body_tag):
    for s in body_tag.find_all("html"):
        s.unwrap()

+    for s in body_tag.find_all("header"):
+        s.name = 'span'
+
    # not all cases, if span has <p>s and NavigableString, it won't unwrap
    for s in body_tag.find_all("span"):
        if s.contents: