epub converter: add logging, fix image processing

This commit is contained in:
shirshasa
2021-05-20 19:03:05 +03:00
parent b472c5b9f7
commit 0ac20999b5
3 changed files with 62 additions and 18 deletions

View File

@@ -27,7 +27,11 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
return link
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
def update_src_links_in_images(body_tag: Tag,
href2img_content: dict,
path_to_html,
access=None,
path2aws_path=None):
img_tags = body_tag.find_all('img')
for img in img_tags:
@@ -40,12 +44,18 @@ def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_ht
img_content = href2img_content[path_to_img_from_root]
if access is not None:
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]
else:
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
path2aws_path[path_to_img_from_root] = new_folder
else:
new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder)
return path2aws_path
def preprocess_figure():
    """No-op stub: takes no arguments, does nothing, and returns None."""
    return None
@@ -196,7 +206,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html[file]
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.')
continue
possible_footnote = 'note|footnote|endnote|rearenote'
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
@@ -250,6 +263,9 @@ def unwrap_structural_tags(body_tag):
for s in body_tag.find_all("html"):
s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# Does not cover every case: a span that contains both <p> tags and a NavigableString will not be unwrapped.
for s in body_tag.find_all("span"):
if s.contents: