forked from LiveCarta/BookConverter
epub converter: add logging, fix image processing
This commit is contained in:
@@ -27,7 +27,11 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
|
||||
return link
|
||||
|
||||
|
||||
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
|
||||
def update_src_links_in_images(body_tag: Tag,
|
||||
href2img_content: dict,
|
||||
path_to_html,
|
||||
access=None,
|
||||
path2aws_path=None):
|
||||
img_tags = body_tag.find_all('img')
|
||||
|
||||
for img in img_tags:
|
||||
@@ -40,12 +44,18 @@ def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_ht
|
||||
|
||||
img_content = href2img_content[path_to_img_from_root]
|
||||
if access is not None:
|
||||
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
|
||||
if path_to_img_from_root in path2aws_path:
|
||||
new_folder = path2aws_path[path_to_img_from_root]
|
||||
else:
|
||||
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
|
||||
path2aws_path[path_to_img_from_root] = new_folder
|
||||
else:
|
||||
new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
|
||||
|
||||
img.attrs['src'] = str(new_folder)
|
||||
|
||||
return path2aws_path
|
||||
|
||||
|
||||
def preprocess_figure():
|
||||
pass
|
||||
@@ -196,7 +206,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
if not file:
|
||||
target_html_tag = source_html_tag
|
||||
else:
|
||||
target_html_tag = href2soup_html[file]
|
||||
target_html_tag = href2soup_html.get(file)
|
||||
if not target_html_tag:
|
||||
print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.')
|
||||
continue
|
||||
|
||||
possible_footnote = 'note|footnote|endnote|rearenote'
|
||||
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
|
||||
@@ -250,6 +263,9 @@ def unwrap_structural_tags(body_tag):
|
||||
for s in body_tag.find_all("html"):
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("header"):
|
||||
s.name = 'span'
|
||||
|
||||
# Does not handle every case: a span that contains both <p> tags and NavigableString children won't be unwrapped.
|
||||
for s in body_tag.find_all("span"):
|
||||
if s.contents:
|
||||
|
||||
Reference in New Issue
Block a user