epub converter: add logging, fix image processing

This commit is contained in:
shirshasa
2021-05-20 19:03:05 +03:00
parent b472c5b9f7
commit 0ac20999b5
3 changed files with 62 additions and 18 deletions

View File

@@ -113,7 +113,7 @@ class EpubBook:
self.book_api_wrapper.set_process_status() self.book_api_wrapper.set_process_status()
self.logger_object.log('Beginning of processing json output.') self.logger_object.log('Beginning of processing json output.')
json_converter = EpubPostprocessor(self.epub_path, self.access) json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
self.book_api_wrapper.set_generate_status() self.book_api_wrapper.set_generate_status()
self.write_to_json(content_dict) self.write_to_json(content_dict)

View File

@@ -1,5 +1,6 @@
import codecs import codecs
import json import json
import logging
from os.path import dirname, normpath, join from os.path import dirname, normpath, join
from collections import defaultdict from collections import defaultdict
from typing import Dict, Union from typing import Dict, Union
@@ -14,41 +15,56 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids,
update_src_links_in_images, preprocess_footnotes update_src_links_in_images, preprocess_footnotes
from css_reader import clean_css, add_inline_style_to_html_soup from css_reader import clean_css, add_inline_style_to_html_soup
from livecarta_config import LawCartaConfig from livecarta_config import LawCartaConfig, BookLogger
class EpubPostprocessor: class EpubPostprocessor:
def __init__(self, file, access=None): def __init__(self, file, access=None, logger=None):
self.file = file self.file = file
self.access = access self.access = access
self.logger = logger
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
# read images
self.logger.log('Image processing.')
self.href2img_bytes = {} self.href2img_bytes = {}
self.old_image_path2_aws_path = {}
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
file_name = x.file_name file_name = x.file_name
content = x.content content = x.content
# todo: check how the file path is computed in the lib # todo: check how the file path is computed in the lib
self.href2img_bytes[file_name] = content self.href2img_bytes[file_name] = content
# read html for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER):
file_name = x.file_name
content = x.content
self.href2img_bytes[file_name] = content
self.logger.log('HTML files reading.')
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
# read css
self.logger.log('CSS processing.')
self.html_href2css_href = {} self.html_href2css_href = {}
self.css_href2content = {} self.css_href2content = {}
self.build_css_content() self.build_css_content()
# add css # add css
self.add_css_styles2soup() # self.logger.log('CSS styles adding processing.')
# read footnotes # self.add_css_styles2soup()
self.logger.log('Footnotes processing.')
self.footnotes = [] self.footnotes = []
for href in self.href2soup_html: for href in self.href2soup_html:
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html)) self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
# read toc self.logger.log(f'Added {len(self.footnotes)} footnotes.')
self.logger.log('TOC processing.')
self.href2ids = defaultdict(list) self.href2ids = defaultdict(list)
self.added_to_toc_hrefs = []
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
if not self.is_toc_valid(): if not self.is_toc_valid():
self.build_adjacency_list_from_spine() self.build_adjacency_list_from_spine()
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
self.logger.log(f'html documents not added to TOC: {not_added}')
# read anchored blocks, split html into separate block # read anchored blocks, split html into separate block
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
@@ -115,6 +131,7 @@ class EpubPostprocessor:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.href2ids[node.href].append(node.id) self.href2ids[node.href].append(node.id)
self.adjacency_list[node] = None self.adjacency_list[node] = None
self.added_to_toc_hrefs.append(node.href)
return node return node
elif isinstance(element, tuple): elif isinstance(element, tuple):
@@ -130,6 +147,7 @@ class EpubPostprocessor:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[node] = sub_nodes self.adjacency_list[node] = sub_nodes
self.added_to_toc_hrefs.append(node.href)
return node return node
elif isinstance(element, list) and (lvl == 0): elif isinstance(element, list) and (lvl == 0):
@@ -155,6 +173,7 @@ class EpubPostprocessor:
for id_, _ in self.ebooklib_book.spine: for id_, _ in self.ebooklib_book.spine:
node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
self.adjacency_list[-1].append(node) self.adjacency_list[-1].append(node)
self.added_to_toc_hrefs.append(node.href)
def mark_and_line_href2soup_html(self): def mark_and_line_href2soup_html(self):
# mark # mark
@@ -202,8 +221,6 @@ class EpubPostprocessor:
for sub_node in self.adjacency_list[node]: for sub_node in self.adjacency_list[node]:
self.build_one_anchored_section(sub_node) self.build_one_anchored_section(sub_node)
# print(f'Chapter: {node.href, node.id} is split.')
def build_anchor2soup(self): def build_anchor2soup(self):
nav_points = self.adjacency_list[-1] nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
@@ -217,7 +234,11 @@ class EpubPostprocessor:
else: else:
content: BeautifulSoup = self.href2soup_html[node.href] content: BeautifulSoup = self.href2soup_html[node.href]
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access) self.old_image_path2_aws_path = update_src_links_in_images(content,
self.href2img_bytes,
path_to_html=node.href,
access=self.access,
path2aws_path=self.old_image_path2_aws_path)
is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content, title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
@@ -230,7 +251,8 @@ class EpubPostprocessor:
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1) sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1)
sub_nodes.append(sub_chapter_item) sub_nodes.append(sub_chapter_item)
# print(f'Chapter: {title} is prepared.') if self.logger:
self.logger.log(f'Chapter: {title} is prepared.')
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self): def convert_to_dict(self):
@@ -250,9 +272,15 @@ class EpubPostprocessor:
if __name__ == "__main__": if __name__ == "__main__":
json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/Chaos_Engineering.epub') logger = logging.getLogger('epub')
file_handler = logging.StreamHandler()
logger.addHandler(file_handler)
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781284171242.epub',
logger=logger_object)
tmp = json_converter.convert_to_dict() tmp = json_converter.convert_to_dict()
with codecs.open('tmp.json', 'w', encoding='utf-8') as f: with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False) json.dump(tmp, f, ensure_ascii=False)

View File

@@ -27,7 +27,11 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
return link return link
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None): def update_src_links_in_images(body_tag: Tag,
href2img_content: dict,
path_to_html,
access=None,
path2aws_path=None):
img_tags = body_tag.find_all('img') img_tags = body_tag.find_all('img')
for img in img_tags: for img in img_tags:
@@ -40,12 +44,18 @@ def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_ht
img_content = href2img_content[path_to_img_from_root] img_content = href2img_content[path_to_img_from_root]
if access is not None: if access is not None:
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]
else:
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id') new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
path2aws_path[path_to_img_from_root] = new_folder
else: else:
new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id') new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder) img.attrs['src'] = str(new_folder)
return path2aws_path
def preprocess_figure(): def preprocess_figure():
pass pass
@@ -196,7 +206,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
if not file: if not file:
target_html_tag = source_html_tag target_html_tag = source_html_tag
else: else:
target_html_tag = href2soup_html[file] target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.')
continue
possible_footnote = 'note|footnote|endnote|rearenote' possible_footnote = 'note|footnote|endnote|rearenote'
expected_footnote_tags = list(target_html_tag.find_all(id=element_id, expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
@@ -250,6 +263,9 @@ def unwrap_structural_tags(body_tag):
for s in body_tag.find_all("html"): for s in body_tag.find_all("html"):
s.unwrap() s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# does not cover all cases: a span that contains both <p> tags and a NavigableString won't be unwrapped # does not cover all cases: a span that contains both <p> tags and a NavigableString won't be unwrapped
for s in body_tag.find_all("span"): for s in body_tag.find_all("span"):
if s.contents: if s.contents: