From 0ac20999b57d411b5bbd42e029b3bad652a9874b Mon Sep 17 00:00:00 2001 From: shirshasa Date: Thu, 20 May 2021 19:03:05 +0300 Subject: [PATCH] epub converter: add logging, fix image processing --- src/epub_converter.py | 2 +- src/epub_postprocessor.py | 56 ++++++++++++++++++++++++++--------- src/html_epub_preprocessor.py | 22 ++++++++++++-- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/src/epub_converter.py b/src/epub_converter.py index 79df20e..3b63394 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -113,7 +113,7 @@ class EpubBook: self.book_api_wrapper.set_process_status() self.logger_object.log('Beginning of processing json output.') - json_converter = EpubPostprocessor(self.epub_path, self.access) + json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object) content_dict = json_converter.convert_to_dict() self.book_api_wrapper.set_generate_status() self.write_to_json(content_dict) diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index ecef400..c433c0f 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -1,5 +1,6 @@ import codecs import json +import logging from os.path import dirname, normpath, join from collections import defaultdict from typing import Dict, Union @@ -14,41 +15,56 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, update_src_links_in_images, preprocess_footnotes from css_reader import clean_css, add_inline_style_to_html_soup -from livecarta_config import LawCartaConfig +from livecarta_config import LawCartaConfig, BookLogger class EpubPostprocessor: - def __init__(self, file, access=None): + def __init__(self, file, access=None, logger=None): self.file = file self.access = access + self.logger = logger self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib - # read images + + self.logger.log('Image processing.') self.href2img_bytes = {} + self.old_image_path2_aws_path = {} for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): file_name = x.file_name content = x.content # todo: check how file path is count in lib self.href2img_bytes[file_name] = content - # read html + for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER): + file_name = x.file_name + content = x.content + self.href2img_bytes[file_name] = content + + self.logger.log('HTML files reading.') self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() - # read css + + self.logger.log('CSS processing.') self.html_href2css_href = {} self.css_href2content = {} self.build_css_content() # add css - self.add_css_styles2soup() - # read footnotes + # self.logger.log('CSS styles adding processing.') + # self.add_css_styles2soup() + + self.logger.log('Footnotes processing.') self.footnotes = [] for href in self.href2soup_html: self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html)) - # read toc + self.logger.log(f'Added {len(self.footnotes)} footnotes.') + self.logger.log('TOC processing.') self.href2ids = defaultdict(list) + self.added_to_toc_hrefs = [] self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed if not self.is_toc_valid(): self.build_adjacency_list_from_spine() + not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] + self.logger.log(f'html documents not added to TOC: {not_added}') # read anchored blocks, split html into separate block self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} @@ -115,6 +131,7 @@ class EpubPostprocessor: self.id_anchor_exist_in_nav_points = True self.href2ids[node.href].append(node.id) self.adjacency_list[node] = None + self.added_to_toc_hrefs.append(node.href) return node elif isinstance(element, tuple): @@ -130,6 +147,7 @@ class EpubPostprocessor: sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) self.adjacency_list[node] = sub_nodes + self.added_to_toc_hrefs.append(node.href) return node elif isinstance(element, list) and (lvl == 0): @@ -155,6 +173,7 @@ class EpubPostprocessor: for id_, _ in self.ebooklib_book.spine: node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) self.adjacency_list[-1].append(node) + self.added_to_toc_hrefs.append(node.href) def mark_and_line_href2soup_html(self): # mark @@ -202,8 +221,6 @@ class EpubPostprocessor: for sub_node in self.adjacency_list[node]: self.build_one_anchored_section(sub_node) - # print(f'Chapter: {node.href, node.id} is split.') - def build_anchor2soup(self): nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: @@ -217,7 +234,11 @@ class EpubPostprocessor: else: content: BeautifulSoup = self.href2soup_html[node.href] - update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access) + self.old_image_path2_aws_path = update_src_links_in_images(content, + self.href2img_bytes, + path_to_html=node.href, + access=self.access, + path2aws_path=self.old_image_path2_aws_path) is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS title_preprocessed, content_preprocessed = prepare_title_and_content(title, content, @@ -230,7 +251,8 @@ class EpubPostprocessor: sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1) sub_nodes.append(sub_chapter_item) - # print(f'Chapter: {title} is prepared.') + if self.logger: + self.logger.log(f'Chapter: {title} is prepared.') return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self): @@ -250,9 +272,15 @@ class EpubPostprocessor: if __name__ == "__main__": - json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/Chaos_Engineering.epub') + logger = logging.getLogger('epub') + file_handler = logging.StreamHandler() + logger.addHandler(file_handler) + + logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) + + json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781284171242.epub', + logger=logger_object) tmp = json_converter.convert_to_dict() with codecs.open('tmp.json', 'w', encoding='utf-8') as f: json.dump(tmp, f, ensure_ascii=False) - diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 7514693..4486f51 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -27,7 +27,11 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id return link -def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None): +def update_src_links_in_images(body_tag: Tag, + href2img_content: dict, + path_to_html, + access=None, + path2aws_path=None): img_tags = body_tag.find_all('img') for img in img_tags: @@ -40,12 +44,18 @@ def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_ht img_content = href2img_content[path_to_img_from_root] if access is not None: - new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id') + if path_to_img_from_root in path2aws_path: + new_folder = path2aws_path[path_to_img_from_root] + else: + new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id') + path2aws_path[path_to_img_from_root] = new_folder else: new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id') img.attrs['src'] = str(new_folder) + return path2aws_path + def preprocess_figure(): pass @@ -196,7 +206,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note if not file: target_html_tag = source_html_tag else: - target_html_tag = href2soup_html[file] + target_html_tag = href2soup_html.get(file) + if not target_html_tag: + print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.') + continue possible_footnote = 'note|footnote|endnote|rearenote' expected_footnote_tags = list(target_html_tag.find_all(id=element_id, @@ -250,6 +263,9 @@ def unwrap_structural_tags(body_tag): for s in body_tag.find_all("html"): s.unwrap() + for s in body_tag.find_all("header"): + s.name = 'span' + # not all cases, if span has

s and NavigableString, it won't unwrap for s in body_tag.find_all("span"): if s.contents: