diff --git a/src/epub_converter.py b/src/epub_converter.py index 28ae341..bb8ee2b 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -1,6 +1,7 @@ import codecs import json import re +import os from collections import defaultdict from typing import Dict, Union @@ -8,17 +9,31 @@ import ebooklib from bs4 import BeautifulSoup from ebooklib import epub from ebooklib.epub import Link, Section +from ebooklib.utils import debug from src.data_objects import ChapterItem, NavPoint -from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids +from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ + preprocess_image + + +# todo: https://docs.python.org/3/howto/unicode.html class EpubBookAdapter: def __init__(self, file): self.file = file self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib + self.href2img_bytes = {} + + for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): + debug(x) + file_name = x.file_name + content = x.content + # todo: check how file path is count in lib + self.href2img_bytes[file_name] = content + self.id_anchor_exist_in_nav_points = False - self.href2soup_html = self.build_href2soup_content() + self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap # если его там нет, пробуют искать nav tag в manifest -> EpubNav. 
это у epub3 (не тестировалось todo) self.href2ids = defaultdict(list) @@ -164,11 +179,13 @@ class EpubBookAdapter: def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem: title = node.title if node.id: - content = self.id_anchor2soup[(node.href, node.id)] + content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)] else: - content = self.href2soup_html[node.href] - content_preprocessed = str(content) # todo self.preprocess_html(content, node.id) - content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed) + content: BeautifulSoup = self.href2soup_html[node.href] + + preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None) + title_preprocessed, content_preprocessed = prepare_title_and_content(title, content) + sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter if self.adjacency_list.get(node): @@ -196,9 +213,6 @@ if __name__ == "__main__": "content": l } - output_file = open('output.out', 'w') - output_file.write(str(tmp)) - with codecs.open('tmp.json', 'w', encoding='utf-8') as f: json.dump(tmp, f, ensure_ascii=False) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 8e1eb7c..0f24c9c 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -1,10 +1,47 @@ +import os +import pathlib import re from bs4 import BeautifulSoup, NavigableString +from src.access import Access -def preprocess_image(): - pass + +def save_image_locally(img_file_path, img_content, book_id): + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/')) + new_path.mkdir(exist_ok=True) + + new_img_path = new_path / os.path.basename(img_file_path) + f = open(new_img_path, 'wb+') + f.write(img_content) + f.close() + + return new_img_path + + +def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): + link = access.send_image_by_bytes(img_file_path, 
img_content, book_id) + return link + + +def preprocess_image(body_tag, href2img_content, path_to_html, access=None): + img_tags = body_tag.find_all('img') + + for img in img_tags: + path_to_img_from_html = img.attrs.get('src') + html_folder = os.path.dirname(path_to_html) + path_to_img_from_root = os.path.normpath(os.path.join(html_folder ,path_to_img_from_html)) + + assert path_to_img_from_root in href2img_content, f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' + + img_content = href2img_content[path_to_img_from_root] + if access is not None: + new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id') + else: + new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id') + + img.attrs['src'] = str(new_folder) def preprocess_table(): @@ -15,8 +52,12 @@ def preprocess_quote(): pass -def clean_heading_in_content(): - pass +def clean_heading_in_content(content, title: str): + for child in content.contents: + if child.text and re.sub(r'([\n\t\xa0])', '', child.text): + if title == child.text: + child.extract() + break def preprocess_footnotes(): @@ -28,8 +69,18 @@ def add_fonts(): def unwrap_structural_tags(body_tag): + structural_tags_names = [ + 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', + 'figure', 'footer', 'iframe', 'span' + ] + divs = body_tag.find_all("div") for div in divs: + if div.contents: + is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] + if all(is_not_struct_tag): + div.name = 'p' + continue div.unwrap() secs = body_tag.find_all("section") @@ -48,19 +99,18 @@ def unwrap_structural_tags(body_tag): for s in articles: s.unwrap() - # articles = body_tag.find_all("html") - # for s in articles: - # s.unwrap() + articles = body_tag.find_all("html") + for s in articles: + s.unwrap() spans = body_tag.find_all("span") # not all cases, if span has
s and NavigableString, it won't unwrap for s in spans: if not s.string and s.contents: - is_string = [isinstance(child, NavigableString) for child in s.contents] - if any(is_string): - pass - else: - s.unwrap() + is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents] + if all(is_not_struct_tag): + continue + s.unwrap() for node in body_tag: if isinstance(node, NavigableString): @@ -75,15 +125,6 @@ def unwrap_structural_tags(body_tag): return body_tag -def str2html_soup(html_text: str, element_id=None): - html_soup = BeautifulSoup(html_text, features='lxml') - if element_id: - x = html_soup.find(id=element_id) - return str(x) - else: - return str(html_text) - - def get_tags_between_ids(first_id, href, html_soup): h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'}) if h_marked: @@ -102,3 +143,22 @@ def get_tags_between_ids(first_id, href, html_soup): assert 0, f'Warning: no match for {first_id, href}' return tags + + +def prepare_title_and_content(title, content: BeautifulSoup): + title_str = BeautifulSoup(title, features='lxml').string + # 0. cleaning \n + to_remove = [] + for child in content.contents: + if isinstance(child, NavigableString): + s = re.sub(r'([\n\t\xa0])', '', child.string) + if s == '': + to_remove.append(child) + + [x.extract() for x in to_remove] + # 1. rule#1 for heading removal + clean_heading_in_content(content, title_str) + + content_str = re.sub(r'([\n\t\xa0])', ' ', str(content)) + title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) + return title_str, content_str