import codecs
import json
import logging
import posixpath
import sys
from collections import defaultdict
from typing import Dict, Union

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from ebooklib.epub import Link, Section

from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
    update_src_links_in_images, preprocess_footnotes
from src.util.css_reader import clean_css, add_inline_style_to_html_soup

# epub3 examples:
# https://github.com/IDPF/epub3-samples
# specification:
# https://idpf.github.io/epub-vocabs/structure/
# footnotes:
# http://www.theheratik.net/books/tech-epub/chapter-8/
# http://kb.daisy.org/publishing/docs/html/epub-type.html
# todo: http://kb.daisy.org/publishing/docs/html/notes.html
# todo: https://docs.python.org/3/howto/unicode.html

# How ebooklib looks for the toc:
# if the spine in content.opf has a toc attribute -> the ncx file can be found -> navMap is read from it
# if it is not there, ebooklib tries to find a nav tag in the manifest -> EpubNav.

logger = logging.getLogger(__name__)


class EpubPostprocessor:
    """Read an EPUB file and convert it into a nested chapter structure.

    The constructor does all the parsing work (images, HTML documents, CSS,
    footnotes, table of contents); ``convert_to_dict`` then produces the
    final dictionary.

    Main attributes:
        href2img_bytes: image file name -> raw image bytes.
        href2soup_html: document file name -> parsed document body.
        html_href2css_href: document file name -> key of its stylesheet.
        css_href2content: stylesheet key -> stylesheet text after ``clean_css``.
        footnotes: footnotes collected from all documents.
        href2ids: document file name -> anchor ids referenced by the toc.
        adjacency_list: nav point -> list of child nav points (``None`` for
            a leaf); the key ``-1`` holds the top-level nav points.
        id_anchor2soup: ``(href, id)`` -> soup holding that section's tags.
    """

    def __init__(self, file, access=None):
        """Parse the book.

        :param file: path to the EPUB file, passed to ``epub.read_epub``.
        :param access: opaque object forwarded to ``update_src_links_in_images``.
        """
        self.file = file
        self.access = access
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib

        # read images
        # todo: check how file path is count in lib
        self.href2img_bytes = {
            image.file_name: image.content
            for image in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE)
        }

        # read html
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()

        # read css
        self.html_href2css_href = {}
        self.css_href2content = {}
        self.build_css_content()
        # add css
        self.add_css_styles2soup()

        # read footnotes
        self.footnotes = []
        for soup in self.href2soup_html.values():
            self.footnotes.extend(preprocess_footnotes(soup, self.href2soup_html))

        # read toc
        self.href2ids = defaultdict(list)
        # k = -1 if root, v = None if leaf
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()

        # read anchored blocks, split html into separate block
        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        self.build_anchor2soup()

        # todo: if not all hrefs are in adjacency_list, add the missed items from the spine
        #  to the contents of the chapter after which they are placed in the spine

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse the body of every document item.

        :return: mapping of document file name -> soup of its body content.
        """
        # using EpubElements
        # for now just for HTML objects, as it is simplest chapter
        # todo: check if other chapters exist
        return {
            item.file_name: BeautifulSoup(item.get_body_content(), features='lxml')
            for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        }

    def _load_linked_css(self, html_href, css_href):
        """Load a linked stylesheet into ``css_href2content``.

        The href is tried as written first and then resolved against the
        directory of the referencing document, because ``get_item_with_href``
        only matches an item's exact name.

        :return: the key under which the stylesheet is cached, or ``None``
            when no matching item exists in the book.
        """
        candidates = [css_href]
        resolved = posixpath.normpath(posixpath.join(posixpath.dirname(html_href), css_href))
        if resolved != css_href:
            candidates.append(resolved)

        for candidate in candidates:
            if candidate in self.css_href2content:
                return candidate
            css_item = self.ebooklib_book.get_item_with_href(candidate)
            if css_item is not None:
                logger.debug('Loading stylesheet %s', candidate)
                css_content: str = css_item.get_content().decode()
                self.css_href2content[candidate] = clean_css(css_content)
                return candidate

        logger.warning('Stylesheet %r referenced from %r was not found in the book.', css_href, html_href)
        return None

    def build_css_content(self):
        """Collect the stylesheet of every document.

        Fills ``html_href2css_href`` and ``css_href2content`` from both
        ``<link type="text/css">`` tags and inline ``<style>`` tags.

        NOTE: only one stylesheet key is kept per document, so when a document
        has several stylesheets the last one processed wins (inline styles are
        processed after linked ones).
        """
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            soup = BeautifulSoup(item.content, features='lxml')

            for tag in soup.find_all('link', attrs={"type": "text/css"}):
                css_href = tag.attrs.get('href')
                if not css_href:
                    continue
                css_key = self._load_linked_css(item.file_name, css_href)
                # Record the mapping only when the stylesheet really was loaded,
                # so add_css_styles2soup never looks up a missing key.
                if css_key is not None:
                    self.html_href2css_href[item.file_name] = css_key

            for i, tag in enumerate(soup.find_all('style')):
                css_content = tag.string
                if css_content is None:
                    # Empty <style> tag (or one with several children): nothing to apply.
                    continue
                # The key includes the document name: a bare per-document index
                # would collide between documents and overwrite their styles.
                css_key = f'{item.file_name}#style{i}'
                self.html_href2css_href[item.file_name] = css_key
                self.css_href2content[css_key] = clean_css(css_content)

    def add_css_styles2soup(self):
        """Apply each document's stylesheet to its soup via ``add_inline_style_to_html_soup``."""
        for href in self.href2soup_html:
            css_key = self.html_href2css_href.get(href)
            if not css_key:
                continue
            css: str = self.css_href2content[css_key]
            self.href2soup_html[href] = add_inline_style_to_html_soup(self.href2soup_html[href], css)

    def build_manifest_id2href(self):
        """Return a mapping of manifest item id -> file name for all document items."""
        return {
            item.id: item.file_name
            for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        }

    def _register_nav_point(self, node):
        """Remember the anchor id of a nav point, if it has one."""
        if node.id:
            self.id_anchor_exist_in_nav_points = True
            self.href2ids[node.href].append(node.id)

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """Recursively convert an ebooklib toc into ``adjacency_list``.

        :param element: a ``Link`` (leaf), a ``(Section, children)`` tuple, or,
            at level 0 only, the toc list itself.
        :param lvl: current nesting depth; 0 for the toc root.
        :return: the ``NavPoint`` created for ``element``, or ``None`` for the root list.
        :raises AssertionError: if ``element`` has an unsupported type.
        """
        # use book.toc as a root
        if isinstance(element, Link):
            # todo: check if link exists
            node = NavPoint(element)
            self._register_nav_point(node)
            self.adjacency_list[node] = None
            return node

        if isinstance(element, tuple):
            first, second = element
            # Raised explicitly (not via `assert`) so the check survives `python -O`.
            if not isinstance(first, Section):
                raise AssertionError
            node = NavPoint(first)
            self._register_nav_point(node)
            self.adjacency_list[node] = [
                self.build_adjacency_list_from_toc(child, lvl + 1) for child in second
            ]
            return node

        if isinstance(element, list) and (lvl == 0):
            self.adjacency_list[-1] = [
                self.build_adjacency_list_from_toc(child, lvl + 1) for child in element
            ]
            return None

        raise AssertionError(f'Error. Element is not tuple/Link instance: {type(element)}')

    def is_toc_valid(self):
        """Return True when the toc produced at least one top-level nav point.

        An empty toc is treated as invalid so the caller falls back to the
        spine; otherwise a book without a toc would produce no chapters.
        """
        if self.ebooklib_book.toc is None:
            return False
        return bool(self.adjacency_list.get(-1))

    def build_adjacency_list_from_spine(self):
        """Build a flat ``adjacency_list`` with one nav point per spine document.

        Spine entries whose idref is not a document item are skipped.
        """
        manifest_id2href = self.build_manifest_id2href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            href = manifest_id2href.get(id_)
            if href is None:
                logger.warning('Spine item %r is not a document item, skipped.', id_)
                continue
            # The file name is used both as the title and as the href.
            node = NavPoint(Section(href, href))
            self.adjacency_list[-1].append(node)

    def mark_and_line_href2soup_html(self):
        """Mark toc anchors in the documents, then flatten the documents.

        For every anchor id referenced by the toc, an ``<h1 class="internal-mark">``
        carrying the same id is inserted right before the anchored tag. After
        that each soup is replaced by the result of ``unwrap_structural_tags``.
        """
        # mark
        for href, soup in self.href2soup_html.items():
            # .get() keeps the defaultdict free of empty entries for unreferenced documents
            for anchor_id in self.href2ids.get(href, ()):
                mark = soup.new_tag('h1')
                mark.attrs['class'] = 'internal-mark'
                mark.attrs['id'] = anchor_id

                tag = soup.find(id=anchor_id)
                if tag is not None:
                    tag.insert_before(mark)
                    continue

                # Broken toc anchor: treat it as pointing to the top of the document.
                logger.warning('Anchor %r from toc was not found in %r, using document start.', anchor_id, href)
                container = soup.body if soup.body is not None else soup
                container.insert(0, mark)

        # go to line structure
        for href in self.href2soup_html:
            self.href2soup_html[href] = unwrap_structural_tags(self.href2soup_html[href])

    def build_one_anchored_section(self, node):
        """Cut the section of ``node`` (and of its sub-nodes) out of its document.

        By this point the html soup is expected to already be in linear (flat)
        form; if it is not, that is not this method's fault.

        There are 3 cases: the id wraps the whole content, the id wraps the
        content of a chapter and its sub-chapter, or the id only points at the
        heading. In all 3 cases the start of the heading is known, so a chapter
        is every tag from the current heading up to whichever heading comes next.

        A heading is taken into account only if the toc references an id; a
        heading is then any tag with an id from the toc.

        :param node: nav point to process; nodes without an id are not cut,
            but their sub-nodes are still visited.
        """
        if node.id:
            soup = self.href2soup_html[node.href]
            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.id_anchor2soup[(node.href, node.id)] = new_tree

        for sub_node in self.adjacency_list.get(node) or ():
            self.build_one_anchored_section(sub_node)

    def build_anchor2soup(self):
        """Fill ``id_anchor2soup`` for the whole toc, if the toc uses any anchor ids."""
        if not self.id_anchor_exist_in_nav_points:
            return
        for point in self.adjacency_list[-1]:
            self.build_one_anchored_section(point)

    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
        """Convert a nav point and all of its sub-nodes into a ``ChapterItem``.

        The content is the anchored section when the nav point has an id,
        otherwise the whole document the nav point refers to.
        """
        title = node.title
        if node.id:
            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
        else:
            content: BeautifulSoup = self.href2soup_html[node.href]

        update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)

        # warning! not EpubHtmlItems won't be added to chapter
        sub_nodes = [
            self.node2livecarta_chapter_item(sub_node)
            for sub_node in self.adjacency_list.get(node) or ()
        ]
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self):
        """Return the book as ``{"content": [chapter dicts], "footnotes": [...]}``."""
        top_level_chapters = [
            self.node2livecarta_chapter_item(nav_point)
            for nav_point in self.adjacency_list[-1]
        ]
        return {
            "content": [chapter.to_dict() for chapter in top_level_chapters],
            "footnotes": self.footnotes
        }


if __name__ == "__main__":
    # An optional first command-line argument overrides the default book path.
    default_epub_path = '/home/katerina/PycharmProjects/Jenia/converter/epub/Chaos_Engineering.epub'
    epub_path = sys.argv[1] if len(sys.argv) > 1 else default_epub_path

    json_converter = EpubPostprocessor(epub_path)
    tmp = json_converter.convert_to_dict()

    with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)