import re
import json
import codecs
import logging
import os
from os.path import dirname, normpath, join
from itertools import chain
from collections import defaultdict
from typing import Dict, Union, List

import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag

from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
    update_src_links_in_images, preprocess_footnotes


class EpubConverter:
    """Convert an EPUB file into the LiveCarta chapter-tree representation.

    The constructor eagerly performs the whole conversion pipeline
    (images -> HTML -> CSS -> footnotes -> TOC -> internal links -> chapters);
    afterwards ``convert_to_dict()`` serializes the result.
    """

    def __init__(self, file, access=None, logger=None):
        """Read the EPUB and run every preprocessing stage.

        :param file: path to the .epub file (passed to ``epub.read_epub``).
            NOTE(review): ``node_to_livecarta_chapter_item`` reads ``self.file.stem``,
            which only exists on ``pathlib.Path`` — a plain string (as used in the
            ``__main__`` block below) would fail there; confirm expected type.
        :param access: opaque access object forwarded to ``update_src_links_in_images``.
        :param logger: BookLogger used for progress logging.
            NOTE(review): ``self.logger.log`` is called unconditionally here, so
            ``logger=None`` (the default) would raise AttributeError — verify callers.
        """
        self.file = file
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file)
        # main container for all epub .xhtml files
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
        # enumerate all subchapter ids for each file
        self.html_href2subchapter_ids = defaultdict(list)
        # enumerate all file paths that were added to TOC
        self.hrefs_added_to_toc = set()
        # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
        # key = -1 (int sentinel) for top level NavPoints; value = None for leaves
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}
        # container for all chapters soup objects
        # here soup object is only part of the .xhtml file
        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
        # ids of anchor <span>s already inserted, to avoid duplicates
        self.internal_anchors = set()
        # flag to be updated while ebooklib.toc is parsed
        self.id_anchor_exist_in_nav_points = False
        self.img_href2img_bytes = {}  # file path to bytes
        self.old_image_path2aws_path = {}  # file path from to generated aws path
        self.footnotes_contents: List[str] = []  # to be sent on server as is
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote

        # --- stage 1: collect raw bytes of every image (including the cover) ---
        self.logger.log('Image processing.')
        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
            content = x.content
            self.img_href2img_bytes[file_name] = content

        # --- stage 2: parse every XHTML document body into a soup ---
        self.logger.log('HTML files reading.')
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()

        # --- stage 3: discover and inline CSS ---
        self.logger.log('CSS files processing.')
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()

        self.logger.log('CSS styles adding.')
        self.add_css_styles_to_html_soup()

        # --- stage 4: extract footnotes and renumber them globally ---
        self.logger.log('Footnotes processing.')
        for href in self.html_href2html_body_soup:
            content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
                                                                     self.html_href2html_body_soup)
            self.footnotes_contents.extend(content)
            self.noterefs.extend(noterefs)
            self.footnotes.extend(footnotes_tags)
        # assign sequential 1-based ids across ALL documents; noterefs and
        # footnotes are assumed to be parallel lists (zip pairs them)
        for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
            noteref.attrs['data-id'] = i + 1
            noteref.attrs['id'] = f'footnote-{i + 1}'
            footnote.attrs['href'] = f'#footnote-{i + 1}'
        self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')

        # --- stage 5: build the chapter tree from the TOC (or spine fallback) ---
        self.logger.log('TOC processing.')
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
        if self.is_toc_empty():
            self.build_adjacency_list_from_spine()
        not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
        self.logger.log(f'Html documents not added to TOC: {not_added}.')
        self.add_not_added_files_to_adjacency_list(not_added)

        # --- stage 6: flatten document structure and rewrite internal links ---
        self.logger.log(f'Html internal links and structure processing.')
        self.label_chapters_ids_with_tmp_id()  # used only after parsed toc, ids from toc needed
        self.process_html_soup_structure_to_line()
        self.process_internal_links()

        # --- stage 7: slice documents into per-chapter soups ---
        self.logger.log(f'Building chapters content.')
        self.define_chapters_content()

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse the <body> of every XHTML document into a BeautifulSoup.

        :return: mapping of document href -> soup of its body content.
        """
        # using EpubElements
        # for now just for HTML objects, as it is simplest chapter
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_body_text = item.get_body_content()
            # html.parser closes tags if needed
            soup = BeautifulSoup(html_body_text, features='html.parser')
            nodes[item.file_name] = soup
        return nodes

    def get_css_content(self, css_href, html_href):
        """Resolve a CSS href relative to its HTML file and return its text.

        :param css_href: href as written in the HTML's <link> tag (may be relative).
        :param html_href: href of the HTML document containing the link.
        :return: decoded CSS file content.
        :raises AssertionError: if the resolved CSS path is not in the manifest.
        """
        path_to_css_from_html = css_href
        html_folder = dirname(html_href)
        # normalize to a root-relative, forward-slash path as used by the manifest
        path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
        css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
        assert css_obj, f'Css style {css_href} was not in manifest.'
        css_content: str = css_obj.get_content().decode()
        return css_content

    def build_html_and_css_relations(self):
        '''
        This function is designed to get 2 dictionaries:
        The first is css_href2css_content. It is created to connect href of css to content of css
        The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them)
        which are used on this html
        ...2... = key2value
        '''
        # dictionary: href of html to related css files
        html_href2css_href: defaultdict = defaultdict(list)
        css_href2css_content: dict = {}
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_content = item.content
            html_href = item.file_name
            # full document parse here (lxml), unlike body-only parsing elsewhere,
            # because <link>/<style> live in <head>
            soup_html_content = BeautifulSoup(html_content, features='lxml')
            # check if file links to css file
            for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
                # skip alternate stylesheets — they are not applied by default
                if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
                    continue
                css_href = tag.attrs.get('href')
                html_href2css_href[html_href].append(css_href)
                if css_href not in css_href2css_content:
                    # css_href not in css_href2css_content, add to this dict
                    css_href2css_content[css_href] = build_css_content(
                        self.get_css_content(css_href, html_href))
            # inline <style> blocks get synthetic hrefs 'href0', 'href1', ...
            # NOTE(review): these synthetic keys are per-document indices, so two
            # documents' inline styles can collide in css_href2css_content — the
            # later document overwrites; confirm this is acceptable.
            for i, tag in enumerate(soup_html_content.find_all('style')):
                css_content = tag.string
                html_href2css_href[html_href].append(f'href{i}')
                css_href2css_content[f'href{i}'] = build_css_content(css_content)
        return html_href2css_href, css_href2css_content,

    def add_css_styles_to_html_soup(self):
        '''
        This function is designed to update html_href2html_body_soup
        And add to html_inline_style css_style_content
        '''
        for html_href in self.html_href2html_body_soup:
            if self.html_href2css_href.get(html_href):
                # concatenate all stylesheets attached to this document
                css = ''
                for css_href in self.html_href2css_href[html_href]:
                    css += self.css_href2css_content[css_href]
                content: BeautifulSoup = self.html_href2html_body_soup[html_href]
                content = convert_html_soup_with_css_style(content, css)
                self.html_href2html_body_soup[html_href] = content

    def build_manifest_id2html_href(self):
        """Map each document's manifest id to its file href (used for spine lookup)."""
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            links[item.id] = item.file_name
        return links

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """
        self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
        key = -1 if root(top chapters), value = None if leaf(least chapters)
        :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
        :param lvl: level of depth
        :return: the NavPoint created for *element* (None for the top-level list case).
        :raises AssertionError: on an unrecognized TOC element type.
        """
        if isinstance(element, Link):
            # leaf chapter
            nav_point = NavPoint(element)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
                self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
            self.adjacency_list[nav_point] = None
            self.hrefs_added_to_toc.add(nav_point.href)
            return nav_point
        elif isinstance(element, tuple):
            # (Section, children) pair — a chapter with subchapters
            first, second = element
            assert isinstance(first, Section)
            nav_point = NavPoint(first)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
                self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
            sub_nodes = []
            for i in second:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[nav_point] = sub_nodes
            self.hrefs_added_to_toc.add(nav_point.href)
            return nav_point
        elif isinstance(element, list) and (lvl == 0):
            # the top-level TOC list; its children are stored under the -1 sentinel
            sub_nodes = []
            for i in element:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[-1] = sub_nodes
        else:
            # NOTE(review): assert used for control flow — stripped under `python -O`
            assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'

    def is_toc_empty(self):
        """Return True if the book has no usable TOC (no toc or no top-level chapters)."""
        # there is no toc in ebook or no top chapters
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return True
        return False

    def build_adjacency_list_from_spine(self):
        """Build a flat one-level TOC from the spine reading order (fallback path).

        Replaces any partially built adjacency list; chapter titles default to
        the document href.
        """
        manifest_id2html_href = self.build_manifest_id2html_href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            nav_point = NavPoint(Section(manifest_id2html_href[id_], manifest_id2html_href[id_]))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(nav_point.href)

    def add_not_added_files_to_adjacency_list(self, not_added):
        """Append documents missing from the TOC as top-level 'To check' chapters.

        :param not_added: hrefs of documents that were not reachable from the TOC.
        """
        for i, file in enumerate(not_added):
            nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(file)

    def label_chapters_ids_with_tmp_id(self):
        """Insert a <tmp class="converter-chapter-mark"> marker before every
        element whose id is referenced as a subchapter anchor in the TOC.

        These markers are later used to slice a document into chapters
        (see ``build_one_chapter`` / ``get_tags_between_chapter_marks``).
        """
        for html_href in self.html_href2html_body_soup:
            ids = self.html_href2subchapter_ids[html_href]
            for i in ids:
                soup = self.html_href2html_body_soup[html_href]
                # NOTE(review): soup.find(id=i) returning None (id missing from the
                # document) would make insert_before raise — TOC ids are assumed valid
                tag = soup.find(id=i)
                new_h = soup.new_tag('tmp')
                new_h.attrs['class'] = 'converter-chapter-mark'
                new_h.attrs['id'] = i
                tag.insert_before(new_h)

    def process_html_soup_structure_to_line(self):
        """Flatten each document by unwrapping structural wrapper tags in place."""
        # go to line structure
        for html_href in self.html_href2html_body_soup:
            soup = self.html_href2html_body_soup[html_href]
            self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)

    @staticmethod
    def create_unique_id(href, id_):
        """Build a globally-unique element id from a document href and a local id.

        The href part drops punctuation/underscores/hyphens; the id part maps
        '_' and '-' to '0' so the result is alphanumeric-ish and collision-safe
        across documents.
        """
        return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)

    @staticmethod
    def create_new_anchor_span(soup, id_):
        """Create a <span id=... class="link-anchor"> containing a no-break space.

        The &nbsp; keeps the span from being dropped as an empty tag.
        """
        new_anchor_span = soup.new_tag("span")
        new_anchor_span.attrs['id'] = id_
        new_anchor_span.attrs['class'] = 'link-anchor'
        new_anchor_span.string = "\xa0"
        return new_anchor_span

    def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
        """
        TOC: a/b/c.xhtml
        b/c.xhtml -> a/b/c.xhtml
        c.xhtml -> a/b/c.xhtml
        Used to find full path to file that is parsed from tag link
        :param cur_file_path: path to current file with tag link
        :param href_in_link: filename got from tag link, like file1.xhtml
        :param internal_link_tag: tag object that is parsed now
        :return: the matched full path, or None (the tag is then marked 'bad-link')
        """
        dir_name = os.path.dirname(cur_file_path)
        normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
        # substring match: a TOC path is accepted if it contains the normalized path
        full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
        if not full_path:
            self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
                            f'While processing href in {internal_link_tag}.')
            internal_link_tag.attrs['converter-mark'] = 'bad-link'
            return None
        if len(full_path) > 1:
            self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
                            f' while {internal_link_tag} processing. The first one will be chosen.')
        return full_path[0]

    def process_internal_links(self):
        """Rewrite intra-book <a href> links into server-side anchor placeholders.

        Three passes:
        1. make every element id unique across all documents;
        2a. links whose target is a whole xhtml file — insert an anchor span at
            the start of the target document;
        2b. links whose target is "file#id" or "#id" — insert an anchor span
            before the target element.
        In both 2a/2b the original href is removed and replaced by a
        '{{tempStyleToAnchor-...}}' placeholder attribute.
        """
        # 1. rebuild ids to be unique in all documents
        for toc_href in self.hrefs_added_to_toc:
            for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
                # NOTE(review): BeautifulSoup usually exposes 'class' as a list of
                # strings, so these == comparisons against a plain string may never
                # match — confirm these skips actually trigger.
                if tag.attrs.get('class') == 'converter-chapter-mark':
                    continue
                if tag.attrs.get('class') == 'footnote-element':
                    continue
                new_id = self.create_unique_id(toc_href, tag.attrs['id'])
                tag.attrs['id'] = new_id
        # 2.a) process anchor which is a whole xhtml file
        internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)')
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
                a_tag_href = internal_link_tag.attrs['href']
                # find full path
                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
                if not a_tag_href_matched_to_toc:
                    continue
                new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
                internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                if new_id not in self.internal_anchors:
                    anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                    new_anchor_span = self.create_new_anchor_span(soup, new_id)
                    # insert a new span to the begin of the file
                    anchor_soup.insert(0, new_anchor_span)
                    self.internal_anchors.add(new_id)
                del internal_link_tag.attrs['href']
        # 2.b) process anchor which is an element in xhtml file
        internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
                a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
                # find full path; '#id' with no file part targets the current document
                if a_tag_href:
                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
                                                                                 internal_link_tag)
                else:
                    a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
                if not a_tag_href_matched_to_toc:
                    continue
                new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
                anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                # look up by the rebuilt unique id first, then fall back to the raw id
                anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
                anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id})  # if link is a footnote
                if anchor_tags:
                    if len(anchor_tags) > 1:
                        self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
                                        f'{anchor_tags}\n'
                                        f' While processing {internal_link_tag}')
                    anchor_tag = anchor_tags[0]
                    assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
                    # if anchor is found we could add placeholder for link creation on server side.
                    internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                    # create span to have cyclic links, link has 1 type of class, anchor another
                    if anchor_tag.attrs['id'] not in self.internal_anchors:
                        new_anchor_span = self.create_new_anchor_span(soup, new_id)
                        anchor_tag.insert_before(new_anchor_span)
                        self.internal_anchors.add(new_id)
                        del anchor_tag.attrs['id']
                    del internal_link_tag.attrs['href']
                else:
                    internal_link_tag.attrs['converter-mark'] = 'bad-link'
                    self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
                                    f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
                                    f' Old id={a_tag_id}')

    def build_one_chapter(self, nav_point):
        """
        Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
        3 cases:
        id wraps all chapter content,
        id wraps chapter's content + subchapters' content,
        id points to the start of title of a chapter.
        In all cases we know where chapter starts.
        Therefore chapter is all tags between chapter's id and id of the next chapter/subchapter
        """
        if nav_point.id:
            soup = self.html_href2html_body_soup[nav_point.href]
            chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id,
                                                          href=nav_point.href,
                                                          html_soup=soup)
            # re-root the extracted tags into a fresh standalone soup
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
        # recurse into subchapters (adjacency value is None for leaves)
        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
                self.build_one_chapter(sub_node)

    def define_chapters_content(self):
        """Slice documents into per-chapter soups, but only when the TOC used id anchors."""
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in top_level_nav_points:
                self.build_one_chapter(point)

    def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
        """Recursively convert a NavPoint subtree into a ChapterItem subtree.

        Rewrites image src attributes to AWS paths along the way (accumulating
        the mapping in ``self.old_image_path2aws_path``).

        :param nav_point: node whose chapter (and descendants) to convert.
        :param lvl: 1-based depth; levels above LiveCartaConfig.SUPPORTED_LEVELS
            keep their title inside the content instead of as a chapter title.
        """
        title = nav_point.title
        if nav_point.id:
            # chapter is a slice of a document (built in define_chapters_content)
            content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
        else:
            # chapter is a whole document
            content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
        # NOTE(review): self.file.stem requires a pathlib.Path; see __init__ note
        self.old_image_path2aws_path = update_src_links_in_images(content, self.img_href2img_bytes,
                                                                  path_to_html=nav_point.href,
                                                                  access=self.access,
                                                                  path2aws_path=self.old_image_path2aws_path,
                                                                  book_id=self.file.stem or 'book_id')
        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
        title_preprocessed = prepare_title(title)
        content_preprocessed = prepare_content(title_preprocessed, content,
                                               remove_title_from_chapter=is_chapter)
        sub_nodes = []
        # warning! not EpubHtmlItems won't be added to chapter
        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
                sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
                sub_nodes.append(sub_chapter_item)
        if self.logger:
            indent = ' ' * lvl
            self.logger.log(f'{indent}Chapter: {title} is prepared.')
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self):
        """Serialize the whole book into a JSON-ready dict.

        :return: ``{"content": [chapter dicts...], "footnotes": [...]}``
        """
        top_level_nav_points = self.adjacency_list[-1]
        top_level_chapters = []
        for nav_point in top_level_nav_points:
            chapter = self.node_to_livecarta_chapter_item(nav_point)
            top_level_chapters.append(chapter)
        top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
        self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
        self.logger.log('End conversion.')
        return {
            "content": top_level_dict_chapters,
            "footnotes": self.footnotes_contents
        }


if __name__ == "__main__":
    # manual smoke run: convert one epub and dump the result as JSON
    logger = logging.getLogger('epub')
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)
    file_handler = logging.FileHandler('../../logs/epub.log', mode='w+')
    logger.addHandler(file_handler)
    logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
    # NOTE(review): a str path is passed here although node_to_livecarta_chapter_item
    # reads self.file.stem — this script path would fail on conversion; see __init__
    json_converter = EpubConverter('../../epub/9781641051217.epub', logger=logger_object)
    tmp = json_converter.convert_to_dict()
    with codecs.open('../../json/tmp.json', 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)