import re
import json
import codecs
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from os import path
from pathlib import Path
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor


class EpubConverter:
    """Convert an .epub file into the LiveCarta chapter/JSON structure.

    The constructor runs the whole preprocessing pipeline eagerly
    (images, HTML, CSS inlining, footnotes, TOC, internal links,
    chapter splitting); ``convert_to_dict`` then serializes the result.
    The pipeline steps mutate shared BeautifulSoup objects in
    ``self.html_href2html_body_soup``, so their order matters.
    """

    def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None):
        """Read the epub at ``file_path`` and run the full conversion pipeline.

        Parameters
        ----------
        file_path: path to the .epub file
        access: access descriptor forwarded to image upload
            (``update_images_src_links``)
        logger: BookLogger used for progress/diagnostic messages
            (required in practice — every step logs through it)
        css_processor: CSSPreprocessor instance
        html_processor: HtmlEpubPreprocessor instance
        """
        self.file_path = file_path
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file_path)
        self.css_processor = css_processor
        self.html_processor = html_processor
        # main container for all epub .xhtml files
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
        # enumerate all subchapter id for each file
        self.html_href2subchapters_ids = defaultdict(list)
        # all file paths that were added to TOC
        self.hrefs_added_to_toc = set()
        # toc tree structure stored as adj.list (NavPoint to list of NavPoints);
        # key -1 is the sentinel for the list of top-level NavPoints.
        # (annotation fixed: -1 is a value, not a type — attribute-target
        # annotations are evaluated at runtime, so it must be a real type)
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}
        # list to offset Chapter_i on 1st level
        self.offset_sub_nodes = []
        # container for all chapters soup objects;
        # here a soup object is only part of the .xhtml file
        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
        # unique ids for which a link-anchor <span> has already been created
        self.internal_anchors = set()
        # flag to be updated while ebooklib.toc is parsed
        self.id_anchor_exist_in_nav_points = False
        self.img_href2img_bytes = {}  # file path to bytes
        # file path mapped to generated aws path
        self.book_image_src_path2aws_path = {}
        self.footnotes_contents: List[str] = []  # to be sent on server as is
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote
        self.logger.log("Image processing.")
        # collect raw bytes of every image (including the cover) keyed by href
        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
            content = x.content
            self.img_href2img_bytes[file_name] = content
        self.logger.log("HTML files reading.")
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()
        self.logger.log("CSS inline style processing.")
        self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
        self.logger.log("CSS files processing.")
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
        self.logger.log("CSS styles fusion(inline+file).")
        self.add_css_styles_to_html_soup()
        self.logger.log("Footnotes processing.")
        # NOTE(review): the three result lists are REASSIGNED on every
        # iteration, not extended — presumably preprocess_footnotes
        # accumulates across files via the shared soup dict it receives;
        # TODO confirm, otherwise only the last file's footnotes survive.
        for href in self.html_href2html_body_soup:
            self.footnotes_contents, self.noterefs, self.footnotes =\
                preprocess_footnotes(
                    self.html_href2html_body_soup[href],
                    self.html_href2html_body_soup)
        self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
        self.logger.log("TOC processing.")
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
        if self.is_toc_empty():
            self.build_adjacency_list_from_spine()
        # any html document the TOC never referenced still must become a chapter
        not_added = [
            x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
        self.logger.log(f"Html documents not added to TOC: {not_added}.")
        self.logger.log(f"Add documents not added to TOC.")
        self.add_not_added_files_to_adjacency_list(not_added)
        self.logger.log(f"Label subchapters with converter tag.")
        self.label_subchapters_with_lc_tag()
        self.logger.log(f"Process html internal links.")
        self.process_internal_links()
        self.logger.log(
            f"Check if converter-chapter-marks are on the same level.")
        self.chapter_marks_are_same_level()
        self.logger.log(f"Define chapters content.")
        self.define_chapters_with_content()
        self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse every HTML document in the epub into a body-only soup.

        Returns
        -------
        Dict[str, BeautifulSoup]
            mapping: document href -> soup of its <body> content
        """
        # using EpubElements
        # for now just for HTML objects, as it is the simplest chapter
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_body_text = item.get_body_content()
            # html.parser closes tags if needed
            soup = BeautifulSoup(html_body_text, features="html.parser")
            nodes[item.file_name] = soup
        return nodes

    def build_html_and_css_relations(self) -> tuple[dict, dict]:
        """
        Function is designed to get 2 dictionaries:
        The first is html_href2css_href. It is created to connect href of html
        to css files (hrefs of them) which are used on this html.
        The second is css_href2css_content. It is created to connect href of
        css to content of css.
        ...2... = key2value

        Inline ``<style>`` tags get synthetic keys ``href0``, ``href1``, ...
        per document so they flow through the same fusion step as real files.

        Returns
        ----------
        html_href2css_href, css_href2css_content: tuple[dict, dict]
            dictionary: href of html to related css files,
            dictionary: css files to related css content
        """
        # dictionary: href of html to related css files
        html_href2css_href: defaultdict = defaultdict(list)
        css_href2css_content: dict = {}
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_content = item.content
            html_href = item.file_name
            # full document (not body-only) — needed to see <head> links
            soup_html_content = BeautifulSoup(html_content, features="lxml")
            # check if file links to css file
            for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}):
                # alternate page of original page (e.g. another language)
                if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]):
                    continue
                css_href = tag.attrs.get("href")
                html_href2css_href[html_href].append(css_href)
                if css_href not in css_href2css_content:
                    # css_href not in css_href2css_content, add to this dict
                    css_href2css_content[css_href] = self.css_processor.build_css_file_content(
                        self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
            # embedded <style> blocks: register under synthetic hrefs
            for i, tag in enumerate(soup_html_content.find_all("style")):
                css_content = tag.string
                html_href2css_href[html_href].append(f"href{i}")
                css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
                    css_content)
        return html_href2css_href, css_href2css_content

    @staticmethod
    def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
        """
        Function adds styles from .css to inline style.

        Parameters
        ----------
        html_soup: BeautifulSoup
            html page with inline style
        css_text: str
            css content from css file

        Returns
        -------
        inline_soup: BeautifulSoup
            soup with styles from css
        """
        # remove this specification because it causes problems
        css_text = css_text.replace(
            '@namespace epub "http://www.idpf.org/2007/ops";', '')
        # here we add css styles to inline style (premailer inlines the
        # stylesheet rules into each matching tag's style attribute)
        html_with_css_styles: str = transform(str(html_soup),
                                              css_text=css_text,
                                              remove_classes=False,
                                              external_styles=False,
                                              allow_network=False,
                                              disable_validation=True,
                                              )
        # soup with converted styles from css
        inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
        inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
                             attrs={"style": re.compile(".*")})
        tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
                                                      attrs={"style": re.compile(".*")})
        # go through the tags with inline style + style parsed from css file
        for tag_inline_style in tags_with_inline_style:
            style_converter = TagInlineStyleProcessor(tag_inline_style)
            style_converter.convert_initial_tag()
        return inline_soup

    def add_css_styles_to_html_soup(self):
        """
        This function is designed to update html_href2html_body_soup -
        add to html_inline_style css_style_content.

        Returns
        -------
        None
            updated soups with styles from css
        """
        for html_href in self.html_href2html_body_soup:
            if self.html_href2css_href.get(html_href):
                # concatenate every stylesheet linked by this document
                css = ""
                for css_href in self.html_href2css_href[html_href]:
                    css += self.css_href2css_content[css_href]
                html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
                html_content = self.modify_html_soup_with_css_styles(
                    html_content, css)
                self.html_href2html_body_soup[html_href] = html_content

    def build_adjacency_list_from_toc(self, element: Union[Link, tuple, list], lvl=0):
        """
        Function self.adjacency_list builds based on TOC nested structure,
        got from self.ebooklib.toc.
        key = -1 if root (top chapters), value = None if leaf (the least chapters)

        Parameters
        ----------
        element: Union[Link, tuple, list]
            element that appears in TOC (usually parsed from nav.ncx):
            a Link leaf, a (Section, children) tuple, or the top-level list
        lvl: int
            level of node

        Returns
        ----------
        None
            built adjacency list
        """
        if isinstance(element, Link):
            # leaf chapter
            nav_point = NavPoint(element)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
                self.html_href2subchapters_ids[nav_point.href].append(
                    nav_point.id)
            self.adjacency_list[nav_point] = None
            self.hrefs_added_to_toc.add(nav_point.href)
            return nav_point
        elif isinstance(element, tuple):
            # (Section, children) pair — an inner TOC node
            first, second = element
            assert isinstance(first, Section)
            nav_point = NavPoint(first)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
                self.html_href2subchapters_ids[nav_point.href].append(
                    nav_point.id)
            sub_nodes = []
            for elem in second:
                # children of a 1st-level "Section"/"Part" wrapper are hoisted
                # to the top level (collected in offset_sub_nodes, same lvl)
                if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1:
                    self.offset_sub_nodes.append(
                        self.build_adjacency_list_from_toc(elem, lvl))
                else:
                    sub_nodes.append(
                        self.build_adjacency_list_from_toc(elem, lvl + 1))
            self.adjacency_list[nav_point] = sub_nodes
            self.hrefs_added_to_toc.add(nav_point.href)
            return nav_point
        elif isinstance(element, list) and (lvl == 0):
            # root of the TOC
            nodes = []
            # add through every element
            for elem in element:
                nodes.append(
                    self.build_adjacency_list_from_toc(elem, lvl + 1))
            # set chapter_i after Section & add through every offset sub element
            for offset_sub_node in self.offset_sub_nodes:
                nodes.append(offset_sub_node)
            self.offset_sub_nodes = []
            self.adjacency_list[-1] = nodes
        else:
            assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}"

    def is_toc_empty(self) -> bool:
        """Function checks is toc empty"""
        # there is no toc in ebook or no top chapters
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return True
        return False

    def build_adjacency_list_from_spine(self):
        """Build a flat one-level adjacency list from the epub spine order
        (fallback when the book carries no usable TOC)."""
        def build_manifest_id2html_href() -> dict:
            # manifest item id -> document href (spine stores only ids)
            links = dict()
            for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                links[item.id] = item.file_name
            return links
        manifest_id2html_href = build_manifest_id2html_href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            # title falls back to the href itself
            nav_point = NavPoint(
                Section(manifest_id2html_href[id_], manifest_id2html_href[id_]))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(nav_point.href)

    def add_not_added_files_to_adjacency_list(self, not_added: list):
        """Function add files that not added to adjacency list.

        Each leftover document becomes a top-level chapter titled
        "To check #i, filename: ..." so nothing from the book is lost.
        """
        for i, file in enumerate(not_added):
            nav_point = NavPoint(
                Section(f"To check #{i}, filename: {file}", file))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(file)

    def label_subchapters_with_lc_tag(self):
        """Insert an <lc_tmp class="converter-chapter-mark" id=...> marker tag
        immediately before every element whose id the TOC points at, so
        chapter boundaries can be found later by class instead of id."""
        for html_href in self.html_href2html_body_soup:
            ids, soup = self.html_href2subchapters_ids[html_href], \
                self.html_href2html_body_soup[html_href]
            for i in ids:
                tag = soup.find(id=i)
                tmp_tag = soup.new_tag("lc_tmp")
                tmp_tag.attrs["class"] = "converter-chapter-mark"
                tmp_tag.attrs["id"] = i
                tag.insert_before(tmp_tag)

    def chapter_marks_are_same_level(self):
        """
        Function checks that marks for pointing a start of a chapter are
        placed on one level in html tree.
        Mark is tag with "class": "converter-chapter-mark". Added while TOC
        was parsed. This tag must have a chapter_tag as a parent. Otherwise,
        it is wrapped with some tags and the wrappers are unwrapped until the
        mark is a direct child of the document root.
        """
        for html_href in self.html_href2html_body_soup:
            chapter_tag = self.html_href2html_body_soup[html_href]
            # check marks for chapter starting are on the same level - 1st
            marks = chapter_tag.find_all(
                attrs={"class": "converter-chapter-mark"})
            # fix marks to be on 1 level
            for mark in marks:
                while mark.parent != chapter_tag:
                    # todo warning! could reflect on formatting/internal links in some cases
                    mark.parent.unwrap()

    @staticmethod
    def create_unique_id(href, id_):
        # id unique across the whole book: href stripped of punctuation/_/-
        # concatenated with the local id where _ and - become "0"
        return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)

    def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str,
                                    internal_link_tag: Tag) -> Union[str, None]:
        """
        Function used to find full path to file that is parsed from tag link
        TOC: a/b/c.xhtml
        b/c.xhtml -> a/b/c.xhtml
        c.xhtml -> a/b/c.xhtml

        Parameters
        ----------
        cur_file_path: str
            path to current file with tag link
        href_in_link: str
            filename got from tag link, like file1.xhtml
        internal_link_tag: Tag
            object that is parsed now

        Returns
        -------
        full_path[0]: str or None
            first TOC href containing the normalized link path;
            None (and the tag is marked "bad-link") when nothing matches
        """
        dir_name = path.dirname(cur_file_path)
        normed_path = path.normpath(path.join(
            dir_name, href_in_link)).replace("\\", "/")
        # substring match against every href added to the TOC
        # (the comprehension variable shadows os.path only inside itself)
        full_path = [
            path for path in self.hrefs_added_to_toc if normed_path in path]
        if not full_path:
            self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
                            f"While processing href in {internal_link_tag}.")
            internal_link_tag.attrs["converter-mark"] = "bad-link"
            return None
        if len(full_path) > 1:
            self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}"
                            f" while {internal_link_tag} processing. The first one will be chosen.")
        return full_path[0]

    @staticmethod
    def create_new_anchor_span(soup, id_):
        # non-breaking space keeps the span from being dropped as empty
        new_anchor_span = soup.new_tag("span")
        new_anchor_span.attrs["id"] = id_
        new_anchor_span.attrs["class"] = "link-anchor"
        new_anchor_span.string = "\xa0"
        return new_anchor_span

    def process_internal_links(self):
        """
        Function - processing internal links in a book - make ids unique

        Steps
        ----------
        1. rebuild ids to be unique in all documents
        2a. process anchor which is a whole htm|html|xhtml file
        2b. process anchor which is an element in htm|html|xhtml file

        Returns
        -------
        None
            process links in html
        """
        def make_ids_unique():
            # prefix every id with its document href so ids never collide
            # once chapters from different files are merged
            for toc_href in self.hrefs_added_to_toc:
                for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
                    if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
                        new_id = self.create_unique_id(toc_href, tag.attrs["id"])
                        tag.attrs["id"] = new_id

        def process_file_anchor():
            # links whose target is a whole document (no #fragment);
            # the regexp excludes absolute http(s) urls
            for toc_href in self.hrefs_added_to_toc:
                soup = self.html_href2html_body_soup[toc_href]
                for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
                    a_tag_href = internal_link_tag.attrs["href"]
                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
                        toc_href, a_tag_href, internal_link_tag)
                    if a_tag_href_matched_to_toc:
                        new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
                        # placeholder consumed server-side to build the real link
                        internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
                        if new_id not in self.internal_anchors:
                            anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                            new_anchor_span = self.create_new_anchor_span(soup, new_id)
                            # insert a new span to the beginning of the file
                            anchor_soup.insert(0, new_anchor_span)
                            self.internal_anchors.add(new_id)
                        del internal_link_tag.attrs["href"]

        def process_file_element_anchor():
            # links of form file.xhtml#id or #id (element inside a document)
            for toc_href in self.hrefs_added_to_toc:
                soup = self.html_href2html_body_soup[toc_href]
                # process_file_element_anchor
                for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
                    a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
                    # empty file part means the anchor lives in this document
                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
                        toc_href, a_tag_href, internal_link_tag) if a_tag_href \
                        else path.normpath(toc_href).replace("\\", "/")
                    if a_tag_href_matched_to_toc:
                        new_id = self.create_unique_id(
                            a_tag_href_matched_to_toc, a_tag_id)
                        anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                        # fall back to the original id — if link is a footnote
                        # (footnote ids were excluded from make_ids_unique)
                        anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
                            anchor_soup.find_all(attrs={"id": a_tag_id})
                        if anchor_tags:
                            if len(anchor_tags) > 1:
                                self.logger.log(f"Warning in {toc_href}: multiple anchors:"
                                                f"{len(anchor_tags)} found.\n"
                                                f"{anchor_tags}\n"
                                                f"While processing {internal_link_tag}")
                            anchor_tag = anchor_tags[0]
                            assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
                            # if anchor is found we could add placeholder for link creation on server side.
                            internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
                            # create span to have cyclic links, link has 1 type of class, anchor another
                            if anchor_tag.attrs["id"] not in self.internal_anchors:
                                new_anchor_span = self.create_new_anchor_span(
                                    soup, new_id)
                                anchor_tag.insert_before(new_anchor_span)
                                self.internal_anchors.add(new_id)
                                del anchor_tag.attrs["id"]
                            del internal_link_tag.attrs["href"]
                        else:
                            internal_link_tag.attrs["converter-mark"] = "bad-link"
                            self.logger.log(f"Error in {toc_href}."
                                            f" While processing {internal_link_tag} no anchor found."
                                            f" Should be anchor with new id={new_id} in"
                                            f" {a_tag_href_matched_to_toc} file."
                                            f" Old id={a_tag_id}")

        # 1. make ids to be unique in all documents
        make_ids_unique()
        # 2a. process anchor which is a whole htm|html|xhtml file
        process_file_anchor()
        # 2b. process anchor which is an element in htm|html|xhtml file
        process_file_element_anchor()

    @staticmethod
    def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
        """
        Get tags between LiveCarta chapter marks.

        Parameters
        ----------
        first_id: str
            Id that point where a chapter starts. A Tag with class:
            "converter-chapter-mark"
        href: str
            Name of current chapters file
        html_soup: BeautifulSoup
            Soup object of current file

        Returns
        -------
        tags: list [Tag, NavigableString]
            Chapter's tags, extracted (removed) from ``html_soup``
        """
        marked_tags = html_soup.find(
            attrs={"id": first_id, "class": "converter-chapter-mark"})
        if marked_tags:
            next_tag = marked_tags.next_sibling
            tags = []
            # walk siblings until the next chapter mark (or end of document)
            while next_tag:
                if not isinstance(next_tag, NavigableString) and \
                        (next_tag.attrs.get("class") == "converter-chapter-mark"):
                    break
                tags.append(next_tag)
                next_tag = next_tag.next_sibling
            # remove tags between first_id and next found id
            # save them in list for next steps
            tags = [tag.extract() for tag in tags]
            html_soup.smooth()
        else:
            assert 0, f"Warning: no match for {first_id, href}"
        return tags

    def detect_one_chapter(self, nav_point: NavPoint):
        """
        Function updates self.href_chapter_id2soup_html (mapping from
        (href, id) to chapter content/html soup object).

        3 cases:
        id wraps all chapter content,
        id wraps chapter's content + subchapters' content,
        id points to the start of title of a chapter.
        In all cases we know where chapter starts. Therefore, chapter is all
        tags between chapter's id and id of the next chapter/subchapter.
        Recurses into the nav_point's children.

        Parameters
        ----------
        nav_point: NavPoint

        Returns
        -------
        None
            built chapter
        """
        if nav_point.id:
            soup = self.html_href2html_body_soup[nav_point.href]
            subchapter_tags = self.get_tags_between_chapter_marks(
                first_id=nav_point.id, href=nav_point.href, html_soup=soup)
            # re-root the extracted tags in a fresh soup
            new_tree = BeautifulSoup("", "html.parser")
            for subchapter_tag in subchapter_tags:
                new_tree.append(subchapter_tag)
            self.href_chapter_id2soup_html[(
                nav_point.href, nav_point.id)] = new_tree
        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
                self.detect_one_chapter(sub_node)

    def define_chapters_with_content(self):
        """Function build chapters content, starts from top level chapters.

        Only needed when at least one TOC entry carries an #id anchor;
        otherwise whole files are used as chapters as-is.
        """
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for tl_nav_point in top_level_nav_points:
                self.detect_one_chapter(tl_nav_point)

    def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
        """
        Function prepare style, tags to json structure.

        Parameters
        ----------
        nav_point: NavPoint
        lvl: int
            level of chapter

        Returns
        -------
        ChapterItem
            built chapter (with recursively built sub-chapters)
        """
        title = nav_point.title
        # id-anchored chapters use the extracted fragment, others the whole file
        content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
            if nav_point.id else self.html_href2html_body_soup[nav_point.href]
        indent = " " * lvl
        self.logger.log(indent + f"Chapter: {title} is processing.")
        # below SUPPORTED_LEVELS the node is flattened content, not a chapter
        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
        self.logger.log(indent + "Process title.")
        title_preprocessed = self.html_processor.prepare_title(title)
        self.logger.log(indent + "Process content.")
        content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
                                                                   remove_title_from_chapter=is_chapter)
        # rewrite <img src> to uploaded aws paths (cache carried across calls)
        self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
                                                                    self.img_href2img_bytes,
                                                                    path_to_html=nav_point.href,
                                                                    access=self.access,
                                                                    path2aws_path=self.book_image_src_path2aws_path,
                                                                    book_id=Path(self.file_path).stem)
        sub_nodes = []
        # warning! not EpubHtmlItems won't be added to chapter
        # if it doesn't have subchapters
        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
                sub_chapter_item = self.html_node_to_livecarta_chapter_item(
                    sub_node, lvl + 1)
                sub_nodes.append(sub_chapter_item)
        return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)

    def convert_to_dict(self) -> dict:
        """Function which convert list of html nodes to appropriate json structure"""
        top_level_nav_points = self.adjacency_list[-1]
        top_level_chapters = []
        # loop through top level chapters
        for tl_nav_point in top_level_nav_points:
            chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point)
            top_level_chapters.append(chapter)
        top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
        self.logger.log(f"Anchors found: {len(self.internal_anchors)}.")
        self.logger.log("End conversion.")
        return {
            "content": top_level_dict_chapters,
            "footnotes": self.footnotes_contents
        }


if __name__ == "__main__":
    # manual smoke-run: convert one epub and dump the JSON next to it
    epub_file_path = "../../epub/9780763774134.epub"
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])
    preset = PresetProcessor(preset_path="../../config/presets.json",
                             logger=logger_object)\
        .get_preset_json()
    css_processor = CSSPreprocessor()
    html_processor = HtmlEpubPreprocessor(
        preset=preset, logger=logger_object)
    json_converter = EpubConverter(epub_file_path,
                                   logger=logger_object,
                                   css_processor=css_processor,
                                   html_processor=html_processor)
    content_dict = json_converter.convert_to_dict()
    # note: replaces every "epub" occurrence in the path, not just the suffix
    with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
        json.dump(content_dict, f_json, ensure_ascii=False)