# BookConverter/src/epub_converter/epub_converter.py

import re
import json
import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from collections import defaultdict
from typing import Dict, Union, List


import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag


from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
    prepare_title, prepare_content, update_images_src_links, preprocess_footnotes


class EpubConverter:
    def __init__(self, file_path, access=None, logger=None):
        """
        Read the epub and run every preprocessing step, so that convert_to_dict() can be called afterwards.

        Parameters
        ----------
        file_path
            path to the .epub file, handed to ebooklib's epub.read_epub
        access
            stored as is and passed to update_images_src_links when chapters are exported
        logger: BookLogger
            object exposing log(message); if None, messages are silently dropped

        """
        if logger is None:
            # logger is optional in the signature, yet every step below logs:
            # substitute a do-nothing stand-in so that logger=None does not fail on None.log(...)
            class _SilentLogger:
                def log(self, *args, **kwargs):
                    pass

            logger = _SilentLogger()

        self.file_path = file_path
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file_path)

        # main container for all epub .xhtml files
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
        # enumerate all subchapter id for each file
        self.html_href2subchapter_ids = defaultdict(list)
        self.hrefs_added_to_toc = set()  # enumerate all file paths that were added to TOC

        # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
        # key = -1 for top level NavPoints
        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}

        # list to offset Chapter_i on 1st level
        self.offset_sub_nodes = []

        # container for all chapters soup objects
        # here soup object is only part of the .xhtml file
        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}

        self.internal_anchors = set()
        # flag to be updated while ebooklib.toc is parsed
        self.id_anchor_exist_in_nav_points = False
        self.img_href2img_bytes = {}  # file path to bytes
        # image path as written in the book to generated aws path
        self.book_image_src_path2aws_path = {}
        self.footnotes_contents: List[str] = []  # to be sent on server as is
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote

        self.logger.log('Image processing.')
        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
            content = x.content
            self.img_href2img_bytes[file_name] = content

        self.logger.log('HTML files reading.')
        self.html_href2html_body_soup: Dict[str,
                                            BeautifulSoup] = self.build_href2soup_content()
        # TODO Presets

        self.logger.log('Process CSS inline styles.')
        self.process_inline_styles_in_html_soup()
        self.logger.log('CSS files processing.')
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
        self.logger.log('CSS styles adding.')
        self.add_css_styles_to_html_soup()

        self.logger.log('Footnotes processing.')
        for href in self.html_href2html_body_soup:
            content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
                                                                     self.html_href2html_body_soup)
            self.footnotes_contents.extend(content)
            self.noterefs.extend(noterefs)
            self.footnotes.extend(footnotes_tags)

        # number the footnotes book-wide and link every footnote back to its reference;
        # zip() pairs them by position and stops at the shorter list
        for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
            noteref.attrs['data-id'] = i + 1
            noteref.attrs['id'] = f'footnote-{i + 1}'
            footnote.attrs['href'] = f'#footnote-{i + 1}'

        self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
        self.logger.log('TOC processing.')
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
        if self.is_toc_empty():
            self.build_adjacency_list_from_spine()
        not_added = [
            x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
        self.logger.log(f'Html documents not added to TOC: {not_added}.')
        self.add_not_added_files_to_adjacency_list(not_added)
        self.logger.log('Html internal links and structure processing.')
        self.label_chapters_ids_with_tmp_id()
        # used only after parsed toc, ids from toc needed
        self.process_html_soup_structure_to_line()
        self.process_internal_links()
        self.logger.log('Building chapters content.')
        self.define_chapters_content()

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse the body of every document item of the book; the result maps item file name to its soup."""
        # for now only HTML document items are read
        documents = self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        # html.parser closes tags if needed
        return {
            document.file_name: BeautifulSoup(document.get_body_content(), features='html.parser')
            for document in documents
        }

    def get_css_content(self, css_href, html_href):
        path_to_css_from_html = css_href
        html_folder = dirname(html_href)
        path_to_css_from_root = normpath(
            join(html_folder, path_to_css_from_html)).replace('\\', '/')
        css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
        # if in css file we import another css
        if "@import" in str(css_obj.content):
            path_to_css_from_root = "css/" + \
                re.search('"(.*)"', str(css_obj.content)).group(1)
            css_obj = self.ebooklib_book.get_item_with_href(
                path_to_css_from_root)
        assert css_obj, f'Css style {css_href} was not in manifest.'
        css_content: str = css_obj.get_content().decode()
        return css_content

    def process_inline_styles_in_html_soup(self):
        """Pass the style attribute of every supported tag through build_inline_style_content."""
        # names of the tags whose inline style is converted
        styled_tag_names = re.compile(
            '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
        any_style_value = re.compile('.*')

        for soup in self.html_href2html_body_soup.values():
            for styled_tag in soup.find_all(styled_tag_names, attrs={'style': any_style_value}):
                original_style = styled_tag.attrs['style']
                styled_tag.attrs['style'] = build_inline_style_content(original_style)

    def build_html_and_css_relations(self) -> tuple[dict, dict]:
        """
        Function is designed to get 2 dictionaries:
        The first is html_href2css_href. It connects the href of an html document to the keys of
        all css sources used by this document (linked css files and the document's own style tags)
        The second is css_href2css_content. It connects such a key to the processed css content
        ...2... = key2value
        Returns
        ----------
        html_href2css_href, css_href2css_content: tuple[dict, dict]
            dictionary: href of html to related css keys, dictionary: css key to related css content

        """
        # dictionary: href of html to related css files
        html_href2css_href: defaultdict = defaultdict(list)
        css_href2css_content: dict = {}

        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_href = item.file_name
            # the whole document is parsed here (not only the body), because the css lives in the head
            soup_html_content = BeautifulSoup(item.content, features='lxml')

            # check if file links to css file
            for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
                # alternate page of original page (e.g. another language)
                if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
                    continue
                css_href = tag.attrs.get('href')
                if not css_href:
                    # a link tag without href names no file, so there is nothing to resolve
                    continue
                html_href2css_href[html_href].append(css_href)
                if css_href not in css_href2css_content:
                    # css_href not in css_href2css_content, add to this dict
                    css_href2css_content[css_href] = build_css_file_content(
                        self.get_css_content(css_href, html_href))

            for i, tag in enumerate(soup_html_content.find_all('style')):
                css_content = tag.string
                if css_content is None:
                    # tag.string is None for a style tag without a single text child: no css to add
                    continue
                # the key includes the document href: a bare counter would be shared by all documents,
                # and the style tag of a later document would replace the css of an earlier one
                style_key = f'{html_href}#style{i}'
                html_href2css_href[html_href].append(style_key)
                css_href2css_content[style_key] = build_css_file_content(
                    css_content)
        return html_href2css_href, css_href2css_content

    def add_css_styles_to_html_soup(self):
        """
        Replace every soup in html_href2html_body_soup that has related css
        with the result of convert_html_soup_with_css_style(soup, css),
        where css is the concatenation of all css contents related to the document.

        """
        for html_href, soup in self.html_href2html_body_soup.items():
            css_hrefs = self.html_href2css_href.get(html_href)
            if not css_hrefs:
                # document without any css: left untouched
                continue
            css = ''.join(self.css_href2css_content[css_href] for css_href in css_hrefs)
            self.html_href2html_body_soup[html_href] = convert_html_soup_with_css_style(soup, css)

    def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
        """
        Recursively fill self.adjacency_list from the nested TOC structure (self.ebooklib_book.toc)

        key = -1 holds the list of top level NavPoints,
        value = None marks a leaf (a NavPoint built from a Link)
        Along the way the href of every NavPoint is added to self.hrefs_added_to_toc and
        every NavPoint id is registered in self.html_href2subchapter_ids.
        Parameters
        ----------
        element: [Link, tuple, list]
            element that appears in TOC: a Link, a (Section, children) tuple or, at lvl 0 only, the list of them
        lvl: int
            level of node

        Returns
        ----------
        NavPoint or None
            NavPoint built for a Link or a tuple; None for the top level list

        """
        def remember_anchor(point):
            # a NavPoint with an id marks a place inside a file
            if point.id:
                self.id_anchor_exist_in_nav_points = True
                self.html_href2subchapter_ids[point.href].append(point.id)

        if isinstance(element, Link):
            leaf = NavPoint(element)
            remember_anchor(leaf)
            self.adjacency_list[leaf] = None
            self.hrefs_added_to_toc.add(leaf.href)
            return leaf

        if isinstance(element, tuple):
            section, children = element
            assert isinstance(section, Section)
            parent = NavPoint(section)
            remember_anchor(parent)

            nested = []
            for child in children:
                titled_as_part = 'section' in section.title.lower() or 'part' in section.title.lower()
                if titled_as_part and lvl == 1:
                    # children of a top level "section"/"part" are not nested:
                    # they are parked here and moved to the top level by the list branch below
                    self.offset_sub_nodes.append(
                        self.build_adjacency_list_from_toc(child, lvl))
                else:
                    nested.append(
                        self.build_adjacency_list_from_toc(child, lvl + 1))

            self.adjacency_list[parent] = nested
            self.hrefs_added_to_toc.add(parent.href)
            return parent

        if isinstance(element, list) and lvl == 0:
            top_level = []
            for entry in element:
                top_level.append(
                    self.build_adjacency_list_from_toc(entry, lvl + 1))
                # the parked children follow their Section on the top level
                top_level.extend(self.offset_sub_nodes)
                self.offset_sub_nodes = []

            self.adjacency_list[-1] = top_level
            return None

        assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'

    def is_toc_empty(self) -> bool:
        """Function checks is toc empty"""
        # there is no toc in ebook or no top chapters
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return True
        return False

    def build_manifest_id2html_href(self) -> dict:
        """Map the id of every document item of the book to its file name."""
        return {
            document.id: document.file_name
            for document in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        }

    def build_adjacency_list_from_spine(self):
        """
        Rebuild self.adjacency_list as a flat TOC: one top level NavPoint per spine entry.

        The file name of the document is used both as title and as href of the NavPoint.
        Spine entries whose id belongs to no document item are skipped.

        """
        manifest_id2html_href = self.build_manifest_id2html_href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            html_href = manifest_id2html_href.get(id_)
            if html_href is None:
                # only document items are in the mapping; an id outside of it cannot become a chapter
                if self.logger:
                    self.logger.log(f'Spine item {id_} is not an html document, skipped.')
                continue
            nav_point = NavPoint(Section(html_href, html_href))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(nav_point.href)

    def add_not_added_files_to_adjacency_list(self, not_added):
        """Append a top level NavPoint titled 'To check #i, filename: ...' for every file in not_added."""
        for index, href in enumerate(not_added):
            placeholder_title = f'To check #{index}, filename: {href}'
            placeholder = NavPoint(Section(placeholder_title, href))
            self.adjacency_list[-1].append(placeholder)
            self.hrefs_added_to_toc.add(href)

    def label_chapters_ids_with_tmp_id(self):
        for html_href in self.html_href2html_body_soup:
            ids = self.html_href2subchapter_ids[html_href]
            for i in ids:
                soup = self.html_href2html_body_soup[html_href]
                tag = soup.find(id=i)
                new_h = soup.new_tag('tmp')
                new_h.attrs['class'] = 'converter-chapter-mark'
                new_h.attrs['id'] = i
                tag.insert_before(new_h)

    def process_html_soup_structure_to_line(self):
        """Replace every document soup with the result of unwrap_structural_tags (go to line structure)."""
        for html_href, soup in self.html_href2html_body_soup.items():
            self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)

    @staticmethod
    def create_unique_id(href, id_):
        return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)

    @staticmethod
    def create_new_anchor_span(soup, id_):
        new_anchor_span = soup.new_tag("span")
        new_anchor_span.attrs['id'] = id_
        new_anchor_span.attrs['class'] = 'link-anchor'
        new_anchor_span.string = "\xa0"
        return new_anchor_span

    def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
        """
        Function used to find full path to file that is parsed from tag link
        TOC: a/b/c.xhtml
        b/c.xhtml -> a/b/c.xhtml
        c.xhtml -> a/b/c.xhtml
        Parameters
        ----------
        cur_file_path: str
            path to current file with tag link
        href_in_link: str
            filename got from tag link, like file1.xhtml
        internal_link_tag: Tag
            object that is parsed now

        Returns
        -------
        full_path[0]: str
            prepared content

        """
        dir_name = os.path.dirname(cur_file_path)
        normed_path = os.path.normpath(os.path.join(
            dir_name, href_in_link)).replace('\\', '/')
        full_path = [
            path for path in self.hrefs_added_to_toc if normed_path in path]
        if not full_path:
            self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
                            f'While processing href in {internal_link_tag}.')
            internal_link_tag.attrs['converter-mark'] = 'bad-link'
            return None

        if len(full_path) > 1:
            self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
                            f' while {internal_link_tag} processing. The first one will be chosen.')

        return full_path[0]

    def process_internal_links(self):
        """
        Function
        - processing internal links in a book
        - make ids unique
        Steps
        ----------
        1. rebuild ids to be unique in all documents
        2a. process anchor which is a whole xhtml file
        2b. process anchor which is an element in xhtml file
        Returns
        -------
        None
            process links in html

        """
        # 1. rebuild ids to be unique in all documents
        for toc_href in self.hrefs_added_to_toc:
            for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
                if tag.attrs.get('class') == 'converter-chapter-mark':
                    continue

                if tag.attrs.get('class') == 'footnote-element':
                    continue

                new_id = self.create_unique_id(toc_href, tag.attrs['id'])
                tag.attrs['id'] = new_id

        # 2a. process anchor which is a whole xhtml file
        internal_link_reg1 = re.compile(
            r'(^(?!https?://).+\.(htm|html|xhtml)$)')
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
                a_tag_href = internal_link_tag.attrs['href']
                # find full path
                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
                    toc_href, a_tag_href, internal_link_tag)
                if not a_tag_href_matched_to_toc:
                    continue
                new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
                internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                if new_id not in self.internal_anchors:
                    anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                    new_anchor_span = self.create_new_anchor_span(soup, new_id)
                    # insert a new span to the beginning of the file
                    anchor_soup.insert(0, new_anchor_span)
                    self.internal_anchors.add(new_id)

                del internal_link_tag.attrs['href']

        # 2b. process anchor which is an element in xhtml file
        internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
                a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
                    '#')
                # find full path
                if a_tag_href:
                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
                                                                                 internal_link_tag)
                else:
                    a_tag_href_matched_to_toc = os.path.normpath(
                        toc_href).replace('\\', '/')

                if not a_tag_href_matched_to_toc:
                    continue

                new_id = self.create_unique_id(
                    a_tag_href_matched_to_toc, a_tag_id)

                anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
                anchor_tags = anchor_tags or anchor_soup.find_all(
                    attrs={'id': a_tag_id})  # if link is a footnote

                if anchor_tags:
                    if len(anchor_tags) > 1:
                        self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
                                        f'{anchor_tags}\n'
                                        f' While processing {internal_link_tag}')

                    anchor_tag = anchor_tags[0]
                    assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
                    # if anchor is found we could add placeholder for link creation on server side.
                    internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                    # create span to have cyclic links, link has 1 type of class, anchor another
                    if anchor_tag.attrs['id'] not in self.internal_anchors:
                        new_anchor_span = self.create_new_anchor_span(
                            soup, new_id)
                        anchor_tag.insert_before(new_anchor_span)
                        self.internal_anchors.add(new_id)
                        del anchor_tag.attrs['id']
                    del internal_link_tag.attrs['href']

                else:
                    internal_link_tag.attrs['converter-mark'] = 'bad-link'
                    self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
                                    f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
                                    f' Old id={a_tag_id}')

    def build_one_chapter(self, nav_point: NavPoint):
        """
        Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)

        3 cases:
            id wraps all chapter content,
            id wraps chapter's content + subchapters' content
            id points to the start of title of a chapter

        In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
        and id of the next chapter/subchapter
        Parameters
        ----------
        nav_point: NavPoint

        Returns
        -------
        None
            built chapter

        """
        if nav_point.id:
            soup = self.html_href2html_body_soup[nav_point.href]
            chapter_tags = get_tags_between_chapter_marks(
                first_id=nav_point.id, href=nav_point.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.href_chapter_id2soup_html[(
                nav_point.href, nav_point.id)] = new_tree

        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
                self.build_one_chapter(sub_node)

    def define_chapters_content(self):
        """Function build chapters content, starts from top level chapters"""
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in top_level_nav_points:
                self.build_one_chapter(point)

    def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
        """
        Convert a NavPoint and, recursively, all NavPoints below it to a ChapterItem.

        The content is the chapter soup stored for (href, id) if the NavPoint has an id,
        the whole document soup otherwise. Image links of the content are updated
        before title and content are prepared.
        Parameters
        ----------
        nav_point: NavPoint
            node of the TOC tree
        lvl: int
            level of the node, 1 for top level chapters

        Returns
        -------
        ChapterItem
            prepared title, prepared content and the list of sub chapters

        """
        title = nav_point.title
        content: BeautifulSoup = (
            self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
            if nav_point.id
            else self.html_href2html_body_soup[nav_point.href]
        )

        book_id = self.file_path.stem if hasattr(self.file_path, 'stem') else 'book_id'
        self.book_image_src_path2aws_path = update_images_src_links(
            content,
            self.img_href2img_bytes,
            path_to_html=nav_point.href,
            access=self.access,
            path2aws_path=self.book_image_src_path2aws_path,
            book_id=book_id)

        # the title is removed from the content only on supported levels
        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
        title_preprocessed = prepare_title(title)
        content_preprocessed = prepare_content(title_preprocessed, content,
                                               remove_title_from_chapter=is_chapter)

        # warning! not EpubHtmlItems won't be added to chapter
        sub_nodes = [
            self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
            for sub_node in self.adjacency_list.get(nav_point) or ()
        ]

        if self.logger:
            indent = ' ' * lvl
            self.logger.log(f'{indent}Chapter: {title} is prepared.')
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self) -> dict:
        """Function which convert list of html nodes to appropriate json structure"""
        top_level_nav_points = self.adjacency_list[-1]
        top_level_chapters = []

        for nav_point in top_level_nav_points:
            chapter = self.node_to_livecarta_chapter_item(nav_point)
            top_level_chapters.append(chapter)
        top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
        self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
        self.logger.log('End conversion.')

        return {
            "content": top_level_dict_chapters,
            "footnotes": self.footnotes_contents
        }


if __name__ == "__main__":
    # manual run: convert one book and write the result as json
    epub_file_path = '../../epub/9781614382264.epub'
    book_file_name = epub_file_path.split('/')[-1]
    logger_object = BookLogger(name='epub', book_id=book_file_name)

    content_dict = EpubConverter(epub_file_path, logger=logger_object).convert_to_dict()

    # every 'epub' in the path is replaced, the folder name as well as the extension
    json_file_path = epub_file_path.replace('epub', 'json')
    with codecs.open(json_file_path, 'w', encoding='utf-8') as f_json:
        json.dump(content_dict, f_json, ensure_ascii=False)