This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/epub_postprocessor.py
2021-04-22 17:26:17 +03:00

264 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import codecs
import json
from collections import defaultdict
from typing import Dict, Union
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from ebooklib.epub import Link, Section
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
update_src_links_in_images, preprocess_footnotes
# epub3 examples:
# https://github.com/IDPF/epub3-samples
# specification:
# https://idpf.github.io/epub-vocabs/structure/
# footnotes:
# http://www.theheratik.net/books/tech-epub/chapter-8/
# http://kb.daisy.org/publishing/docs/html/epub-type.html
# todo: http://kb.daisy.org/publishing/docs/html/notes.html
# todo: https://docs.python.org/3/howto/unicode.html
# how the toc is looked up in ebooklib:
# if content.opf has a toc attribute in its spine -> the ncx file can be found -> navMap is read from it
# if it is not there, a nav tag is searched for in the manifest -> EpubNav.
from src.util.css_reader import clean_css, add_inline_style_to_html_soup
class EpubPostprocessor:
    """Convert an EPUB file into a nested chapter structure (plain dicts).

    The whole pipeline runs from ``__init__``:
      1. read images, html documents and css out of the EPUB container;
      2. inline the collected css styles into each html soup;
      3. extract footnotes;
      4. build the chapter tree (``adjacency_list``) from the toc, falling
         back to the spine when the toc is absent or unusable;
      5. split the html into per-chapter soups keyed by (href, id) anchors.

    ``convert_to_dict`` then renders the tree into JSON-serializable dicts.
    """

    def __init__(self, file, access=None):
        """
        :param file: path to the .epub file to post-process
        :param access: access token/context forwarded to image link rewriting
        """
        self.file = file
        self.access = access
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
        # read images: href -> raw image bytes
        self.href2img_bytes = {}
        for image_item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
            # todo: check how the file path is computed inside ebooklib
            self.href2img_bytes[image_item.file_name] = image_item.content
        # read html
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
        # read css
        self.html_href2css_href = {}
        self.css_href2content = {}
        self.build_css_content()
        # add css
        self.add_css_styles2soup()
        # read footnotes
        self.footnotes = []
        for href in self.href2soup_html:
            self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
        # read toc
        self.href2ids = defaultdict(list)
        # key is a NavPoint, or -1 for the virtual root; value is the list of
        # child NavPoints, or None for a leaf.
        # NOTE: the previous annotation ``Dict[Union[NavPoint, -1], ...]`` is
        # invalid typing (-1 is not a type) and raises TypeError at runtime,
        # because annotations on attribute targets ARE evaluated.
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()
        # read anchored blocks, split html into separate blocks
        self.mark_and_line_href2soup_html()  # used only after the toc is parsed: ids from the toc are needed
        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        self.build_anchor2soup()
        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
        #     self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse every html document of the book into a soup of its <body>.

        :return: mapping of document href -> BeautifulSoup of the body content
        """
        # using EpubElements
        # for now just for HTML objects, as it is the simplest chapter kind
        # todo: check if other chapter kinds exist
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_body_text = item.get_body_content()
            nodes[item.file_name] = BeautifulSoup(html_body_text, features='lxml')
        return nodes

    def build_css_content(self):
        """Collect css for every html document.

        Handles both linked stylesheets (<link type="text/css">) and inline
        <style> tags. Populates ``html_href2css_href`` (html href -> css key)
        and ``css_href2content`` (css key -> cleaned css text).
        """
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            soup = BeautifulSoup(item.content, features='lxml')
            # NOTE(review): only one stylesheet per html document is kept —
            # a later <link> or <style> overwrites an earlier mapping.
            for tag in soup.find_all('link', attrs={"type": "text/css"}):
                css_href = tag.attrs.get('href')
                if not css_href:
                    continue
                if css_href not in self.css_href2content:
                    css_item = self.ebooklib_book.get_item_with_href(css_href)
                    if css_item is None:
                        # broken stylesheet link inside the EPUB: skip it
                        # instead of crashing on None.get_content()
                        continue
                    self.css_href2content[css_href] = clean_css(css_item.get_content().decode())
                self.html_href2css_href[item.file_name] = css_href
            for i, tag in enumerate(soup.find_all('style')):
                css_content = tag.string
                if not css_content:
                    continue
                # the key must be namespaced per document: a plain f'href{i}'
                # collides between different html files that both have inline
                # styles, silently overwriting one file's css with another's
                css_key = f'{item.file_name}#style{i}'
                self.html_href2css_href[item.file_name] = css_key
                self.css_href2content[css_key] = clean_css(css_content)

    def add_css_styles2soup(self):
        """Inline the collected css into every html soup that has styles."""
        for href in self.href2soup_html:
            css_key = self.html_href2css_href.get(href)
            if css_key:
                css: str = self.css_href2content[css_key]
                self.href2soup_html[href] = add_inline_style_to_html_soup(self.href2soup_html[href], css)

    def build_manifest_id2href(self):
        """Map manifest item id -> document href, for resolving spine entries."""
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            links[item.id] = item.file_name
        return links

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """Recursively turn the ebooklib toc into ``adjacency_list``.

        ebooklib represents the toc as nested structures: a Link is a leaf,
        a (Section, children) tuple is an inner node, and the root is a list.

        :param element: book.toc (the root list) on the first call
        :param lvl: recursion depth; a bare list is only legal at lvl == 0
        :return: the NavPoint created for ``element`` (None for the root list)
        :raises TypeError: on an unexpected toc element type
        """
        if isinstance(element, Link):
            # todo: check if the link target exists
            node = NavPoint(element)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            self.adjacency_list[node] = None  # leaf
            return node
        elif isinstance(element, tuple):
            first, second = element
            assert isinstance(first, Section)
            node = NavPoint(first)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            sub_nodes = []
            for child in second:
                sub_nodes.append(self.build_adjacency_list_from_toc(child, lvl + 1))
            self.adjacency_list[node] = sub_nodes
            return node
        elif isinstance(element, list) and (lvl == 0):
            sub_nodes = []
            for child in element:
                sub_nodes.append(self.build_adjacency_list_from_toc(child, lvl + 1))
            self.adjacency_list[-1] = sub_nodes
        else:
            # was ``assert 0`` — asserts are stripped under -O, so raise
            raise TypeError(f'Error. Element is not tuple/Link instance: {type(element)}')

    def is_toc_valid(self):
        """A toc is usable when ebooklib parsed one and it yielded a root entry."""
        return (self.ebooklib_book.toc is not None) and (self.adjacency_list.get(-1) is not None)

    def build_adjacency_list_from_spine(self):
        """Fallback toc: one flat top-level chapter per spine document."""
        manifest_id2href = self.build_manifest_id2href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            # the href doubles as the chapter title — there is nothing better
            node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
            self.adjacency_list[-1].append(node)

    def mark_and_line_href2soup_html(self):
        """Insert marker headings at every toc-referenced id and flatten html.

        Must run after the toc is parsed: it uses ``href2ids``.
        """
        # mark: an <h1 class="internal-mark"> before each anchored tag lets the
        # splitter find chapter boundaries later
        for href in self.href2soup_html:
            soup = self.href2soup_html[href]
            for anchor_id in self.href2ids[href]:
                tag = soup.find(id=anchor_id)
                if tag is None:
                    # the toc points at an id that is absent from the html;
                    # skip instead of crashing on None.insert_before
                    continue
                new_h = soup.new_tag('h1')
                new_h.attrs['class'] = 'internal-mark'
                new_h.attrs['id'] = anchor_id
                tag.insert_before(new_h)
        # go to line structure
        for href in self.href2soup_html:
            self.href2soup_html[href] = unwrap_structural_tags(self.href2soup_html[href])

    def build_one_anchored_section(self, node):
        """Cut the soup of ``node``'s chapter out of its (flattened) document.

        By this point the html soup already exists in a linear form —
        if it is not linear, that is not our fault.
        There are 3 cases:
          - the id wraps the whole content,
          - the id wraps the content of the chapter and its sub-chapters,
          - the id only points at the heading.
        In all 3 cases we know where the heading starts, so a chapter is all
        tags from the current heading up to whatever heading comes next.
        A tag counts as a heading if the toc refers to its id — i.e. any tag
        whose id appears in the toc.
        :return: None; fills ``id_anchor2soup[(href, id)]``
        """
        if node.id:
            soup = self.href2soup_html[node.href]
            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.id_anchor2soup[(node.href, node.id)] = new_tree
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                self.build_one_anchored_section(sub_node)
        # print(f'Chapter: {node.href, node.id} is split.')

    def build_anchor2soup(self):
        """Split every document into anchored sections, if the toc uses anchors."""
        nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in nav_points:
                self.build_one_anchored_section(point)

    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
        """Recursively convert a toc node (and its subtree) into a ChapterItem."""
        title = node.title
        if node.id:
            # anchored chapter: use the pre-cut section soup
            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
        else:
            # whole-document chapter
            content: BeautifulSoup = self.href2soup_html[node.href]
        update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
        sub_nodes = []
        # warning! non-EpubHtmlItems won't be added to the chapter
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                sub_nodes.append(self.node2livecarta_chapter_item(sub_node))
        # print(f'Chapter: {title} is prepared.')
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self):
        """Render the whole book as a JSON-serializable dict.

        :return: {"content": [chapter dicts...], "footnotes": [...]}
        """
        top_level_nav_points = self.adjacency_list[-1]
        top_level_chapters = []
        for nav_point in top_level_nav_points:
            top_level_chapters.append(self.node2livecarta_chapter_item(nav_point))
        return {
            "content": [chapter.to_dict() for chapter in top_level_chapters],
            "footnotes": self.footnotes
        }
if __name__ == "__main__":
    # dev entry point: convert a local EPUB and dump the result next to it
    json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/Chaos_Engineering.epub')
    tmp = json_converter.convert_to_dict()
    # builtin open() handles text encoding natively in Python 3;
    # codecs.open is a Python-2-era legacy API
    with open('tmp.json', 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)