epub converter: rename from epub_converter.py

2021-04-19 11:35:38 +03:00
parent 1df37b6122
commit dce0f871a8
1 changed files with 20 additions and 19 deletions
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -0,0 +1,231 @@
+import codecs
+import json
+import re
+import os
+from collections import defaultdict
+from typing import Dict, Union
+
+import ebooklib
+from bs4 import BeautifulSoup
+from ebooklib import epub
+from ebooklib.epub import Link, Section
+from ebooklib.utils import debug
+
+from src.data_objects import ChapterItem, NavPoint
+from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
+    preprocess_image, preprocess_footnotes
+
+
+# epub3 examples:
+# https://github.com/IDPF/epub3-samples
+# specification:
+# https://idpf.github.io/epub-vocabs/structure/
+# footnotes:
+# http://www.theheratik.net/books/tech-epub/chapter-8/
+# http://kb.daisy.org/publishing/docs/html/epub-type.html
+# todo: http://kb.daisy.org/publishing/docs/html/notes.html
+# todo: https://docs.python.org/3/howto/unicode.html
+
+
+class EpubPostprocessor:
+    """Convert an EPUB file into a nested chapter structure plus footnotes.
+
+    The book is read with ebooklib, the body of every HTML document is parsed
+    into a BeautifulSoup tree, and the table of contents (or the spine, when
+    the TOC is unusable) is stored as an adjacency list of ``NavPoint`` nodes.
+    ``convert_to_dict`` walks that adjacency list to produce the result.
+    """
+
+    def __init__(self, file):
+        """Read ``file`` and build every lookup table used by the conversion.
+
+        :param file: EPUB source, passed unchanged to ``epub.read_epub``.
+        """
+        self.file = file
+        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
+        # image file name (as reported by ebooklib) -> image content
+        self.href2img_bytes = {}
+
+        for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
+            debug(x)
+            # todo: check how file path is count in lib
+            self.href2img_bytes[x.file_name] = x.content
+
+        # switched to True as soon as one TOC entry carries an id anchor
+        self.id_anchor_exist_in_nav_points = False
+        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
+        self.footnotes = []
+        for soup in self.href2soup_html.values():
+            self.footnotes.extend(preprocess_footnotes(soup, self.href2soup_html,
+                                                       noteref_attr_name='data-type'))
+        # If the spine in content.opf has a toc attribute, the NCX file can be found and its navMap read.
+        # If it is absent, a nav tag is looked up in the manifest -> EpubNav; this is the EPUB 3 case
+        # (not tested, todo).
+        # document href -> ids that the TOC references inside that document
+        self.href2ids = defaultdict(list)
+        # k = -1 if root, v = None if leaf
+        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}
+        # A missing TOC is left to is_toc_valid(), which triggers the spine fallback below.
+        if self.ebooklib_book.toc is not None:
+            self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
+        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
+        # (href, id) of a TOC entry -> soup holding only the tags of that section
+        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
+
+        if not self.is_toc_valid():
+            self.build_adjacency_list_from_spine()
+
+        self.build_anchor2soup()
+
+        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
+        #     self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine
+
+    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
+        """Parse the body of every document item of the book.
+
+        :return: mapping of item file name -> BeautifulSoup tree (``lxml`` parser).
+        """
+        # using EpubElements
+        # for now just for HTML objects, as it is simplest chapter
+        # todo: check if other chapters exist
+        return {
+            item.file_name: BeautifulSoup(item.get_body_content(), features='lxml')
+            for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
+        }
+
+    def build_manifest_id2href(self):
+        """Return a mapping of item id -> file name for every document item of the book."""
+        return {
+            item.id: item.file_name
+            for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
+        }
+
+    def _register_anchor(self, node):
+        """Remember the id anchor of ``node`` (when it has one) under the node's href."""
+        if node.id:
+            self.id_anchor_exist_in_nav_points = True
+            self.href2ids[node.href].append(node.id)
+
+    def build_adjacency_list_from_toc(self, element, lvl=0):
+        """Recursively translate an ebooklib TOC element into adjacency-list entries.
+
+        Supported elements: a ``Link`` (leaf), a ``(Section, children)`` tuple
+        (inner node) and, only at ``lvl == 0``, the root list of the TOC, which
+        is stored under the key ``-1``.
+
+        :param element: TOC element to convert; use ``book.toc`` as the root.
+        :param lvl: nesting depth of ``element``; 0 for the root.
+        :return: the ``NavPoint`` created for ``element``, or ``None`` for the root list.
+        :raises AssertionError: if ``element`` has an unsupported type or a tuple
+            does not start with a ``Section``.
+        """
+        if isinstance(element, Link):
+            # todo: check if link exists
+            node = NavPoint(element)
+            self._register_anchor(node)
+            self.adjacency_list[node] = None
+            return node
+
+        if isinstance(element, tuple):
+            first, second = element
+            # raised explicitly (not via ``assert``) so the check survives ``python -O``
+            if not isinstance(first, Section):
+                raise AssertionError(f'Error. First tuple item is not Section instance: {type(first)}')
+            node = NavPoint(first)
+            self._register_anchor(node)
+            self.adjacency_list[node] = [self.build_adjacency_list_from_toc(i, lvl + 1) for i in second]
+            return node
+
+        if isinstance(element, list) and (lvl == 0):
+            self.adjacency_list[-1] = [self.build_adjacency_list_from_toc(i, lvl + 1) for i in element]
+            return None
+
+        raise AssertionError(f'Error. Element is not tuple/Link instance: {type(element)}')
+
+    def is_toc_valid(self):
+        """Tell whether the parsed TOC produced at least one top-level entry.
+
+        A missing TOC, a TOC without a root list and an empty root list all
+        count as invalid, so that the caller falls back to the spine.
+        """
+        if self.ebooklib_book.toc is None:
+            return False
+        return bool(self.adjacency_list.get(-1))
+
+    def build_adjacency_list_from_spine(self):
+        """Replace the adjacency list with a flat one built from the spine.
+
+        Each spine entry that refers to a document item becomes a top-level
+        node whose title and href are both the item's file name.
+        """
+        manifest_id2href = self.build_manifest_id2href()
+        self.adjacency_list = {
+            -1: []
+        }
+        for id_, _ in self.ebooklib_book.spine:
+            href = manifest_id2href.get(id_)
+            if href is None:
+                # the spine entry does not refer to a document item, nothing to add
+                continue
+            self.adjacency_list[-1].append(NavPoint(Section(href, href)))
+
+    def mark_and_line_href2soup_html(self):
+        """Mark TOC anchors in the documents, then pass every document through ``unwrap_structural_tags``."""
+        # mark: put an empty <h1 class="internal-mark"> carrying the anchor id right before the anchored tag
+        for href, soup in self.href2soup_html.items():
+            for i in self.href2ids[href]:
+                tag = soup.find(id=i)
+                if tag is None:
+                    # the TOC references an id that this document does not contain
+                    continue
+                new_h = soup.new_tag('h1')
+                new_h.attrs['class'] = 'internal-mark'
+                new_h.attrs['id'] = i
+                tag.insert_before(new_h)
+
+        # go to line structure
+        for href in self.href2soup_html:
+            self.href2soup_html[href] = unwrap_structural_tags(self.href2soup_html[href])
+
+    def build_one_anchored_section(self, node):
+        """
+        Cut the section of ``node`` out of its document and do the same for all its sub-nodes.
+
+        By this point the html soup already exists in linear form
+        - if it is not linear, that is not our fault.
+
+        There are 3 cases:
+         the id wraps the whole content,
+         the id wraps the content of the chapter and of a sub-chapter,
+         the id only points at the heading.
+
+        In all 3 cases we know where the heading starts. Therefore a chapter is
+        every tag from the current heading up to whatever heading comes next.
+
+        A heading is taken into account only if the toc specifies an id; in that
+        case a heading is any tag with an id from the toc.
+
+        :param node: NavPoint whose section (and sub-sections) should be stored in ``id_anchor2soup``.
+        :return:
+        """
+        if node.id:
+            soup = self.href2soup_html[node.href]
+            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
+            new_tree = BeautifulSoup('', 'html.parser')
+            for tag in chapter_tags:
+                new_tree.append(tag)
+            self.id_anchor2soup[(node.href, node.id)] = new_tree
+
+        # leaves map to None and unknown nodes are absent: both mean "no children"
+        for sub_node in self.adjacency_list.get(node) or []:
+            self.build_one_anchored_section(sub_node)
+
+        # print(f'Chapter: {node.href, node.id} is split.')
+
+    def build_anchor2soup(self):
+        """Fill ``id_anchor2soup`` for the whole tree; a no-op when no TOC entry has an id anchor."""
+        if not self.id_anchor_exist_in_nav_points:
+            return
+        for point in self.adjacency_list[-1]:
+            self.build_one_anchored_section(point)
+
+    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
+        """Build the ``ChapterItem`` for ``node``, including the items of all its sub-nodes.
+
+        :param node: navigation point to convert.
+        :return: chapter item with preprocessed title, preprocessed content and sub-chapter items.
+        """
+        title = node.title
+        if node.id:
+            # anchored entry: use the section that was cut out for this (href, id)
+            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
+        else:
+            # entry without an anchor: use the whole document
+            content: BeautifulSoup = self.href2soup_html[node.href]
+
+        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
+        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
+
+        # warning! not EpubHtmlItems won;t be added to chapter
+        sub_nodes = [
+            self.node2livecarta_chapter_item(sub_node)
+            for sub_node in self.adjacency_list.get(node) or []
+        ]
+
+        # print(f'Chapter: {title} is prepared.')
+        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
+
+    def convert_to_dict(self):
+        """Convert the whole book.
+
+        :return: dict with the top-level chapters (each converted with ``to_dict``)
+            under ``"content"`` and the collected footnotes under ``"footnotes"``.
+        """
+        top_level_chapters = [
+            self.node2livecarta_chapter_item(nav_point)
+            for nav_point in self.adjacency_list[-1]
+        ]
+
+        return {
+            "content": [chapter.to_dict() for chapter in top_level_chapters],
+            "footnotes": self.footnotes
+        }
+
+
+if __name__ == "__main__":
+    import sys
+
+    # The EPUB to convert may be given as the first command-line argument;
+    # without it the original development sample path is used.
+    epub_path = sys.argv[1] if len(sys.argv) > 1 else \
+        '/home/katerina/PycharmProjects/Jenia/converter/epub/Chaos_Engineering.epub'
+    json_converter = EpubPostprocessor(epub_path)
+    tmp = json_converter.convert_to_dict()
+
+    # ensure_ascii=False keeps non-ASCII text readable, so the file is written as UTF-8
+    with open('tmp.json', 'w', encoding='utf-8') as f:
+        json.dump(tmp, f, ensure_ascii=False)
+