epub converter: add files

This commit is contained in:
shirshasa
2021-04-14 14:29:19 +03:00
parent 4eb30bd80c
commit 880b045de0
3 changed files with 371 additions and 0 deletions

62
src/data_objects.py Normal file
View File

@@ -0,0 +1,62 @@
import re
from typing import Union
from ebooklib.epub import Section, Link
"""
These are data structures which form mapping from NCX to python data structures.
"""
class NavPoint:
def __init__(self, obj: Union[Link, Section]=None, ):
self.href, self.id = self.parse_href_id(obj)
self.title = obj.title
@staticmethod
def parse_href_id(item: Union[Link, Section]):
reg = '(.+\..+\#)(.+)'
match = re.search(reg, item.href)
href, div_id = None, None
if match:
div_id = match.group(2)
if match.group(1):
href = match.group(1)[:-1]
else:
reg2 = '(.+\..+)'
match2 = re.search(reg2, item.href)
if match2 and match2.group(1):
href = match2.group(1)
return href, div_id
def __str__(self):
return '<NavPoint: %s, %s>' % (self.href, self.id)
"""
These are data structures which form mapping to livecarta json structure.
"""
class ChapterItem:
def __init__(self, title, content, sub_items):
self.title = title
self.content = content
self.sub_items = sub_items
def to_dict(self):
tmp = []
if self.sub_items:
for i in self.sub_items:
tmp.append(i.to_dict())
return {
"title": self.title,
"contents": [self.content],
"sub_items": tmp
}
def __str__(self):
return '<Chapter: %s>' % self.title

205
src/epub_converter.py Normal file
View File

@@ -0,0 +1,205 @@
import codecs
import json
import re
from collections import defaultdict
from typing import Dict, Union
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from ebooklib.epub import Link, Section
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
class EpubBookAdapter:
    """Adapts an ebooklib EPUB book to the converter's intermediate structures.

    On construction it:
      * parses every HTML document of the book into a BeautifulSoup tree;
      * builds an adjacency list of NavPoint nodes from the TOC (falling back
        to the spine when the TOC is unusable);
      * inserts synthetic <h1 class="internal-mark"> markers at anchor ids and
        splits the flattened documents into per-(href, id) chapter soups.
    """

    def __init__(self, file):
        self.file = file
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
        # Becomes True once any TOC entry carries a '#fragment' anchor id.
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html = self.build_href2soup_content()
        # If content.opf's spine has a 'toc' attribute -> the ncx file can be
        # found -> navMap is extracted from it.
        # If it is absent, a nav tag is looked up in the manifest -> EpubNav;
        # that is epub3 behaviour (untested, todo).
        self.href2ids = defaultdict(list)
        # Keys: -1 for the virtual root, NavPoint otherwise; values: list of
        # child NavPoints, or None for a leaf.
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}  # k = -1 if root, v = None if leaf
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        self.mark_and_line_href2soup_html()  # must run after the toc is parsed: it needs the ids collected from the toc
        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()
        self.build_anchor2soup()
        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
        # self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse each HTML document item's body into a soup, keyed by file name."""
        # using EpubElements
        # for now just for HTML objects, as it is simplest chapter
        # todo: check if other chapters exist
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_text = item.get_body_content()
            soup = BeautifulSoup(html_text, features='lxml')
            nodes[item.file_name] = soup
        return nodes

    def build_manifest_id2href(self):
        """Map manifest item id -> document file name (href)."""
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            links[item.id] = item.file_name
        return links

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """Recursively flatten the ebooklib toc into ``self.adjacency_list``.

        ``element`` is a Link (leaf), a ``(Section, children)`` tuple, or —
        at lvl 0 only — the whole top-level list, stored under the -1 root
        key. Returns the NavPoint built for ``element`` (nothing for the
        root list).
        """
        # use book.toc as a root
        # todo: read _create_section in get_nav
        # todo: try list on hrefs, extra info in another db
        if isinstance(element, Link):
            # todo: check if link exists
            node = NavPoint(element)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            self.adjacency_list[node] = None
            return node
        elif isinstance(element, tuple):
            first, second = element
            assert isinstance(first, Section)
            node = NavPoint(first)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            sub_nodes = []
            for i in second:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[node] = sub_nodes
            return node
        elif isinstance(element, list) and (lvl == 0):
            sub_nodes = []
            for i in element:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[-1] = sub_nodes
        else:
            assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'

    def is_toc_valid(self):
        """A toc is usable when it exists and produced a root entry (-1 key)."""
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return False
        return True

    def build_adjacency_list_from_spine(self):
        """Fallback: rebuild the adjacency list as a flat chapter sequence
        taken from the spine (used when the toc is invalid)."""
        manifest_id2href = self.build_manifest_id2href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            # Section title is set to the href — no better title is available here.
            node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
            self.adjacency_list[-1].append(node)

    def mark_and_line_href2soup_html(self):
        """Insert <h1 class="internal-mark"> markers before every toc-referenced
        id, then flatten each document (unwrap structural containers)."""
        # mark
        for href in self.href2soup_html:
            ids = self.href2ids[href]
            for i in ids:
                soup = self.href2soup_html[href]
                tag = soup.find(id=i)
                new_h = soup.new_tag('h1')
                new_h.attrs['class'] = 'internal-mark'
                new_h.attrs['id'] = i
                tag.insert_before(new_h)
        # go to line structure
        for href in self.href2soup_html:
            soup = self.href2soup_html[href]
            self.href2soup_html[href] = unwrap_structural_tags(soup)

    def build_one_anchored_section(self, node):
        """
        By this point the html soup already exists in linear form
        - if it is not linear, that is not our fault.
        There are 3 cases:
        the id wraps the whole content,
        the id wraps the content of the chapter and its sub-chapters,
        the id only points at the heading.
        In all 3 cases we know where the heading starts. Therefore a chapter
        is every tag from the current heading up to whichever heading comes
        next. A heading is taken into account when the toc references its id,
        so a heading is any tag with an id listed in the toc.
        :return:
        """
        if node.id:
            soup = self.href2soup_html[node.href]
            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.id_anchor2soup[(node.href, node.id)] = new_tree
        # Recurse into children so nested anchored chapters get their own soups.
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                self.build_one_anchored_section(sub_node)
        print(f'Chapter: {node.href, node.id} is split.')

    def build_anchor2soup(self):
        """Split documents into per-anchor soups — only needed when at least
        one toc entry uses an anchor id."""
        nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in nav_points:
                self.build_one_anchored_section(point)

    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
        """Convert a NavPoint (and, recursively, its children) into a
        ChapterItem tree with whitespace-normalized HTML content."""
        title = node.title
        if node.id:
            content = self.id_anchor2soup[(node.href, node.id)]
        else:
            content = self.href2soup_html[node.href]
        content_preprocessed = str(content)  # todo self.preprocess_html(content, node.id)
        # Collapse newlines/tabs/nbsp so the JSON payload is single-line text.
        content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
        sub_nodes = []
        # WARNING: items that are not EpubHtml won't be added to the chapter.
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                sub_chapter_item = self.node2livecarta_chapter_item(sub_node)
                sub_nodes.append(sub_chapter_item)
        # print(f'Chapter: {title} is prepared.')
        return ChapterItem(title, content_preprocessed, sub_nodes)
if __name__ == "__main__":
adapter = EpubBookAdapter('/home/katerina/PycharmProjects/Jenia/converter/epub/calibri.epub')
top_level_nav_points = adapter.adjacency_list[-1]
top_level_chapters = []
for nav_point in top_level_nav_points:
chapter = adapter.node2livecarta_chapter_item(nav_point)
top_level_chapters.append(chapter)
l = [x.to_dict() for x in top_level_chapters]
tmp = {
"content": l
}
output_file = open('output.out', 'w')
output_file.write(str(tmp))
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False)

View File

@@ -0,0 +1,104 @@
import re
from bs4 import BeautifulSoup, NavigableString
def preprocess_image():
    """TODO: placeholder — image preprocessing is not implemented yet."""
    pass
def preprocess_table():
    """TODO: placeholder — table preprocessing is not implemented yet."""
    pass
def preprocess_quote():
    """TODO: placeholder — quote preprocessing is not implemented yet."""
    pass
def clean_heading_in_content():
    """TODO: placeholder — heading cleanup is not implemented yet."""
    pass
def preprocess_footnotes():
    """TODO: placeholder — footnote preprocessing is not implemented yet."""
    pass
def add_fonts():
    """TODO: placeholder — font handling is not implemented yet."""
    pass
def unwrap_structural_tags(body_tag):
    """Flatten an HTML tree for linear chapter splitting.

    Removes purely structural containers (div/section/article/main/body),
    unwraps spans that only wrap other tags, and wraps bare top-level text
    runs into <p> tags.

    :param body_tag: a BeautifulSoup document (must be the soup itself —
        ``new_tag`` below is a soup method).
    :return: the same (mutated) ``body_tag``.
    """
    # One loop replaces five copy-pasted find_all/unwrap blocks (which also
    # reused a misleading `articles` variable for main/body).
    # NOTE(review): the original also had a commented-out unwrap for "html".
    for tag_name in ("div", "section", "article", "main", "body"):
        for wrapper in body_tag.find_all(tag_name):
            wrapper.unwrap()
    # Unwrap spans that contain only tags (no direct text children).
    # Known limitation (as before): a span mixing <p>s and NavigableStrings
    # is left untouched.
    for span in body_tag.find_all("span"):
        if not span.string and span.contents:
            if not any(isinstance(child, NavigableString) for child in span.contents):
                span.unwrap()
    # Wrap bare top-level strings into <p>. Iterate over a snapshot:
    # replace_with() mutates the child list, and mutating while iterating
    # could skip nodes in the original.
    for node in list(body_tag.children):
        if isinstance(node, NavigableString):
            # Normalized copy is used only for the emptiness test; the <p>
            # keeps the original text verbatim (matching prior behavior).
            cleaned = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
            if cleaned:
                paragraph = body_tag.new_tag('p')
                paragraph.append(str(node))
                node.replace_with(paragraph)
    return body_tag
def str2html_soup(html_text: str, element_id=None):
    """Parse ``html_text``; with ``element_id`` given, return that element
    serialized to a string (the string 'None' when absent). Without an id,
    the input text is returned unchanged.
    """
    parsed = BeautifulSoup(html_text, features='lxml')
    if not element_id:
        return str(html_text)
    target = parsed.find(id=element_id)
    return str(target)
def get_tags_between_ids(first_id, href, html_soup):
    """Extract every sibling between the marker heading with id ``first_id``
    and the next marker heading.

    Markers are the synthetic ``<h1 class="internal-mark">`` tags inserted by
    EpubBookAdapter.mark_and_line_href2soup_html(). Extracted tags are removed
    from ``html_soup`` (via extract), and adjacent strings are merged with
    smooth().

    :raises LookupError: if no marker with ``first_id`` exists in the soup.
    """
    h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if not h_marked:
        # The original used `assert 0`, which is stripped under `python -O`
        # and would then fall through to an unbound `tags` (NameError).
        raise LookupError(f'Warning: no match for {first_id, href}')
    tags = []
    sibling = h_marked.next_sibling
    while sibling:
        # Stop at the next synthetic chapter marker. NOTE(review): the class
        # comparison against a plain string works because the markers are
        # assigned a string class directly; a re-parsed soup would hold
        # class as a list — confirm if the pipeline ever re-parses.
        if sibling.name == 'h1' and sibling.attrs.get('class') == 'internal-mark':
            break
        tags.append(sibling)
        sibling = sibling.next_sibling
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags