forked from LiveCarta/BookConverter
epub converter: add files
This commit is contained in:
62
src/data_objects.py
Normal file
62
src/data_objects.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import re
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from ebooklib.epub import Section, Link
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
These are data structures which form mapping from NCX to python data structures.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class NavPoint:
    """One NCX navigation point: target file href, optional fragment id, title.

    Wraps an ebooklib ``Link`` or ``Section`` and splits its ``href`` into
    the document path and the ``#fragment`` anchor (if any).
    """

    def __init__(self, obj: Union[Link, Section]=None, ):
        # NOTE(review): the obj=None default is unusable — obj.href/obj.title
        # below would raise AttributeError. Callers always pass a Link/Section;
        # the default is kept only for interface compatibility.
        self.href, self.id = self.parse_href_id(obj)
        self.title = obj.title

    @staticmethod
    def parse_href_id(item: Union[Link, Section]):
        """Split ``item.href`` into ``(href, fragment_id)``.

        Returns ``(href, None)`` when there is no ``#fragment`` and
        ``(None, None)`` when the href matches neither pattern.
        """
        # Raw strings: the original '(.+\..+\#)(.+)' relied on invalid escape
        # sequences, which is a SyntaxWarning/error on modern Python.
        # 'file.ext#anchor' -> group(1) = 'file.ext#', group(2) = 'anchor'
        reg = r'(.+\..+\#)(.+)'
        match = re.search(reg, item.href)
        href, div_id = None, None
        if match:
            div_id = match.group(2)
            if match.group(1):
                href = match.group(1)[:-1]  # drop the trailing '#'
        else:
            # No anchor: accept any 'file.ext'-shaped href as-is.
            reg2 = r'(.+\..+)'
            match2 = re.search(reg2, item.href)
            if match2 and match2.group(1):
                href = match2.group(1)

        return href, div_id

    def __str__(self):
        return '<NavPoint: %s, %s>' % (self.href, self.id)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
These are data structures which form mapping to livecarta json structure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ChapterItem:
    """A node of the LiveCarta chapter tree: title, HTML content, children."""

    def __init__(self, title, content, sub_items):
        self.title = title          # chapter heading text
        self.content = content      # preprocessed HTML string for this chapter
        self.sub_items = sub_items  # child ChapterItems (list, possibly empty/None)

    def to_dict(self):
        """Serialize this chapter and all descendants to the LiveCarta JSON shape."""
        children = [child.to_dict() for child in self.sub_items] if self.sub_items else []
        return {
            "title": self.title,
            "contents": [self.content],
            "sub_items": children,
        }

    def __str__(self):
        return '<Chapter: %s>' % self.title
|
||||||
205
src/epub_converter.py
Normal file
205
src/epub_converter.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
import codecs
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Dict, Union
|
||||||
|
|
||||||
|
import ebooklib
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from ebooklib import epub
|
||||||
|
from ebooklib.epub import Link, Section
|
||||||
|
|
||||||
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
|
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
|
||||||
|
|
||||||
|
|
||||||
|
class EpubBookAdapter:
    """Reads an EPUB file via ebooklib and converts its table of contents
    into an adjacency list of NavPoints plus per-chapter HTML soups,
    ready to be serialized as LiveCarta ChapterItems.
    """

    def __init__(self, file):
        # path to the .epub file on disk
        self.file = file
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
        # set to True by build_adjacency_list_from_toc when any toc entry
        # carries a '#fragment' anchor
        self.id_anchor_exist_in_nav_points = False
        # href -> BeautifulSoup of that document's <body> content
        self.href2soup_html = self.build_href2soup_content()
        # If content.opf's spine has a 'toc' attribute -> the ncx file can be
        # found -> the navMap is taken from it.
        # If it is absent, a nav tag is looked up in the manifest -> EpubNav.
        # That is the epub3 case (not tested, todo).
        self.href2ids = defaultdict(list)
        # NavPoint nodes are dict keys (identity hash — NavPoint defines no
        # __eq__/__hash__); the sentinel key -1 is the tree root.
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}  # k = -1 if root, v = None if leaf
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
        # (href, fragment_id) -> soup holding just that anchored chapter's tags
        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}

        if not self.is_toc_valid():
            # toc missing or empty: fall back to a flat list built from the spine
            self.build_adjacency_list_from_spine()

        self.build_anchor2soup()

        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
        # self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse every HTML document item of the book into a soup, keyed by file name."""
        # using EpubElements
        # for now just for HTML objects, as it is simplest chapter
        # todo: check if other chapters exist
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            # body content only — headers/metadata of the XHTML file are dropped
            html_text = item.get_body_content()
            soup = BeautifulSoup(html_text, features='lxml')
            nodes[item.file_name] = soup

        return nodes

    def build_manifest_id2href(self):
        """Map manifest item id -> file name, for resolving spine entries."""
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            links[item.id] = item.file_name

        return links

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """Recursively convert ebooklib's toc structure into self.adjacency_list.

        ebooklib's toc is a list of Link (leaf) and (Section, children) tuples;
        the top-level list itself is handled only at lvl == 0 and stored under
        the root key -1. Returns the created NavPoint (None for the root list).
        """
        # use book.toc as a root
        # todo: read _create_section in get_nav
        # todo: try list on hrefs, extra info in another db

        if isinstance(element, Link):
            # todo: check if link exists
            node = NavPoint(element)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            # leaf: no children
            self.adjacency_list[node] = None
            return node

        elif isinstance(element, tuple):
            # (Section, [children]) pair
            first, second = element
            assert isinstance(first, Section)
            node = NavPoint(first)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)

            sub_nodes = []
            for i in second:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))

            self.adjacency_list[node] = sub_nodes
            return node

        elif isinstance(element, list) and (lvl == 0):
            # the toc root itself: store its children under the -1 sentinel
            sub_nodes = []
            for i in element:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))

            self.adjacency_list[-1] = sub_nodes

        else:
            assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'

    def is_toc_valid(self):
        """True when the book has a toc and it produced a non-empty root entry."""
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return False
        return True

    def build_adjacency_list_from_spine(self):
        """Fallback: rebuild adjacency_list as a flat chapter list from the spine order."""
        manifest_id2href = self.build_manifest_id2href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            # synthesize a Section whose title equals its href — no better title exists here
            node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
            self.adjacency_list[-1].append(node)

    def mark_and_line_href2soup_html(self):
        """Insert <h1 class="internal-mark"> markers before every toc-anchored
        element, then flatten each soup to a linear tag sequence.

        Must run after the toc pass — it consumes the ids collected in href2ids.
        """
        # mark
        for href in self.href2soup_html:
            ids = self.href2ids[href]
            for i in ids:
                soup = self.href2soup_html[href]
                tag = soup.find(id=i)
                # the marker carries the same id so get_tags_between_ids can find it
                new_h = soup.new_tag('h1')
                new_h.attrs['class'] = 'internal-mark'
                new_h.attrs['id'] = i
                tag.insert_before(new_h)

        # go to line structure
        for href in self.href2soup_html:
            soup = self.href2soup_html[href]
            self.href2soup_html[href] = unwrap_structural_tags(soup)

    def build_one_anchored_section(self, node):
        """
        By this point the html soup already exists in linear form
        — if it is not linear, that is not our fault.

        There are 3 cases:
        the id wraps the whole content,
        the id wraps the content of the chapter and its sub-chapters,
        the id only points at the heading.

        In all 3 cases we know where the heading starts. Therefore a chapter
        is all tags from the current heading up to whatever heading comes next.

        A heading is taken into account if the toc specifies an id; then a
        heading is any tag with an id from the toc.
        :return:
        """
        if node.id:
            soup = self.href2soup_html[node.href]
            # extracts (removes) the chapter's tags from the shared soup
            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.id_anchor2soup[(node.href, node.id)] = new_tree

        # recurse into children (adjacency_list value is None for leaves)
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                self.build_one_anchored_section(sub_node)

        print(f'Chapter: {node.href, node.id} is split.')

    def build_anchor2soup(self):
        """Split anchored chapters out of their soups, top-down from the root."""
        nav_points = self.adjacency_list[-1]
        # only needed when at least one toc entry pointed inside a document
        if self.id_anchor_exist_in_nav_points:
            for point in nav_points:
                self.build_one_anchored_section(point)

    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
        """Recursively convert a NavPoint subtree into a ChapterItem subtree."""
        title = node.title
        if node.id:
            # anchored chapter: use the soup extracted by build_anchor2soup
            content = self.id_anchor2soup[(node.href, node.id)]
        else:
            # whole-file chapter
            content = self.href2soup_html[node.href]
        content_preprocessed = str(content)  # todo self.preprocess_html(content, node.id)
        # collapse newlines/tabs/nbsp into plain spaces
        content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
        sub_nodes = []
        # warning! non-EpubHtml items won't be added to chapter
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                sub_chapter_item = self.node2livecarta_chapter_item(sub_node)
                sub_nodes.append(sub_chapter_item)

        # print(f'Chapter: {title} is prepared.')
        return ChapterItem(title, content_preprocessed, sub_nodes)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # TODO(review): hard-coded local path — should become a CLI argument.
    adapter = EpubBookAdapter('/home/katerina/PycharmProjects/Jenia/converter/epub/calibri.epub')

    # Convert every top-level NavPoint into a ChapterItem tree.
    top_level_nav_points = adapter.adjacency_list[-1]
    top_level_chapters = []

    for nav_point in top_level_nav_points:
        chapter = adapter.node2livecarta_chapter_item(nav_point)
        top_level_chapters.append(chapter)

    l = [x.to_dict() for x in top_level_chapters]

    tmp = {
        "content": l
    }

    # Fix: the original opened 'output.out' without ever closing it — the
    # handle (and potentially buffered data) leaked. Use a context manager.
    with open('output.out', 'w') as output_file:
        output_file.write(str(tmp))

    # JSON output with non-ASCII characters kept as-is.
    with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
104
src/html_epub_preprocessor.py
Normal file
104
src/html_epub_preprocessor.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_image():
    """TODO: image handling — not implemented yet."""
    pass


def preprocess_table():
    """TODO: table handling — not implemented yet."""
    pass


def preprocess_quote():
    """TODO: blockquote handling — not implemented yet."""
    pass


def clean_heading_in_content():
    """TODO: strip duplicated headings from chapter content — not implemented yet."""
    pass


def preprocess_footnotes():
    """TODO: footnote handling — not implemented yet."""
    pass


def add_fonts():
    """TODO: font embedding — not implemented yet."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
def unwrap_structural_tags(body_tag):
    """Flatten *body_tag* into a linear sequence of content tags.

    Three passes, modifying the soup in place:
    1. unwrap purely structural containers (div, section, article, main, body);
    2. unwrap spans that contain only nested tags (no direct text children);
    3. wrap stray top-level text nodes into <p> tags.

    :param body_tag: BeautifulSoup document (body content of one XHTML file)
    :return: the same soup object, flattened
    """
    # Pass 1 — the original had five copy-pasted find_all/unwrap loops (with
    # the variable `articles` reused for main/body); one loop, same order.
    for tag_name in ("div", "section", "article", "main", "body"):
        for wrapper in body_tag.find_all(tag_name):
            wrapper.unwrap()

    # "html" deliberately left wrapped (was commented out in the original):
    # for s in body_tag.find_all("html"): s.unwrap()

    # Pass 2 — unwrap spans with no direct text. Not all cases: a span mixing
    # <p>s and NavigableStrings keeps its wrapper (as in the original).
    for span in body_tag.find_all("span"):
        if not span.string and span.contents:
            if not any(isinstance(child, NavigableString) for child in span.contents):
                span.unwrap()

    # Pass 3 — wrap loose top-level text in <p> so downstream chapter slicing
    # only ever sees tags. Whitespace normalization is used only to decide
    # whether the node is non-empty; the ORIGINAL text is what gets wrapped.
    for node in body_tag:
        if isinstance(node, NavigableString):
            normalized = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
            if normalized:
                p_tag = body_tag.new_tag('p')
                p_tag.append(str(node))
                node.replace_with(p_tag)

    return body_tag
|
||||||
|
|
||||||
|
|
||||||
|
def str2html_soup(html_text: str, element_id=None):
    """Return the HTML of the element with *element_id*, or *html_text* itself.

    :param html_text: raw (X)HTML markup
    :param element_id: id attribute of the element to extract; when falsy the
        input string is returned unchanged
    :return: ``str(tag)`` for the matching element (the string ``'None'`` if
        no element matches), or the original text when *element_id* is falsy
    """
    if element_id:
        html_soup = BeautifulSoup(html_text, features='lxml')
        found = html_soup.find(id=element_id)
        return str(found)
    # Fix: the original parsed the soup here too and then discarded it,
    # returning str(html_text) — the wasted parse is skipped; result identical.
    return str(html_text)
|
||||||
|
|
||||||
|
|
||||||
|
def get_tags_between_ids(first_id, href, html_soup):
    """Extract the run of sibling tags that makes up one anchored chapter.

    Finds the synthetic ``<h1 class="internal-mark" id=first_id>`` marker
    (inserted during the marking pass) and collects every following sibling
    up to — but excluding — the next internal-mark heading. The collected
    tags are detached (``extract``-ed) from *html_soup*.

    :param first_id: id of the marker heading that starts the chapter
    :param href: source file href, used only for the error message
    :param html_soup: linearized soup for that href; modified in place
    :return: list of extracted sibling tags
    :raises AssertionError: when no marker with *first_id* is present
    """
    h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if not h_marked:
        # Fix: was `assert 0, ...` — stripped under `python -O`, after which
        # the function fell through to `return tags` and raised NameError.
        # Raise explicitly; the exception type callers see is unchanged.
        raise AssertionError(f'Warning: no match for {first_id, href}')

    tags = []
    sibling = h_marked.next_sibling
    while sibling:
        # stop at the next chapter marker (exclusive)
        if sibling.name == 'h1' and sibling.attrs.get('class') == 'internal-mark':
            break
        tags.append(sibling)
        sibling = sibling.next_sibling

    # Detach the chapter's tags from the source soup, then merge any
    # adjacent text nodes left behind.
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()

    return tags
|
||||||
Reference in New Issue
Block a user