epub converter: rename from epub_converter.py

This commit is contained in:
shirshasa
2021-04-19 11:35:38 +03:00
parent 1df37b6122
commit dce0f871a8

231
src/epub_postprocessor.py Normal file
View File

@@ -0,0 +1,231 @@
import codecs
import json
import re
import os
from collections import defaultdict
from typing import Dict, Union
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from ebooklib.epub import Link, Section
from ebooklib.utils import debug
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
preprocess_image, preprocess_footnotes
# epub3 examples:
# https://github.com/IDPF/epub3-samples
# specification:
# https://idpf.github.io/epub-vocabs/structure/
# footnotes:
# http://www.theheratik.net/books/tech-epub/chapter-8/
# http://kb.daisy.org/publishing/docs/html/epub-type.html
# todo: http://kb.daisy.org/publishing/docs/html/notes.html
# todo: https://docs.python.org/3/howto/unicode.html
class EpubPostprocessor:
    """Turn an EPUB file into a nested chapter structure plus footnotes.

    The constructor reads the book with ebooklib, parses every document item
    into a BeautifulSoup tree, collects footnotes, and builds an adjacency
    list of navigation points from the table of contents (falling back to the
    spine when the TOC yields no top-level entries).  ``convert_to_dict``
    then produces the final serialisable structure.

    Attributes set in ``__init__``:
        file: the path (or file object) handed to ``epub.read_epub``.
        ebooklib_book: the book object returned by ``epub.read_epub``.
        href2img_bytes: image item ``file_name`` -> raw ``content``.
        href2soup_html: document item ``file_name`` -> parsed soup.
        footnotes: flat list gathered from ``preprocess_footnotes``.
        href2ids: href -> list of anchor ids referenced by the TOC.
        adjacency_list: nav point -> list of child nav points
            (key ``-1`` is the root, value ``None`` marks a leaf).
        id_anchor2soup: ``(href, id)`` -> soup holding that anchored section.
        id_anchor_exist_in_nav_points: True once any TOC node carries an id.
    """

    def __init__(self, file):
        """Read *file* and prepare every lookup table used for conversion."""
        self.file = file
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib

        self.href2img_bytes = {}
        for image_item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
            debug(image_item)
            # todo: check how file path is count in lib
            self.href2img_bytes[image_item.file_name] = image_item.content

        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()

        self.footnotes = []
        for href in self.href2soup_html:
            self.footnotes.extend(
                preprocess_footnotes(
                    self.href2soup_html[href],
                    self.href2soup_html,
                    noteref_attr_name='data-type',
                )
            )

        # If the spine in content.opf has a toc attribute -> the ncx file can
        # be located -> navMap is taken from it.
        # If it is absent, the nav tag is looked up in the manifest -> EpubNav.
        # That is the epub3 case (not tested, todo).
        self.href2ids = defaultdict(list)
        # k = -1 if root, v = None if leaf
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}

        # A missing TOC must reach the spine fallback below instead of
        # tripping the "unsupported element" error in the TOC walker.
        toc = self.ebooklib_book.toc
        if toc is not None:
            self.build_adjacency_list_from_toc(toc)

        # used only after parsed toc, ids from toc needed
        self.mark_and_line_href2soup_html()

        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()
        self.build_anchor2soup()
        # if not self.is_all_html_epub_items_added():  # not all hrefs in adjacency_list
        #     self.add_missed_items_from_spine()  # to contents to the chapter after which it placed in spine

    def build_href2soup_content(self) -> "Dict[str, BeautifulSoup]":
        """Parse the body of every document item into a soup keyed by file name."""
        # using EpubElements
        # for now just for HTML objects, as it is simplest chapter
        # todo: check if other chapters exist
        nodes = {}
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_text = item.get_body_content()
            nodes[item.file_name] = BeautifulSoup(html_text, features='lxml')
        return nodes

    def build_manifest_id2href(self):
        """Return a mapping of document item id -> document file name."""
        return {
            item.id: item.file_name
            for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        }

    def _register_anchor(self, node):
        """Record the anchor id of *node* (if it has one) under its href."""
        if node.id:
            self.id_anchor_exist_in_nav_points = True
            self.href2ids[node.href].append(node.id)

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """Recursively walk the TOC and fill ``self.adjacency_list``.

        :param element: a ``Link`` (leaf), a ``(Section, children)`` tuple, or,
            only at ``lvl == 0``, the top-level list used as the root.
        :param lvl: current recursion depth; 0 for the root call.
        :return: the ``NavPoint`` created for a Link/tuple element; ``None``
            for the root list.
        :raises AssertionError: if *element* has an unsupported shape.
        """
        # use book.toc as a root
        if isinstance(element, Link):
            # todo: check if link exists
            node = NavPoint(element)
            self._register_anchor(node)
            self.adjacency_list[node] = None
            return node

        if isinstance(element, tuple):
            first, second = element
            # Raised explicitly so the check survives ``python -O``.
            if not isinstance(first, Section):
                raise AssertionError
            node = NavPoint(first)
            self._register_anchor(node)
            self.adjacency_list[node] = [
                self.build_adjacency_list_from_toc(child, lvl + 1) for child in second
            ]
            return node

        if isinstance(element, list) and (lvl == 0):
            self.adjacency_list[-1] = [
                self.build_adjacency_list_from_toc(child, lvl + 1) for child in element
            ]
            return None

        raise AssertionError(f'Error. Element is not tuple/Link instance: {type(element)}')

    def is_toc_valid(self):
        """Return True only when the TOC produced at least one top-level entry.

        An empty root list counts as invalid so that ``__init__`` falls back
        to the spine instead of producing a book with no chapters.
        """
        if self.ebooklib_book.toc is None:
            return False
        return bool(self.adjacency_list.get(-1))

    def build_adjacency_list_from_spine(self):
        """Replace the adjacency list with one flat level built from the spine.

        Each spine entry becomes a ``NavPoint`` wrapping a ``Section`` whose
        title and href are both the document file name of that manifest id.
        """
        manifest_id2href = self.build_manifest_id2href()
        self.adjacency_list = {-1: []}
        for id_, _ in self.ebooklib_book.spine:
            href = manifest_id2href[id_]
            self.adjacency_list[-1].append(NavPoint(Section(href, href)))

    def mark_and_line_href2soup_html(self):
        """Insert marker headings before TOC anchors, then flatten every soup.

        For each id collected in ``self.href2ids`` an empty
        ``<h1 class="internal-mark" id=...>`` is inserted before the tag that
        carries the id; afterwards every soup is passed through
        ``unwrap_structural_tags``.
        """
        # mark
        for href, soup in self.href2soup_html.items():
            for anchor_id in self.href2ids[href]:
                # NOTE(review): soup.find returns None when the id is absent
                # from this document; insert_before would then raise.
                tag = soup.find(id=anchor_id)
                new_h = soup.new_tag('h1')
                new_h.attrs['class'] = 'internal-mark'
                new_h.attrs['id'] = anchor_id
                tag.insert_before(new_h)

        # go to line structure
        for href in self.href2soup_html:
            self.href2soup_html[href] = unwrap_structural_tags(self.href2soup_html[href])

    def build_one_anchored_section(self, node):
        """Cut the section belonging to *node* out of its document, recursively.

        By this point the html soup already exists in linear form
            - if it is not linear, that is not our fault.
        There are 3 cases:
            the id wraps the whole content,
            the id wraps the content of the chapter and of its sub-chapter,
            the id only points at the heading.
        In all 3 cases we know where the heading starts.  Therefore a chapter
        is every tag from the current heading up to whatever heading comes
        next.  A heading is taken into account if the toc specifies an id;
        then a heading is any tag with an id from the toc.

        :return: None; results are stored in ``self.id_anchor2soup``.
        """
        if node.id:
            soup = self.href2soup_html[node.href]
            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.id_anchor2soup[(node.href, node.id)] = new_tree

        # Leaves map to None and unknown nodes are absent: both are skipped.
        for sub_node in self.adjacency_list.get(node) or []:
            self.build_one_anchored_section(sub_node)
        # print(f'Chapter: {node.href, node.id} is split.')

    def build_anchor2soup(self):
        """Build anchored sections for the whole tree if any TOC node has an id."""
        if not self.id_anchor_exist_in_nav_points:
            return
        for point in self.adjacency_list[-1]:
            self.build_one_anchored_section(point)

    def node2livecarta_chapter_item(self, node: "NavPoint") -> "ChapterItem":
        """Convert *node* and, recursively, its children into a ``ChapterItem``.

        Content comes from ``self.id_anchor2soup`` when the node has an anchor
        id, otherwise from the whole document in ``self.href2soup_html``.
        """
        title = node.title
        if node.id:
            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
        else:
            content: BeautifulSoup = self.href2soup_html[node.href]

        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)

        # warning! items that are not EpubHtml won't be added to the chapter
        sub_nodes = [
            self.node2livecarta_chapter_item(sub_node)
            for sub_node in self.adjacency_list.get(node) or []
        ]
        # print(f'Chapter: {title} is prepared.')
        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)

    def convert_to_dict(self):
        """Return ``{"content": [...], "footnotes": [...]}`` for the whole book.

        ``content`` holds ``to_dict()`` of every top-level chapter; nested
        chapters are carried inside their parents.
        """
        top_level_chapters = [
            self.node2livecarta_chapter_item(nav_point)
            for nav_point in self.adjacency_list[-1]
        ]
        return {
            "content": [chapter.to_dict() for chapter in top_level_chapters],
            "footnotes": self.footnotes,
        }
if __name__ == "__main__":
    import sys

    # Manual smoke run: convert one EPUB and dump the result as JSON.
    # Optional CLI overrides: argv[1] = input EPUB path, argv[2] = output
    # JSON path.  The defaults keep the previously hard-coded values.
    epub_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else '/home/katerina/PycharmProjects/Jenia/converter/epub/Chaos_Engineering.epub'
    )
    output_path = sys.argv[2] if len(sys.argv) > 2 else 'tmp.json'

    json_converter = EpubPostprocessor(epub_path)
    tmp = json_converter.convert_to_dict()

    # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 output.
    with codecs.open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)