epub converter: add files

This commit is contained in:
shirshasa
2021-04-14 14:29:19 +03:00
parent 4eb30bd80c
commit 880b045de0
3 changed files with 371 additions and 0 deletions

62
src/data_objects.py Normal file
View File

@@ -0,0 +1,62 @@
import re
from typing import Union
from ebooklib.epub import Section, Link
"""
These are data structures which form mapping from NCX to python data structures.
"""
class NavPoint:
def __init__(self, obj: Union[Link, Section]=None, ):
self.href, self.id = self.parse_href_id(obj)
self.title = obj.title
@staticmethod
def parse_href_id(item: Union[Link, Section]):
reg = '(.+\..+\#)(.+)'
match = re.search(reg, item.href)
href, div_id = None, None
if match:
div_id = match.group(2)
if match.group(1):
href = match.group(1)[:-1]
else:
reg2 = '(.+\..+)'
match2 = re.search(reg2, item.href)
if match2 and match2.group(1):
href = match2.group(1)
return href, div_id
def __str__(self):
return '<NavPoint: %s, %s>' % (self.href, self.id)
"""
These are data structures which form mapping to livecarta json structure.
"""
class ChapterItem:
def __init__(self, title, content, sub_items):
self.title = title
self.content = content
self.sub_items = sub_items
def to_dict(self):
tmp = []
if self.sub_items:
for i in self.sub_items:
tmp.append(i.to_dict())
return {
"title": self.title,
"contents": [self.content],
"sub_items": tmp
}
def __str__(self):
return '<Chapter: %s>' % self.title

205
src/epub_converter.py Normal file
View File

@@ -0,0 +1,205 @@
import codecs
import json
import re
from collections import defaultdict
from typing import Dict, Union
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from ebooklib.epub import Link, Section
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
class EpubBookAdapter:
    """Adapts an ebooklib EPUB book to the converter's intermediate structures.

    On construction it:
      * parses every HTML document of the book into a BeautifulSoup tree;
      * builds an adjacency list of NavPoint nodes from the TOC (falling back
        to the spine when the TOC is unusable);
      * inserts synthetic <h1 class="internal-mark"> markers at anchor ids and
        splits the flattened documents into per-(href, id) chapter soups.
    """

    def __init__(self, file):
        self.file = file
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
        # Becomes True once any TOC entry carries a '#fragment' anchor id.
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html = self.build_href2soup_content()
        # If content.opf's spine has a 'toc' attribute -> the ncx file can be
        # found -> navMap is extracted from it.
        # If it is absent, a nav tag is looked up in the manifest -> EpubNav;
        # that is epub3 behaviour (untested, todo).
        self.href2ids = defaultdict(list)
        # Keys: -1 for the virtual root, NavPoint otherwise; values: list of
        # child NavPoints, or None for a leaf.
        self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}  # k = -1 if root, v = None if leaf
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        self.mark_and_line_href2soup_html()  # must run after the toc is parsed: it needs the ids collected from the toc
        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()
        self.build_anchor2soup()
        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
        # self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine

    def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
        """Parse each HTML document item's body into a soup, keyed by file name."""
        # using EpubElements
        # for now just for HTML objects, as it is simplest chapter
        # todo: check if other chapters exist
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_text = item.get_body_content()
            soup = BeautifulSoup(html_text, features='lxml')
            nodes[item.file_name] = soup
        return nodes

    def build_manifest_id2href(self):
        """Map manifest item id -> document file name (href)."""
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            links[item.id] = item.file_name
        return links

    def build_adjacency_list_from_toc(self, element, lvl=0):
        """Recursively flatten the ebooklib toc into ``self.adjacency_list``.

        ``element`` is a Link (leaf), a ``(Section, children)`` tuple, or —
        at lvl 0 only — the whole top-level list, stored under the -1 root
        key. Returns the NavPoint built for ``element`` (nothing for the
        root list).
        """
        # use book.toc as a root
        # todo: read _create_section in get_nav
        # todo: try list on hrefs, extra info in another db
        if isinstance(element, Link):
            # todo: check if link exists
            node = NavPoint(element)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            self.adjacency_list[node] = None
            return node
        elif isinstance(element, tuple):
            first, second = element
            assert isinstance(first, Section)
            node = NavPoint(first)
            if node.id:
                self.id_anchor_exist_in_nav_points = True
                self.href2ids[node.href].append(node.id)
            sub_nodes = []
            for i in second:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[node] = sub_nodes
            return node
        elif isinstance(element, list) and (lvl == 0):
            sub_nodes = []
            for i in element:
                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[-1] = sub_nodes
        else:
            assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'

    def is_toc_valid(self):
        """A toc is usable when it exists and produced a root entry (-1 key)."""
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return False
        return True

    def build_adjacency_list_from_spine(self):
        """Fallback: rebuild the adjacency list as a flat chapter sequence
        taken from the spine (used when the toc is invalid)."""
        manifest_id2href = self.build_manifest_id2href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
            # Section title is set to the href — no better title is available here.
            node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
            self.adjacency_list[-1].append(node)

    def mark_and_line_href2soup_html(self):
        """Insert <h1 class="internal-mark"> markers before every toc-referenced
        id, then flatten each document (unwrap structural containers)."""
        # mark
        for href in self.href2soup_html:
            ids = self.href2ids[href]
            for i in ids:
                soup = self.href2soup_html[href]
                tag = soup.find(id=i)
                new_h = soup.new_tag('h1')
                new_h.attrs['class'] = 'internal-mark'
                new_h.attrs['id'] = i
                tag.insert_before(new_h)
        # go to line structure
        for href in self.href2soup_html:
            soup = self.href2soup_html[href]
            self.href2soup_html[href] = unwrap_structural_tags(soup)

    def build_one_anchored_section(self, node):
        """
        By this point the html soup already exists in linear form
        - if it is not linear, that is not our fault.
        There are 3 cases:
        the id wraps the whole content,
        the id wraps the content of the chapter and its sub-chapters,
        the id only points at the heading.
        In all 3 cases we know where the heading starts. Therefore a chapter
        is every tag from the current heading up to whichever heading comes
        next. A heading is taken into account when the toc references its id,
        so a heading is any tag with an id listed in the toc.
        :return:
        """
        if node.id:
            soup = self.href2soup_html[node.href]
            chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
            self.id_anchor2soup[(node.href, node.id)] = new_tree
        # Recurse into children so nested anchored chapters get their own soups.
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                self.build_one_anchored_section(sub_node)
        print(f'Chapter: {node.href, node.id} is split.')

    def build_anchor2soup(self):
        """Split documents into per-anchor soups — only needed when at least
        one toc entry uses an anchor id."""
        nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
            for point in nav_points:
                self.build_one_anchored_section(point)

    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
        """Convert a NavPoint (and, recursively, its children) into a
        ChapterItem tree with whitespace-normalized HTML content."""
        title = node.title
        if node.id:
            content = self.id_anchor2soup[(node.href, node.id)]
        else:
            content = self.href2soup_html[node.href]
        content_preprocessed = str(content)  # todo self.preprocess_html(content, node.id)
        # Collapse newlines/tabs/nbsp so the JSON payload is single-line text.
        content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
        sub_nodes = []
        # WARNING: items that are not EpubHtml won't be added to the chapter.
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
                sub_chapter_item = self.node2livecarta_chapter_item(sub_node)
                sub_nodes.append(sub_chapter_item)
        # print(f'Chapter: {title} is prepared.')
        return ChapterItem(title, content_preprocessed, sub_nodes)
if __name__ == "__main__":
adapter = EpubBookAdapter('/home/katerina/PycharmProjects/Jenia/converter/epub/calibri.epub')
top_level_nav_points = adapter.adjacency_list[-1]
top_level_chapters = []
for nav_point in top_level_nav_points:
chapter = adapter.node2livecarta_chapter_item(nav_point)
top_level_chapters.append(chapter)
l = [x.to_dict() for x in top_level_chapters]
tmp = {
"content": l
}
output_file = open('output.out', 'w')
output_file.write(str(tmp))
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False)

View File

@@ -0,0 +1,104 @@
import re
from bs4 import BeautifulSoup, NavigableString
def preprocess_image():
    """TODO: placeholder — image preprocessing is not implemented yet."""
    pass
def preprocess_table():
    """TODO: placeholder — table preprocessing is not implemented yet."""
    pass
def preprocess_quote():
    """TODO: placeholder — quote preprocessing is not implemented yet."""
    pass
def clean_heading_in_content():
    """TODO: placeholder — heading cleanup is not implemented yet."""
    pass
def preprocess_footnotes():
    """TODO: placeholder — footnote preprocessing is not implemented yet."""
    pass
def add_fonts():
    """TODO: placeholder — font handling is not implemented yet."""
    pass
def unwrap_structural_tags(body_tag):
    """Flatten an HTML tree for linear chapter splitting.

    Removes purely structural containers (div/section/article/main/body),
    unwraps spans that only wrap other tags, and wraps bare top-level text
    runs into <p> tags.

    :param body_tag: a BeautifulSoup document (must be the soup itself —
        ``new_tag`` below is a soup method).
    :return: the same (mutated) ``body_tag``.
    """
    # One loop replaces five copy-pasted find_all/unwrap blocks (which also
    # reused a misleading `articles` variable for main/body).
    # NOTE(review): the original also had a commented-out unwrap for "html".
    for tag_name in ("div", "section", "article", "main", "body"):
        for wrapper in body_tag.find_all(tag_name):
            wrapper.unwrap()
    # Unwrap spans that contain only tags (no direct text children).
    # Known limitation (as before): a span mixing <p>s and NavigableStrings
    # is left untouched.
    for span in body_tag.find_all("span"):
        if not span.string and span.contents:
            if not any(isinstance(child, NavigableString) for child in span.contents):
                span.unwrap()
    # Wrap bare top-level strings into <p>. Iterate over a snapshot:
    # replace_with() mutates the child list, and mutating while iterating
    # could skip nodes in the original.
    for node in list(body_tag.children):
        if isinstance(node, NavigableString):
            # Normalized copy is used only for the emptiness test; the <p>
            # keeps the original text verbatim (matching prior behavior).
            cleaned = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
            if cleaned:
                paragraph = body_tag.new_tag('p')
                paragraph.append(str(node))
                node.replace_with(paragraph)
    return body_tag
def str2html_soup(html_text: str, element_id=None):
    """Parse ``html_text``; with ``element_id`` given, return that element
    serialized to a string (the string 'None' when absent). Without an id,
    the input text is returned unchanged.
    """
    parsed = BeautifulSoup(html_text, features='lxml')
    if not element_id:
        return str(html_text)
    target = parsed.find(id=element_id)
    return str(target)
def get_tags_between_ids(first_id, href, html_soup):
    """Extract every sibling between the marker heading with id ``first_id``
    and the next marker heading.

    Markers are the synthetic ``<h1 class="internal-mark">`` tags inserted by
    EpubBookAdapter.mark_and_line_href2soup_html(). Extracted tags are removed
    from ``html_soup`` (via extract), and adjacent strings are merged with
    smooth().

    :raises LookupError: if no marker with ``first_id`` exists in the soup.
    """
    h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if not h_marked:
        # The original used `assert 0`, which is stripped under `python -O`
        # and would then fall through to an unbound `tags` (NameError).
        raise LookupError(f'Warning: no match for {first_id, href}')
    tags = []
    sibling = h_marked.next_sibling
    while sibling:
        # Stop at the next synthetic chapter marker. NOTE(review): the class
        # comparison against a plain string works because the markers are
        # assigned a string class directly; a re-parsed soup would hold
        # class as a list — confirm if the pipeline ever re-parses.
        if sibling.name == 'h1' and sibling.attrs.get('class') == 'internal-mark':
            break
        tags.append(sibling)
        sibling = sibling.next_sibling
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags