class NavPoint:
    """A single NCX navigation point: the (href, fragment id, title) triple
    parsed from an ebooklib Link or Section toc entry."""

    def __init__(self, obj: Union[Link, Section] = None):
        # href is the content-document path; id is the #fragment anchor
        # (None when the toc entry points at a whole document).
        self.href, self.id = self.parse_href_id(obj)
        self.title = obj.title

    @staticmethod
    def parse_href_id(item: Union[Link, Section]):
        """Split ``item.href`` into ``(document_href, fragment_id)``.

        'ch1.xhtml#sec2' -> ('ch1.xhtml', 'sec2');
        'ch1.xhtml'      -> ('ch1.xhtml', None);
        anything matching neither pattern -> (None, None).
        """
        # BUG FIX: patterns are now raw strings — '\#' in a plain string is an
        # invalid escape sequence (SyntaxWarning, error in future Pythons).
        match = re.search(r'(.+\..+#)(.+)', item.href)
        href, div_id = None, None
        if match:
            div_id = match.group(2)
            if match.group(1):
                href = match.group(1)[:-1]  # drop the trailing '#'
        else:
            match2 = re.search(r'(.+\..+)', item.href)
            if match2 and match2.group(1):
                href = match2.group(1)

        return href, div_id

    def __str__(self):
        # BUG FIX: original was "'' % (self.href, self.id)", which raises
        # TypeError (no conversion specifiers) — the markup was evidently lost.
        return '<NavPoint href=%s id=%s>' % (self.href, self.id)
class ChapterItem:
    """One chapter in the livecarta JSON structure: a title, a single HTML
    content string, and a list of nested ChapterItem children."""

    def __init__(self, title, content, sub_items):
        self.title = title
        self.content = content
        # list of ChapterItem; may be empty or None (treated as empty)
        self.sub_items = sub_items

    def to_dict(self):
        """Recursively serialize this chapter and its children to the
        livecarta dict layout."""
        children = [item.to_dict() for item in self.sub_items] if self.sub_items else []
        return {
            "title": self.title,
            "contents": [self.content],
            "sub_items": children,
        }

    def __str__(self):
        # BUG FIX: original was "'' % self.title", which raises TypeError
        # (no conversion specifier) — the markup was evidently lost.
        return '<ChapterItem title=%s>' % self.title
это у epub3 (не тестировалось todo) + self.href2ids = defaultdict(list) + self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf + self.build_adjacency_list_from_toc(self.ebooklib_book.toc) + self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed + self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} + + if not self.is_toc_valid(): + self.build_adjacency_list_from_spine() + + self.build_anchor2soup() + + # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list + # self.add_missed_items_from_spine() # to contents to the chapter after which it placed in spine + + def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: + # using EpubElements + # for now just for HTML objects, as it is simplest chapter + # todo: check if other chapters exist + nodes = dict() + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + html_text = item.get_body_content() + soup = BeautifulSoup(html_text, features='lxml') + nodes[item.file_name] = soup + + return nodes + + def build_manifest_id2href(self): + links = dict() + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + links[item.id] = item.file_name + + return links + + def build_adjacency_list_from_toc(self, element, lvl=0): + # use book.toc as a root + # todo: read _create_section in get_nav + # todo: try list on hrefs, extra info in another db + + if isinstance(element, Link): + # todo: check if link exists + node = NavPoint(element) + if node.id: + self.id_anchor_exist_in_nav_points = True + self.href2ids[node.href].append(node.id) + self.adjacency_list[node] = None + return node + + elif isinstance(element, tuple): + first, second = element + assert isinstance(first, Section) + node = NavPoint(first) + if node.id: + self.id_anchor_exist_in_nav_points = True + self.href2ids[node.href].append(node.id) + + sub_nodes = [] + for i in second: + 
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) + + self.adjacency_list[node] = sub_nodes + return node + + elif isinstance(element, list) and (lvl == 0): + sub_nodes = [] + for i in element: + sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) + + self.adjacency_list[-1] = sub_nodes + + else: + assert 0, f'Error. Element is not tuple/Link instance: {type(element)}' + + def is_toc_valid(self): + if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): + return False + return True + + def build_adjacency_list_from_spine(self): + manifest_id2href = self.build_manifest_id2href() + self.adjacency_list = { + -1: [] + } + for id_, _ in self.ebooklib_book.spine: + node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) + self.adjacency_list[-1].append(node) + + def mark_and_line_href2soup_html(self): + # mark + for href in self.href2soup_html: + ids = self.href2ids[href] + for i in ids: + soup = self.href2soup_html[href] + tag = soup.find(id=i) + new_h = soup.new_tag('h1') + new_h.attrs['class'] = 'internal-mark' + new_h.attrs['id'] = i + tag.insert_before(new_h) + + # go to line structure + for href in self.href2soup_html: + soup = self.href2soup_html[href] + self.href2soup_html[href] = unwrap_structural_tags(soup) + + def build_one_anchored_section(self, node): + """ + к этому моементу html soup уже существует в линейном виде + - если не в линейном - то мы не виноваты + + есть 3 случая: + id оборачивает весь контент, + id оборачивает контент чаптера и под-чаптера, + id только указывает на заголовок + + во всех 3х случаях мы знаем где начало заголовка. 
Поэтому + глава - это все теги от текущего заголовка - до какого угодно следущющего + + заголовок принимается в расчет если в toc есть указание id,тогда заголовок - + это любой тег с id из toc + :return: + """ + if node.id: + soup = self.href2soup_html[node.href] + chapter_tags = get_tags_between_ids(first_id=node.id, href=node.href, html_soup=soup) + new_tree = BeautifulSoup('', 'html.parser') + for tag in chapter_tags: + new_tree.append(tag) + self.id_anchor2soup[(node.href, node.id)] = new_tree + + if self.adjacency_list.get(node): + for sub_node in self.adjacency_list[node]: + self.build_one_anchored_section(sub_node) + + print(f'Chapter: {node.href, node.id} is split.') + + def build_anchor2soup(self): + nav_points = self.adjacency_list[-1] + if self.id_anchor_exist_in_nav_points: + for point in nav_points: + self.build_one_anchored_section(point) + + def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem: + title = node.title + if node.id: + content = self.id_anchor2soup[(node.href, node.id)] + else: + content = self.href2soup_html[node.href] + content_preprocessed = str(content) # todo self.preprocess_html(content, node.id) + content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed) + sub_nodes = [] + # warning! 
not EpubHtmlItems won;t be added to chapter + if self.adjacency_list.get(node): + for sub_node in self.adjacency_list[node]: + sub_chapter_item = self.node2livecarta_chapter_item(sub_node) + sub_nodes.append(sub_chapter_item) + + # print(f'Chapter: {title} is prepared.') + return ChapterItem(title, content_preprocessed, sub_nodes) + + +if __name__ == "__main__": + adapter = EpubBookAdapter('/home/katerina/PycharmProjects/Jenia/converter/epub/calibri.epub') + + top_level_nav_points = adapter.adjacency_list[-1] + top_level_chapters = [] + + for nav_point in top_level_nav_points: + chapter = adapter.node2livecarta_chapter_item(nav_point) + top_level_chapters.append(chapter) + + l = [x.to_dict() for x in top_level_chapters] + + tmp = { + "content": l + } + + output_file = open('output.out', 'w') + output_file.write(str(tmp)) + + with codecs.open('tmp.json', 'w', encoding='utf-8') as f: + json.dump(tmp, f, ensure_ascii=False) + + diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py new file mode 100644 index 0000000..8e1eb7c --- /dev/null +++ b/src/html_epub_preprocessor.py @@ -0,0 +1,104 @@ +import re + +from bs4 import BeautifulSoup, NavigableString + + +def preprocess_image(): + pass + + +def preprocess_table(): + pass + + +def preprocess_quote(): + pass + + +def clean_heading_in_content(): + pass + + +def preprocess_footnotes(): + pass + + +def add_fonts(): + pass + + +def unwrap_structural_tags(body_tag): + divs = body_tag.find_all("div") + for div in divs: + div.unwrap() + + secs = body_tag.find_all("section") + for s in secs: + s.unwrap() + + articles = body_tag.find_all("article") + for s in articles: + s.unwrap() + + articles = body_tag.find_all("main") + for s in articles: + s.unwrap() + + articles = body_tag.find_all("body") + for s in articles: + s.unwrap() + + # articles = body_tag.find_all("html") + # for s in articles: + # s.unwrap() + + spans = body_tag.find_all("span") + # not all cases, if span has

def str2html_soup(html_text: str, element_id=None):
    """Return the HTML of the element with ``element_id`` inside ``html_text``,
    or the text unchanged when no id is given.

    Returns the string 'None' when the id is not found (str of a missing tag)
    — preserved from the original behavior; TODO confirm callers expect this.
    """
    if element_id:
        html_soup = BeautifulSoup(html_text, features='lxml')
        return str(html_soup.find(id=element_id))
    # No id requested: parsing would be discarded work, return the input as-is.
    return str(html_text)


def get_tags_between_ids(first_id, href, html_soup):
    """Extract every sibling following the <h1 class="internal-mark"> marker
    with id ``first_id``, up to (not including) the next such marker.

    The collected tags are detached from ``html_soup`` via ``extract()`` and
    the remaining soup is ``smooth()``-ed.

    :param href: used only for the error message.
    :raises ValueError: when no marker with ``first_id`` exists.  (BUG FIX:
        original used ``assert 0`` — stripped under ``python -O``, which would
        then raise UnboundLocalError on ``tags``; fail loudly instead.)
    """
    marker = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if marker is None:
        raise ValueError(f'Warning: no match for {first_id, href}')

    tags = []
    sibling = marker.next_sibling
    while sibling:
        # name/attrs access is safe: NavigableStrings fail the name check first
        if sibling.name == 'h1' and sibling.attrs.get('class') == 'internal-mark':
            break
        tags.append(sibling)
        sibling = sibling.next_sibling

    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags