From 1df37b6122eaebd1327a1bd49a18f6ed1e0e2094 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Mon, 19 Apr 2021 11:20:20 +0300 Subject: [PATCH] epub converter: add footnotes, list processing --- src/epub_converter.py | 19 ++++-- src/html_epub_preprocessor.py | 116 ++++++++++++++++++++++++++++++---- 2 files changed, 120 insertions(+), 15 deletions(-) diff --git a/src/epub_converter.py b/src/epub_converter.py index bb8ee2b..b54eec4 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -13,9 +13,17 @@ from ebooklib.utils import debug from src.data_objects import ChapterItem, NavPoint from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ - preprocess_image + preprocess_image, preprocess_footnotes +# epub3 examples: +# https://github.com/IDPF/epub3-samples +# specification: +# https://idpf.github.io/epub-vocabs/structure/ +# footnotes: +# http://www.theheratik.net/books/tech-epub/chapter-8/ +# http://kb.daisy.org/publishing/docs/html/epub-type.html +# todo: http://kb.daisy.org/publishing/docs/html/notes.html # todo: https://docs.python.org/3/howto/unicode.html @@ -34,6 +42,10 @@ class EpubBookAdapter: self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() + self.footnotes = [] + for href in self.href2soup_html: + self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html, + noteref_attr_name='data-type')) # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap # если его там нет, пробуют искать nav tag в manifest -> EpubNav. 
это у epub3 (не тестировалось todo) self.href2ids = defaultdict(list) @@ -71,8 +83,6 @@ class EpubBookAdapter: def build_adjacency_list_from_toc(self, element, lvl=0): # use book.toc as a root - # todo: read _create_section in get_nav - # todo: try list on hrefs, extra info in another db if isinstance(element, Link): # todo: check if link exists @@ -210,7 +220,8 @@ if __name__ == "__main__": l = [x.to_dict() for x in top_level_chapters] tmp = { - "content": l + "content": l, + "footnotes": adapter.footnotes } with codecs.open('tmp.json', 'w', encoding='utf-8') as f: diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 0f24c9c..c615f35 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -1,8 +1,9 @@ import os import pathlib import re +from typing import List -from bs4 import BeautifulSoup, NavigableString +from bs4 import BeautifulSoup, NavigableString, Tag from src.access import Access @@ -25,15 +26,16 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id return link -def preprocess_image(body_tag, href2img_content, path_to_html, access=None): +def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None): img_tags = body_tag.find_all('img') for img in img_tags: path_to_img_from_html = img.attrs.get('src') html_folder = os.path.dirname(path_to_html) - path_to_img_from_root = os.path.normpath(os.path.join(html_folder ,path_to_img_from_html)) + path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html)) - assert path_to_img_from_root in href2img_content, f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' + assert path_to_img_from_root in href2img_content, \ + f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' 
img_content = href2img_content[path_to_img_from_root] if access is not None: @@ -44,6 +46,10 @@ def preprocess_image(body_tag, href2img_content, path_to_html, access=None): img.attrs['src'] = str(new_folder) +def preprocess_figure(): + pass + + def preprocess_table(): pass @@ -52,7 +58,20 @@ def preprocess_quote(): pass -def clean_heading_in_content(content, title: str): +def _process_lists(body_tag): + """ + Function to process tags
<li>. + Unwrap <p>
    tags. + """ + li_tags = body_tag.find_all("li") + + for il_tag in li_tags: + if il_tag.p: + il_tag.attrs.update(il_tag.p.attrs) + il_tag.p.unwrap() + + +def clean_heading_in_content(content: Tag, title: str): for child in content.contents: if child.text and re.sub(r'([\n\t\xa0])', '', child.text): if title == child.text: @@ -60,8 +79,82 @@ def clean_heading_in_content(content, title: str): break -def preprocess_footnotes(): - pass +def replace_with_livecarta_anchor_tag(anchor, i): + new_tag = BeautifulSoup(features='lxml').new_tag('sup') + new_tag['class'] = 'footnote-element' + new_tag['data-id'] = i + 1 + new_tag['id'] = f'footnote-{i + 1}' + new_tag.string = '*' + anchor.replace_with(new_tag) + + +def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]: + """ + This function should be earlier that adding fonts in pipeline. + +

    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>

    + + + """ + footnotes = [] + noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'}) + bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')]) + noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags] + [tag.decompose() for tag in bad_noterefs_tags] + + def parse_a_tag_href(s: str): + assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' + f, id_ = s.split('#') + return f, id_ + + def verify_footnote_tag(tags: list): + assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' + if len(tags) == 0: + anchored_tags = list(target_html_tag.find_all(id=element_id)) + if len(anchored_tags): + print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') + return anchored_tags + else: + assert 0, f'Error, No element with id: {href} found.' + + return tags + + def get_footnote_tags2str(t): + unicode_string = '' + for child in t.children: + if type(child) is NavigableString: + unicode_string += str(child) + else: + unicode_string += child.decode_contents() + + return unicode_string.strip() + + def remove_internal_links_with_text(t): + for tag_a in t.find_all('a', {'href': re.compile('(^.+\.(html|xhtml)#.+)|(^#.+)')}): + tag_a.decompose() + + for i, noteref_tag in enumerate(noterefs_tags): + href = noteref_tag.attrs['href'] + file, element_id = parse_a_tag_href(href) + if not file: + target_html_tag = source_html_tag + else: + target_html_tag = href2soup_html[file] + + possible_footnote = 'note|footnote|endnote|rearenote' + expected_footnote_tags = list(target_html_tag.find_all(id=element_id, + attrs={'epub:type': re.compile(possible_footnote)})) + + expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) + footnote_tag = expected_footnote_tags[0] + replace_with_livecarta_anchor_tag(noteref_tag, i) + remove_internal_links_with_text(footnote_tag) + content = get_footnote_tags2str(footnote_tag) + + footnote_tag.decompose() + 
footnotes.append(content) + + return footnotes def add_fonts(): @@ -145,11 +238,11 @@ def get_tags_between_ids(first_id, href, html_soup): return tags -def prepare_title_and_content(title, content: BeautifulSoup): +def prepare_title_and_content(title, content_tag: BeautifulSoup): title_str = BeautifulSoup(title, features='lxml').string # 0. cleaning \n to_remove = [] - for child in content.contents: + for child in content_tag.contents: if isinstance(child, NavigableString): s = re.sub(r'([\n\t\xa0])', '', child.string) if s == '': @@ -157,8 +250,9 @@ def prepare_title_and_content(title, content: BeautifulSoup): [x.extract() for x in to_remove] # 1. rule#1 for heading removal - clean_heading_in_content(content, title_str) + clean_heading_in_content(content_tag, title_str) + _process_lists(content_tag) - content_str = re.sub(r'([\n\t\xa0])', ' ', str(content)) + content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) return title_str, content_str