epub converter: add footnotes, list processing

This commit is contained in:
shirshasa
2021-04-19 11:20:20 +03:00
parent c6a4a5fac4
commit 1df37b6122
2 changed files with 120 additions and 15 deletions

View File

@@ -13,9 +13,17 @@ from ebooklib.utils import debug
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
preprocess_image preprocess_image, preprocess_footnotes
# epub3 examples:
# https://github.com/IDPF/epub3-samples
# specification:
# https://idpf.github.io/epub-vocabs/structure/
# footnotes:
# http://www.theheratik.net/books/tech-epub/chapter-8/
# http://kb.daisy.org/publishing/docs/html/epub-type.html
# todo: http://kb.daisy.org/publishing/docs/html/notes.html
# todo: https://docs.python.org/3/howto/unicode.html # todo: https://docs.python.org/3/howto/unicode.html
@@ -34,6 +42,10 @@ class EpubBookAdapter:
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.footnotes = []
for href in self.href2soup_html:
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
noteref_attr_name='data-type'))
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
# если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo) # если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
self.href2ids = defaultdict(list) self.href2ids = defaultdict(list)
@@ -71,8 +83,6 @@ class EpubBookAdapter:
def build_adjacency_list_from_toc(self, element, lvl=0): def build_adjacency_list_from_toc(self, element, lvl=0):
# use book.toc as a root # use book.toc as a root
# todo: read _create_section in get_nav
# todo: try list on hrefs, extra info in another db
if isinstance(element, Link): if isinstance(element, Link):
# todo: check if link exists # todo: check if link exists
@@ -210,7 +220,8 @@ if __name__ == "__main__":
l = [x.to_dict() for x in top_level_chapters] l = [x.to_dict() for x in top_level_chapters]
tmp = { tmp = {
"content": l "content": l,
"footnotes": adapter.footnotes
} }
with codecs.open('tmp.json', 'w', encoding='utf-8') as f: with codecs.open('tmp.json', 'w', encoding='utf-8') as f:

View File

@@ -1,8 +1,9 @@
import os import os
import pathlib import pathlib
import re import re
from typing import List
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString, Tag
from src.access import Access from src.access import Access
@@ -25,15 +26,16 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
return link return link
def preprocess_image(body_tag, href2img_content, path_to_html, access=None): def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
img_tags = body_tag.find_all('img') img_tags = body_tag.find_all('img')
for img in img_tags: for img in img_tags:
path_to_img_from_html = img.attrs.get('src') path_to_img_from_html = img.attrs.get('src')
html_folder = os.path.dirname(path_to_html) html_folder = os.path.dirname(path_to_html)
path_to_img_from_root = os.path.normpath(os.path.join(html_folder ,path_to_img_from_html)) path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))
assert path_to_img_from_root in href2img_content, f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' assert path_to_img_from_root in href2img_content, \
f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
img_content = href2img_content[path_to_img_from_root] img_content = href2img_content[path_to_img_from_root]
if access is not None: if access is not None:
@@ -44,6 +46,10 @@ def preprocess_image(body_tag, href2img_content, path_to_html, access=None):
img.attrs['src'] = str(new_folder) img.attrs['src'] = str(new_folder)
def preprocess_figure():
    """Placeholder for <figure> tag preprocessing; intentionally a no-op for now."""
    pass
def preprocess_table(): def preprocess_table():
pass pass
@@ -52,7 +58,20 @@ def preprocess_quote():
pass pass
def _process_lists(body_tag):
    """Flatten <li> elements: copy the first nested <p>'s attributes onto
    the <li> itself, then unwrap that <p> so its children move up a level."""
    for item in body_tag.find_all("li"):
        paragraph = item.p
        if paragraph:
            item.attrs.update(paragraph.attrs)
            paragraph.unwrap()
def clean_heading_in_content(content: Tag, title: str):
for child in content.contents: for child in content.contents:
if child.text and re.sub(r'([\n\t\xa0])', '', child.text): if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
if title == child.text: if title == child.text:
@@ -60,8 +79,82 @@ def clean_heading_in_content(content, title: str):
break break
def replace_with_livecarta_anchor_tag(anchor, i):
    """Replace *anchor* with a LiveCarta footnote marker.

    The marker is a <sup class="footnote-element"> numbered ``i + 1``
    (via ``data-id`` and ``id='footnote-<n>'``) whose visible text is '*'.
    """
    number = i + 1
    marker = BeautifulSoup(features='lxml').new_tag('sup')
    marker.attrs['class'] = 'footnote-element'
    marker.attrs['data-id'] = number
    marker.attrs['id'] = f'footnote-{number}'
    marker.string = '*'
    anchor.replace_with(marker)
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]:
    """Extract footnote bodies and replace each noteref with a LiveCarta marker.

    NOTE: this step must run earlier than font injection in the pipeline.

    Expected EPUB3-style markup::

        <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
        <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    :param source_html_tag: soup of the document being processed.
    :param href2soup_html: maps hrefs to their soups; consulted when a
        noteref points into another file (may stay None for local-only refs).
    :param noteref_attr_name: attribute that marks noterefs ('epub:type'
        for EPUB3; some converters use e.g. 'data-type').
    :return: footnote bodies as HTML strings; each referenced footnote tag
        is decomposed from its soup, and each noteref anchor is replaced
        via ``replace_with_livecarta_anchor_tag``.
    :raises AssertionError: on a malformed href, duplicate ids, or a
        missing footnote target.
    """
    # Hoisted patterns: compiled once instead of per iteration.
    # Internal links look like "file.xhtml#id" or "#id".
    internal_link_re = re.compile(r'(^.+\.(html|xhtml)#.+)|(^#.+)')
    footnote_type_re = re.compile('note|footnote|endnote|rearenote')

    def parse_a_tag_href(s: str):
        # Split "file#id" into (file, id); file is '' for same-document refs.
        # maxsplit=1 keeps any later '#' inside the id instead of crashing.
        assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
        f, id_ = s.split('#', 1)
        return f, id_

    def verify_footnote_tag(tags: list, target_html_tag, element_id, href, noteref_tag):
        # An id must be unique within a document.
        assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
        if not tags:
            # Fallback: the target exists but carries no epub:type marker
            # (common when noteref_attr_name is not 'epub:type').
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if anchored_tags:
                print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
                return anchored_tags
            # Explicit raise: survives `python -O`, unlike `assert 0`.
            raise AssertionError(f'Error, No element with id: {href} found.')
        return tags

    def get_footnote_tags2str(t):
        # Serialise children: raw text kept as-is, child tags contribute
        # their inner HTML only.  isinstance (not `type is`) also catches
        # NavigableString subclasses such as Comment, which have no
        # decode_contents() and would otherwise crash.
        parts = []
        for child in t.children:
            if isinstance(child, NavigableString):
                parts.append(str(child))
            else:
                parts.append(child.decode_contents())
        return ''.join(parts).strip()

    def remove_internal_links_with_text(t):
        # Back-references into the book are meaningless once the footnote
        # text is lifted out, so drop them together with their text.
        for tag_a in t.find_all('a', {'href': internal_link_re}):
            tag_a.decompose()

    footnotes = []
    noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
    # Noterefs without an href cannot be resolved; remove them outright.
    bad_noterefs_tags = {tag for tag in noterefs_tags if not tag.attrs.get('href')}
    noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
    for tag in bad_noterefs_tags:
        tag.decompose()

    for i, noteref_tag in enumerate(noterefs_tags):
        href = noteref_tag.attrs['href']
        file, element_id = parse_a_tag_href(href)
        # Empty file part -> the footnote lives in the same document.
        target_html_tag = source_html_tag if not file else href2soup_html[file]
        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
                                                               attrs={'epub:type': footnote_type_re}))
        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags, target_html_tag,
                                                     element_id, href, noteref_tag)
        footnote_tag = expected_footnote_tags[0]
        replace_with_livecarta_anchor_tag(noteref_tag, i)
        remove_internal_links_with_text(footnote_tag)
        content = get_footnote_tags2str(footnote_tag)
        footnote_tag.decompose()  # its content now lives only in the result list
        footnotes.append(content)
    return footnotes
def add_fonts(): def add_fonts():
@@ -145,11 +238,11 @@ def get_tags_between_ids(first_id, href, html_soup):
return tags return tags
def prepare_title_and_content(title, content: BeautifulSoup): def prepare_title_and_content(title, content_tag: BeautifulSoup):
title_str = BeautifulSoup(title, features='lxml').string title_str = BeautifulSoup(title, features='lxml').string
# 0. cleaning \n # 0. cleaning \n
to_remove = [] to_remove = []
for child in content.contents: for child in content_tag.contents:
if isinstance(child, NavigableString): if isinstance(child, NavigableString):
s = re.sub(r'([\n\t\xa0])', '', child.string) s = re.sub(r'([\n\t\xa0])', '', child.string)
if s == '': if s == '':
@@ -157,8 +250,9 @@ def prepare_title_and_content(title, content: BeautifulSoup):
[x.extract() for x in to_remove] [x.extract() for x in to_remove]
# 1. rule#1 for heading removal # 1. rule#1 for heading removal
clean_heading_in_content(content, title_str) clean_heading_in_content(content_tag, title_str)
_process_lists(content_tag)
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content)) content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
return title_str, content_str return title_str, content_str