forked from LiveCarta/BookConverter
epub converter: add footnotes, list processing
This commit is contained in:
@@ -13,9 +13,17 @@ from ebooklib.utils import debug
|
|||||||
|
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
||||||
preprocess_image
|
preprocess_image, preprocess_footnotes
|
||||||
|
|
||||||
|
|
||||||
|
# epub3 examples:
|
||||||
|
# https://github.com/IDPF/epub3-samples
|
||||||
|
# specification:
|
||||||
|
# https://idpf.github.io/epub-vocabs/structure/
|
||||||
|
# footnotes:
|
||||||
|
# http://www.theheratik.net/books/tech-epub/chapter-8/
|
||||||
|
# http://kb.daisy.org/publishing/docs/html/epub-type.html
|
||||||
|
# todo: http://kb.daisy.org/publishing/docs/html/notes.html
|
||||||
# todo: https://docs.python.org/3/howto/unicode.html
|
# todo: https://docs.python.org/3/howto/unicode.html
|
||||||
|
|
||||||
|
|
||||||
@@ -34,6 +42,10 @@ class EpubBookAdapter:
|
|||||||
|
|
||||||
self.id_anchor_exist_in_nav_points = False
|
self.id_anchor_exist_in_nav_points = False
|
||||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||||
|
self.footnotes = []
|
||||||
|
for href in self.href2soup_html:
|
||||||
|
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
|
||||||
|
noteref_attr_name='data-type'))
|
||||||
# if the spine in content.opf has a toc attribute -> the ncx file can be located -> navMap can be read from it
|
# if the spine in content.opf has a toc attribute -> the ncx file can be located -> navMap can be read from it
|
||||||
# if it is absent, the nav tag is looked up in the manifest -> EpubNav. This is the epub3 case (todo: not tested)
|
# if it is absent, the nav tag is looked up in the manifest -> EpubNav. This is the epub3 case (todo: not tested)
|
||||||
self.href2ids = defaultdict(list)
|
self.href2ids = defaultdict(list)
|
||||||
@@ -71,8 +83,6 @@ class EpubBookAdapter:
|
|||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
def build_adjacency_list_from_toc(self, element, lvl=0):
|
||||||
# use book.toc as a root
|
# use book.toc as a root
|
||||||
# todo: read _create_section in get_nav
|
|
||||||
# todo: try list on hrefs, extra info in another db
|
|
||||||
|
|
||||||
if isinstance(element, Link):
|
if isinstance(element, Link):
|
||||||
# todo: check if link exists
|
# todo: check if link exists
|
||||||
@@ -210,7 +220,8 @@ if __name__ == "__main__":
|
|||||||
l = [x.to_dict() for x in top_level_chapters]
|
l = [x.to_dict() for x in top_level_chapters]
|
||||||
|
|
||||||
tmp = {
|
tmp = {
|
||||||
"content": l
|
"content": l,
|
||||||
|
"footnotes": adapter.footnotes
|
||||||
}
|
}
|
||||||
|
|
||||||
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
from src.access import Access
|
from src.access import Access
|
||||||
|
|
||||||
@@ -25,15 +26,16 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
|
|||||||
return link
|
return link
|
||||||
|
|
||||||
|
|
||||||
def preprocess_image(body_tag, href2img_content, path_to_html, access=None):
|
def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
|
||||||
img_tags = body_tag.find_all('img')
|
img_tags = body_tag.find_all('img')
|
||||||
|
|
||||||
for img in img_tags:
|
for img in img_tags:
|
||||||
path_to_img_from_html = img.attrs.get('src')
|
path_to_img_from_html = img.attrs.get('src')
|
||||||
html_folder = os.path.dirname(path_to_html)
|
html_folder = os.path.dirname(path_to_html)
|
||||||
path_to_img_from_root = os.path.normpath(os.path.join(html_folder ,path_to_img_from_html))
|
path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))
|
||||||
|
|
||||||
assert path_to_img_from_root in href2img_content, f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
|
assert path_to_img_from_root in href2img_content, \
|
||||||
|
f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
|
||||||
|
|
||||||
img_content = href2img_content[path_to_img_from_root]
|
img_content = href2img_content[path_to_img_from_root]
|
||||||
if access is not None:
|
if access is not None:
|
||||||
@@ -44,6 +46,10 @@ def preprocess_image(body_tag, href2img_content, path_to_html, access=None):
|
|||||||
img.attrs['src'] = str(new_folder)
|
img.attrs['src'] = str(new_folder)
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_figure():
    """
    Placeholder: figure preprocessing is not implemented yet.

    Takes no arguments, does nothing and returns None.
    """
|
||||||
|
|
||||||
|
|
||||||
def preprocess_table():
    """
    Placeholder: table preprocessing is not implemented yet.

    Takes no arguments, does nothing and returns None.
    """
|
||||||
|
|
||||||
@@ -52,7 +58,20 @@ def preprocess_quote():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def clean_heading_in_content(content, title: str):
|
def _process_lists(body_tag):
    """
    Flatten the first <p> found inside every <li> of `body_tag`.

    The attributes of that <p> are copied onto the <li> (same-named
    attributes of the <li> are overwritten) and the <p> is then unwrapped.
    The tree is modified in place; nothing is returned.
    """
    for list_item in body_tag.find_all("li"):
        paragraph = list_item.p
        if not paragraph:
            # No <p> inside this list item - nothing to flatten.
            continue

        list_item.attrs.update(paragraph.attrs)
        paragraph.unwrap()
|
||||||
|
|
||||||
|
|
||||||
|
def clean_heading_in_content(content: Tag, title: str):
|
||||||
for child in content.contents:
|
for child in content.contents:
|
||||||
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
||||||
if title == child.text:
|
if title == child.text:
|
||||||
@@ -60,8 +79,82 @@ def clean_heading_in_content(content, title: str):
|
|||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def preprocess_footnotes():
|
def replace_with_livecarta_anchor_tag(anchor, i):
    """
    Put a LiveCarta footnote marker in place of `anchor`.

    The marker is a <sup> tag with class 'footnote-element', a 'data-id'
    of i + 1, an id of 'footnote-<i + 1>' and '*' as its text.

    :param anchor: element to replace; it is swapped out via `replace_with`.
    :param i: zero-based footnote index; marker numbering starts from one.
    """
    number = i + 1
    factory = BeautifulSoup(features='lxml')
    marker = factory.new_tag('sup')

    # Attributes are assigned in this fixed order: class, data-id, id.
    for attr_name, attr_value in (
        ('class', 'footnote-element'),
        ('data-id', number),
        ('id', f'footnote-{number}'),
    ):
        marker[attr_name] = attr_value

    marker.string = '*'
    anchor.replace_with(marker)
|
||||||
|
|
||||||
|
|
||||||
|
# Hrefs that point inside the book: "file.html#id", "file.xhtml#id" or "#id".
_INTERNAL_LINK_HREF = re.compile(r'(^.+\.(html|xhtml)#.+)|(^#.+)')

# epub:type values accepted for a footnote body. Beautiful Soup applies regex
# filters with search(), so any value that merely contains "note" matches.
_FOOTNOTE_EPUB_TYPE = re.compile('note|footnote|endnote|rearenote')


def _split_footnote_href(href: str):
    """Split `href` at its first '#' into (file part, element id)."""
    if '#' not in href:
        raise AssertionError(f'Error. Unexpected href: {href} in a tag. Href must contain an id.')
    # partition() splits once, so an id that itself contains '#' is kept whole.
    file, _, element_id = href.partition('#')
    return file, element_id


def _find_footnote_tag(target_html_tag, element_id, href, noteref_tag):
    """Return the element `href` refers to, preferring one typed as a footnote."""
    typed = target_html_tag.find_all(id=element_id, attrs={'epub:type': _FOOTNOTE_EPUB_TYPE})
    if len(typed) > 1:
        raise AssertionError(f'Error, Multiple id: {href}.\n{typed}')
    if typed:
        return typed[0]

    # Fallback: no footnote-typed element, accept any element with this id.
    anchored = target_html_tag.find_all(id=element_id)
    if not anchored:
        raise AssertionError(f'Error, No element with id: {href} found.')
    print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
    return anchored[0]


def _drop_internal_links(footnote_tag):
    """Delete <a> tags (together with their text) that link inside the book."""
    for tag_a in footnote_tag.find_all('a', {'href': _INTERNAL_LINK_HREF}):
        tag_a.decompose()


def _footnote_inner_html(footnote_tag) -> str:
    """Join the direct children of `footnote_tag` into one stripped string."""
    parts = []
    for child in footnote_tag.children:
        if isinstance(child, Tag):
            # Inner markup only: the child's own wrapper tag is dropped.
            parts.append(child.decode_contents())
        elif type(child) is NavigableString:
            parts.append(str(child))
        # Other node kinds (e.g. comments) have no decode_contents() and
        # carry no footnote text, so they are skipped instead of crashing.
    return ''.join(parts).strip()


def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]:
    """
    Extract footnotes referenced from `source_html_tag`.

    This function should run earlier than adding fonts in the pipeline.

    Example of the expected markup:

    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    Every tag whose `noteref_attr_name` attribute equals 'noteref' is treated
    as a footnote reference. References without an href are deleted. Each
    remaining reference is replaced by a LiveCarta marker, and the element it
    points to is removed from its document after its content is collected.
    Both the source document and the referenced documents are modified in place.

    NOTE(review): marker numbering restarts at 1 on every call, so a caller
    that concatenates results from several documents gets repeated ids.

    :param source_html_tag: parsed document that contains the references.
    :param href2soup_html: parsed documents keyed by the file part of an href;
        needed only for references that point into another file.
    :param noteref_attr_name: name of the attribute that marks a reference.
    :return: footnote contents, in the order of their references.
    :raises AssertionError: href without '#', several footnote candidates for
        one id, or no element with the referenced id.
    :raises KeyError: the reference points to a file missing from `href2soup_html`.
    """
    footnotes = []

    # Plain lists, not a set: bs4 tags compare equal by markup, so a set would
    # collapse identical-looking anchors and leave some of them undeleted.
    noterefs_tags = []
    bad_noterefs_tags = []
    for tag in source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'}):
        if tag.attrs.get('href'):
            noterefs_tags.append(tag)
        else:
            bad_noterefs_tags.append(tag)
    for tag in bad_noterefs_tags:
        tag.decompose()

    for i, noteref_tag in enumerate(noterefs_tags):
        href = noteref_tag.attrs['href']
        file, element_id = _split_footnote_href(href)

        if not file:
            # "#id" - the footnote lives in the same document.
            target_html_tag = source_html_tag
        else:
            if href2soup_html is None or file not in href2soup_html:
                raise KeyError(f'Error. No parsed document for file {file} (href: {href}).')
            target_html_tag = href2soup_html[file]

        footnote_tag = _find_footnote_tag(target_html_tag, element_id, href, noteref_tag)

        replace_with_livecarta_anchor_tag(noteref_tag, i)
        _drop_internal_links(footnote_tag)
        content = _footnote_inner_html(footnote_tag)

        # The footnote body is taken out of the document once it is collected.
        footnote_tag.decompose()
        footnotes.append(content)

    return footnotes
|
||||||
|
|
||||||
|
|
||||||
def add_fonts():
|
def add_fonts():
|
||||||
@@ -145,11 +238,11 @@ def get_tags_between_ids(first_id, href, html_soup):
|
|||||||
return tags
|
return tags
|
||||||
|
|
||||||
|
|
||||||
def prepare_title_and_content(title, content: BeautifulSoup):
|
def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
||||||
title_str = BeautifulSoup(title, features='lxml').string
|
title_str = BeautifulSoup(title, features='lxml').string
|
||||||
# 0. cleaning \n
|
# 0. cleaning \n
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for child in content.contents:
|
for child in content_tag.contents:
|
||||||
if isinstance(child, NavigableString):
|
if isinstance(child, NavigableString):
|
||||||
s = re.sub(r'([\n\t\xa0])', '', child.string)
|
s = re.sub(r'([\n\t\xa0])', '', child.string)
|
||||||
if s == '':
|
if s == '':
|
||||||
@@ -157,8 +250,9 @@ def prepare_title_and_content(title, content: BeautifulSoup):
|
|||||||
|
|
||||||
[x.extract() for x in to_remove]
|
[x.extract() for x in to_remove]
|
||||||
# 1. rule#1 for heading removal
|
# 1. rule#1 for heading removal
|
||||||
clean_heading_in_content(content, title_str)
|
clean_heading_in_content(content_tag, title_str)
|
||||||
|
_process_lists(content_tag)
|
||||||
|
|
||||||
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content))
|
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||||
return title_str, content_str
|
return title_str, content_str
|
||||||
|
|||||||
Reference in New Issue
Block a user