epub converter: add internal links processing

- fix heading cleanup
- fix handling when the chapter mark is not at the first level
This commit is contained in:
shirshasa
2021-05-25 12:58:14 +03:00
parent 0ac20999b5
commit 3eac136e07
2 changed files with 146 additions and 59 deletions

View File

@@ -1,6 +1,7 @@
import codecs import codecs
import json import json
import logging import logging
import re
from os.path import dirname, normpath, join from os.path import dirname, normpath, join
from collections import defaultdict from collections import defaultdict
from typing import Dict, Union from typing import Dict, Union
@@ -22,9 +23,9 @@ class EpubPostprocessor:
def __init__(self, file, access=None, logger=None): def __init__(self, file, access=None, logger=None):
self.file = file self.file = file
self.access = access self.access = access
self.logger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
self.internal_links_found = 0
self.logger.log('Image processing.') self.logger.log('Image processing.')
self.href2img_bytes = {} self.href2img_bytes = {}
self.old_image_path2_aws_path = {} self.old_image_path2_aws_path = {}
@@ -42,13 +43,13 @@ class EpubPostprocessor:
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.logger.log('CSS processing.') self.logger.log('CSS files processing.')
self.html_href2css_href = {} self.html_href2css_href = {}
self.css_href2content = {} self.css_href2content = {}
self.build_css_content() self.build_css_content()
# add css # add css
# self.logger.log('CSS styles adding processing.') self.logger.log('CSS styles adding.')
# self.add_css_styles2soup() self.add_css_styles2soup()
self.logger.log('Footnotes processing.') self.logger.log('Footnotes processing.')
self.footnotes = [] self.footnotes = []
@@ -57,16 +58,17 @@ class EpubPostprocessor:
self.logger.log(f'Added {len(self.footnotes)} footnotes.') self.logger.log(f'Added {len(self.footnotes)} footnotes.')
self.logger.log('TOC processing.') self.logger.log('TOC processing.')
self.href2ids = defaultdict(list) self.href2ids = defaultdict(list)
self.added_to_toc_hrefs = [] self.added_to_toc_hrefs = set()
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
if not self.is_toc_valid(): if not self.is_toc_valid():
self.build_adjacency_list_from_spine() self.build_adjacency_list_from_spine()
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
self.logger.log(f'html documents not added to TOC: {not_added}') self.logger.log(f'Html documents not added to TOC: {not_added}.')
# read anchored blocks, split html into separate block # read anchored blocks, split html into separate block
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
self.process_internal_links()
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
self.build_anchor2soup() self.build_anchor2soup()
@@ -131,7 +133,7 @@ class EpubPostprocessor:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.href2ids[node.href].append(node.id) self.href2ids[node.href].append(node.id)
self.adjacency_list[node] = None self.adjacency_list[node] = None
self.added_to_toc_hrefs.append(node.href) self.added_to_toc_hrefs.add(node.href)
return node return node
elif isinstance(element, tuple): elif isinstance(element, tuple):
@@ -147,7 +149,7 @@ class EpubPostprocessor:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[node] = sub_nodes self.adjacency_list[node] = sub_nodes
self.added_to_toc_hrefs.append(node.href) self.added_to_toc_hrefs.add(node.href)
return node return node
elif isinstance(element, list) and (lvl == 0): elif isinstance(element, list) and (lvl == 0):
@@ -173,7 +175,7 @@ class EpubPostprocessor:
for id_, _ in self.ebooklib_book.spine: for id_, _ in self.ebooklib_book.spine:
node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
self.adjacency_list[-1].append(node) self.adjacency_list[-1].append(node)
self.added_to_toc_hrefs.append(node.href) self.added_to_toc_hrefs.add(node.href)
def mark_and_line_href2soup_html(self): def mark_and_line_href2soup_html(self):
# mark # mark
@@ -182,8 +184,8 @@ class EpubPostprocessor:
for i in ids: for i in ids:
soup = self.href2soup_html[href] soup = self.href2soup_html[href]
tag = soup.find(id=i) tag = soup.find(id=i)
new_h = soup.new_tag('h1') new_h = soup.new_tag('tmp')
new_h.attrs['class'] = 'internal-mark' new_h.attrs['class'] = 'converter-chapter-mark'
new_h.attrs['id'] = i new_h.attrs['id'] = i
tag.insert_before(new_h) tag.insert_before(new_h)
@@ -192,6 +194,64 @@ class EpubPostprocessor:
soup = self.href2soup_html[href] soup = self.href2soup_html[href]
self.href2soup_html[href] = unwrap_structural_tags(soup) self.href2soup_html[href] = unwrap_structural_tags(soup)
@staticmethod
def _create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + id_
def process_internal_links(self):
    """Rewrite in-book hyperlinks into server-side placeholders.

    Two passes over every HTML document that made it into the TOC:
      1. rename every element id to a cross-document-unique id
         (via ``_create_unique_id``), skipping converter chapter marks;
      2. for each ``<a>`` whose href points inside the book
         ('file.html#id', 'file.xhtml#id' or '#id'), resolve the target
         document, replace the href with a ``placeholder`` attribute the
         server can turn into a real link, and tag the target element
         with class ``link-anchor``.
    Unresolvable links are marked ``converter-mark="bad-link"`` and
    logged. Increments ``self.internal_links_found`` per resolved link.
    """
    # rebuild ids to be unique in all documents
    for href in self.added_to_toc_hrefs:
        for tag in self.href2soup_html[href].find_all(attrs={'id': re.compile(r'.+')}):
            # NOTE(review): for tags parsed from markup bs4 returns 'class'
            # as a list, so this string comparison only matches marks whose
            # class was assigned directly as a string (see
            # mark_and_line_href2soup_html) — confirm parsed marks are not
            # accidentally renamed here.
            if tag.attrs.get('class') == 'converter-chapter-mark':
                continue
            new_id = self._create_unique_id(href, tag.attrs['id'])
            tag.attrs['id'] = new_id
    # write placeholder to all internal links
    internal_link_reg = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
    for href in self.added_to_toc_hrefs:
        soup = self.href2soup_html[href]
        for internal_link_tag in soup.find_all('a', {'href': internal_link_reg}):
            # NOTE(review): raises ValueError if the href contains more
            # than one '#' — the regex does not forbid that; confirm inputs.
            href_in_link, id_in_link = internal_link_tag.attrs['href'].split('#')
            # A bare '#fragment' link targets the current document.
            if not href_in_link:
                href_in_link = href
            # find full path: hrefs in links may be relative, so match by
            # substring against the full TOC document paths.
            full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path]
            if not full_path:
                self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. '
                                f'While processing href in {internal_link_tag}.')
                internal_link_tag.attrs['converter-mark'] = 'bad-link'
                continue
            if len(full_path) > 1:
                self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
                                f' while {internal_link_tag} processing. The first one will be chosen.')
            href_in_link = full_path[0]
            new_id = self._create_unique_id(href_in_link, id_in_link)
            anchor_soup = self.href2soup_html[href_in_link]
            anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
            if anchor_tags:
                if len(anchor_tags) > 1:
                    self.logger.log(f'Warning in {href}: multiple anchors: {anchor_tags} found.'
                                    f' While processing {internal_link_tag}')
                anchor_tag = anchor_tags[0]
                # if anchor is found we could add placeholder for link creation on server side.
                internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                # NOTE(review): this overwrites any existing class on the
                # anchor element — confirm that is intended.
                anchor_tag.attrs['class'] = 'link-anchor'
                del internal_link_tag.attrs['href']
                self.internal_links_found += 1
            else:
                internal_link_tag.attrs['converter-mark'] = 'bad-link'
                # 'page' ids are presumably pagination artifacts that are
                # expected to have no anchor — TODO confirm, hence no error log.
                if 'page' not in id_in_link:
                    self.logger.log(f'Error in {href}. While processing {internal_link_tag} no anchor found.'
                                    f' Should be anchor with new id={new_id} in {href_in_link} file.'
                                    f' Old id={id_in_link}')
def build_one_anchored_section(self, node): def build_one_anchored_section(self, node):
""" """
к этому моементу html soup уже существует в линейном виде к этому моементу html soup уже существует в линейном виде
@@ -248,11 +308,12 @@ class EpubPostprocessor:
# warning! not EpubHtmlItems won;t be added to chapter # warning! not EpubHtmlItems won;t be added to chapter
if self.adjacency_list.get(node): if self.adjacency_list.get(node):
for sub_node in self.adjacency_list[node]: for sub_node in self.adjacency_list[node]:
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1) sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item) sub_nodes.append(sub_chapter_item)
if self.logger: if self.logger:
self.logger.log(f'Chapter: {title} is prepared.') indent = ' ' * lvl
self.logger.log(f'{indent}Chapter: {title} is prepared.')
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self): def convert_to_dict(self):
@@ -264,6 +325,8 @@ class EpubPostprocessor:
top_level_chapters.append(chapter) top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
self.logger.log(f'Internal links found: {self.internal_links_found}.')
self.logger.log('End conversion.')
return { return {
"content": top_level_dict_chapters, "content": top_level_dict_chapters,
@@ -275,6 +338,8 @@ if __name__ == "__main__":
logger = logging.getLogger('epub') logger = logging.getLogger('epub')
file_handler = logging.StreamHandler() file_handler = logging.StreamHandler()
logger.addHandler(file_handler) logger.addHandler(file_handler)
file_handler = logging.FileHandler('epub.log', mode='w+')
logger.addHandler(file_handler)
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)

View File

@@ -115,8 +115,12 @@ def _process_lists(body_tag):
def clean_headings_content(content: Tag, title: str): def clean_headings_content(content: Tag, title: str):
for child in content.contents: for child in content.contents:
if child.text and re.sub(r'([\n\t\xa0])', '', child.text): if isinstance(child, NavigableString):
text = re.sub(r'([\n\t\xa0])', ' ', child.text) text = child
else:
text = child.text
if text and re.sub(r'([\n\t\xa0])', '', text):
text = re.sub(r'([\n\t\xa0])', ' ', text)
text = re.sub(r' +', ' ', text).strip() text = re.sub(r' +', ' ', text).strip()
if title == text: if title == text:
child.extract() child.extract()
@@ -196,10 +200,6 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
return unicode_string.strip() return unicode_string.strip()
def remove_internal_links_with_text(t):
    """Remove every internal <a> link from *t*, together with its text.

    Internal means the href carries a fragment: 'file.html#id',
    'file.xhtml#id' or a bare '#id'.
    """
    for tag_a in t.find_all('a', {'href': re.compile('(^.+\.(html|xhtml)#.+)|(^#.+)')}):
        # decompose() deletes the tag and all of its contents from the tree.
        tag_a.decompose()
for i, noteref_tag in enumerate(noterefs_tags): for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs['href'] href = noteref_tag.attrs['href']
file, element_id = parse_a_tag_href(href) file, element_id = parse_a_tag_href(href)
@@ -208,7 +208,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
else: else:
target_html_tag = href2soup_html.get(file) target_html_tag = href2soup_html.get(file)
if not target_html_tag: if not target_html_tag:
print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.') print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
continue continue
possible_footnote = 'note|footnote|endnote|rearenote' possible_footnote = 'note|footnote|endnote|rearenote'
@@ -218,7 +218,6 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0] footnote_tag = expected_footnote_tags[0]
replace_with_livecarta_anchor_tag(noteref_tag, i) replace_with_livecarta_anchor_tag(noteref_tag, i)
remove_internal_links_with_text(footnote_tag)
content = get_footnote_tags2str(footnote_tag) content = get_footnote_tags2str(footnote_tag)
footnote_tag.decompose() footnote_tag.decompose()
@@ -227,45 +226,19 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
return footnotes return footnotes
def add_fonts():
    # Stub: font handling is not implemented; intentionally a no-op.
    pass
def unwrap_structural_tags(body_tag): def unwrap_structural_tags(body_tag):
def _add_span_to_save_ids_for_links(tag_to_be_removed):
    """Preserve the id of a tag that is about to be unwrapped.

    Inserts an empty <span> carrying the same id just before the tag, so
    internal links targeting that id keep a valid anchor after unwrap().
    """
    # NOTE(review): relies on the enclosing scope's `body_tag` providing
    # `new_tag` — that is a BeautifulSoup (soup) method, not a plain Tag
    # method; confirm body_tag is the soup object here.
    if tag_to_be_removed.attrs.get('id'):
        new_tag = body_tag.new_tag("span")
        new_tag.attrs['id'] = tag_to_be_removed.attrs['id']
        tag_to_be_removed.insert_before(new_tag)
structural_tags_names = [ structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p' 'figure', 'footer', 'iframe', 'span', 'p'
] ]
# should be before other tags processing, not to remove converter empty tags with id
for div in body_tag.find_all("div"):
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
div.unwrap()
for s in body_tag.find_all("section"):
s.unwrap()
for s in body_tag.find_all("article"):
s.unwrap()
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
s.unwrap()
for s in body_tag.find_all("body"):
s.unwrap()
for s in body_tag.find_all("html"):
s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# not all cases, if span has <p>s and NavigableString, it won't unwrap # not all cases, if span has <p>s and NavigableString, it won't unwrap
for s in body_tag.find_all("span"): for s in body_tag.find_all("span"):
if s.contents: if s.contents:
@@ -274,6 +247,55 @@ def unwrap_structural_tags(body_tag):
continue continue
s.unwrap() s.unwrap()
for div in body_tag.find_all("div"):
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
_add_span_to_save_ids_for_links(div)
div.unwrap()
for s in body_tag.find_all("section"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("article"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("body"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("html"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# check marks for chapter starting are on the same 1 level
marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
parents_marks_are_body = [x.parent == body_tag for x in marks]
# fix marks to be on 1 level
if not all(parents_marks_are_body):
for x in marks:
while x.parent != body_tag:
x.parent.unwrap() # warning! could reflect on formatting/internal links in some cases
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level.'
_preprocessing_headings(body_tag) _preprocessing_headings(body_tag)
for node in body_tag: for node in body_tag:
@@ -290,12 +312,12 @@ def unwrap_structural_tags(body_tag):
def get_tags_between_ids(first_id, href, html_soup): def get_tags_between_ids(first_id, href, html_soup):
h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'}) h_marked = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
if h_marked: if h_marked:
p = h_marked.next_sibling p = h_marked.next_sibling
tags = [] tags = []
while p: while p:
if p.name == 'h1' and p.attrs.get('class') == 'internal-mark': if p.name == 'tmp' and p.attrs.get('class') == 'converter-chapter-mark':
break break
tags.append(p) tags.append(p)
p = p.next_sibling p = p.next_sibling
@@ -330,7 +352,7 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr
preprocess_table(chapter_tag) preprocess_table(chapter_tag)
# 2. class removal # 2. class removal
for tag in chapter_tag.find_all(recursive=True): for tag in chapter_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class'): if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']):
del tag.attrs['class'] del tag.attrs['class']
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))