Updates to presets

This commit is contained in:
Kiryl
2022-06-21 11:47:26 +03:00
parent 73513e63b5
commit c62192d028
9 changed files with 668 additions and 739 deletions

View File

@@ -17,10 +17,12 @@ from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
from src.epub_converter.html_epub_preprocessor import process_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content
class EpubConverter:
@@ -57,26 +59,27 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log('Image processing.')
self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
self.logger.log('HTML files reading.')
self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
# TODO Presets
self.logger.log('Process CSS inline styles.')
self.logger.log("Process CSS inline styles.")
self.process_inline_styles_in_html_soup()
self.logger.log('CSS files processing.')
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.')
self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
self.logger.log('Footnotes processing.')
# todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup)
@@ -85,27 +88,28 @@ class EpubConverter:
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs['data-id'] = i + 1
noteref.attrs['id'] = f'footnote-{i + 1}'
footnote.attrs['href'] = f'#footnote-{i + 1}'
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
self.logger.log('TOC processing.')
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
if self.is_toc_empty():
self.build_adjacency_list_from_spine()
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.')
self.label_chapters_ids_with_tmp_id()
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
# used only after parsed toc, ids from toc needed
self.process_html_soup_structure_to_line()
self.process_internal_links()
self.logger.log(f'Building chapters content.')
self.logger.log(f"Define chapters content.")
self.define_chapters_content()
self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
@@ -115,7 +119,7 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content()
# html.parser closes tags if needed
soup = BeautifulSoup(html_body_text, features='html.parser')
soup = BeautifulSoup(html_body_text, features="html.parser")
nodes[item.file_name] = soup
return nodes
@@ -123,15 +127,15 @@ class EpubConverter:
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
join(html_folder, path_to_css_from_html)).replace("\\", "/")
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
# if in css file we import another css
if "@import" in str(css_obj.content):
    # The stylesheet re-exports another one: follow the @import target.
    # Search the decoded text (str() of bytes yields the b'...' repr, whose
    # own delimiters confuse quote-based patterns) and accept both quote
    # styles plus an optional url(...) wrapper.
    import_match = re.search(
        r"""@import\s+(?:url\(\s*)?["']([^"']+)["']""",
        css_obj.get_content().decode(errors="replace"))
    if import_match:
        # the "css/" prefix is kept from the original code
        # NOTE(review): assumes imported sheets live under css/ - confirm
        path_to_css_from_root = "css/" + import_match.group(1)
        css_obj = self.ebooklib_book.get_item_with_href(
            path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.'
assert css_obj, f"Css style {css_href} was not in manifest."
css_content: str = css_obj.get_content().decode()
return css_content
@@ -140,11 +144,11 @@ class EpubConverter:
for html_href in self.html_href2html_body_soup:
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')})
attrs={"style": re.compile(".*")})
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs['style']
tag_initial_inline_style.attrs['style'] = \
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
@@ -167,23 +171,23 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
html_href = item.file_name
soup_html_content = BeautifulSoup(html_content, features='lxml')
soup_html_content = BeautifulSoup(html_content, features="lxml")
# check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language)
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]):
continue
css_href = tag.attrs.get('href')
css_href = tag.attrs.get("href")
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all('style')):
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f'href{i}')
css_href2css_content[f'href{i}'] = build_css_file_content(
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
@@ -195,7 +199,7 @@ class EpubConverter:
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
css = ''
css = ""
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
@@ -243,7 +247,7 @@ class EpubConverter:
sub_nodes = []
for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
if ("section" in first.title.lower() or "part" in first.title.lower()) and lvl == 1:
self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl))
else:
@@ -267,7 +271,7 @@ class EpubConverter:
self.adjacency_list[-1] = nodes
else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}"
def is_toc_empty(self) -> bool:
"""Function checks is toc empty"""
@@ -297,36 +301,36 @@ class EpubConverter:
"""Function add files that not added to adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
Section(f"To check #{i}, filename: {file}", file))
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_lc_id(self):
    """
    Function inserts a marker tag before every element that starts a (sub)chapter

    For each html document in self.html_href2html_body_soup and each id listed
    for it in self.html_href2subchapter_ids, a <tmp class="converter-chapter-mark">
    tag carrying the same id is inserted right before the element with that id.
    Ids that cannot be found in the document are logged and skipped instead of
    failing with AttributeError on None.

    Returns
    -------
    None
        soups are modified in place

    """
    for html_href, soup in self.html_href2html_body_soup.items():
        for chapter_id in self.html_href2subchapter_ids[html_href]:
            tag = soup.find(id=chapter_id)
            if tag is None:
                # nothing to anchor the mark to in this document
                self.logger.log(
                    f"Warning in {html_href}: no tag with id={chapter_id} found, "
                    f"chapter mark was not added.")
                continue
            chapter_mark = soup.new_tag("tmp")
            chapter_mark.attrs["class"] = "converter-chapter-mark"
            chapter_mark.attrs["id"] = chapter_id
            tag.insert_before(chapter_mark)


# name used before the rename, kept so that existing callers keep working
label_chapters_ids_with_tmp_id = label_chapters_ids_with_lc_id
def process_html_soup_structure_to_line(self):
    """
    Function replaces every stored soup with the result of process_structural_tags

    Each value of self.html_href2html_body_soup is passed through
    process_structural_tags and the returned soup is stored back under the
    same href (the "go to line structure" step of the conversion).

    Returns
    -------
    None
        self.html_href2html_body_soup is updated in place

    """
    # only existing keys are reassigned, so iterating over items() is safe
    for html_href, soup in self.html_href2html_body_soup.items():
        self.html_href2html_body_soup[html_href] = process_structural_tags(soup)
@staticmethod
def create_unique_id(href: str, id_: str) -> str:
    """
    Function builds an id that is unique across all documents of the book

    Parameters
    ----------
    href: str
        path of the html document; every character that is not a word
        character or whitespace, plus "_" and "-", is removed
    id_: str
        id inside the document; "_" and "-" are replaced with "0"

    Returns
    -------
    str
        cleaned href followed by the rewritten id

    """
    cleaned_href = re.sub(r"([^\w\s])|_|-", "", href)
    cleaned_id = re.sub(r"[_-]", "0", id_)
    return cleaned_href + cleaned_id
@staticmethod
def create_new_anchor_span(soup, id_):
    """
    Function creates a span that serves as the target of an internal link

    Parameters
    ----------
    soup
        object whose new_tag() is used to create the span
    id_: str
        id assigned to the span

    Returns
    -------
    new_anchor_span
        <span id=id_ class="link-anchor"> whose text is one non-breaking space

    """
    new_anchor_span = soup.new_tag("span")
    new_anchor_span.attrs["id"] = id_
    new_anchor_span.attrs["class"] = "link-anchor"
    # the span's only content is a single non-breaking space
    new_anchor_span.string = "\xa0"
    return new_anchor_span
@@ -353,18 +357,18 @@ class EpubConverter:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(
dir_name, href_in_link)).replace('\\', '/')
dir_name, href_in_link)).replace("\\", "/")
full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.')
internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
f"While processing href in {internal_link_tag}.")
internal_link_tag.attrs["converter-mark"] = "bad-link"
return None
if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.')
self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}"
f" while {internal_link_tag} processing. The first one will be chosen.")
return full_path[0]
@@ -387,30 +391,30 @@ class EpubConverter:
"""
# 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
if tag.attrs.get('class') == 'converter-chapter-mark':
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get("class") == "converter-chapter-mark":
continue
if tag.attrs.get('class') == 'footnote-element':
if tag.attrs.get("class") == "footnote-element":
continue
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
# 2a. process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
r"(^(?!https?://).+\.(htm|html|xhtml)$)")
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href']
for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}):
a_tag_href = internal_link_tag.attrs["href"]
# find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
@@ -418,22 +422,22 @@ class EpubConverter:
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href']
del internal_link_tag.attrs["href"]
# 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
'#')
for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
"#")
# find full path
if a_tag_href:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag)
else:
a_tag_href_matched_to_toc = os.path.normpath(
toc_href).replace('\\', '/')
toc_href).replace("\\", "/")
if not a_tag_href_matched_to_toc:
continue
@@ -442,45 +446,45 @@ class EpubConverter:
a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
anchor_tags = anchor_soup.find_all(attrs={"id": new_id, })
anchor_tags = anchor_tags or anchor_soup.find_all(
attrs={'id': a_tag_id}) # if link is a footnote
attrs={"id": a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
f'{anchor_tags}\n'
f' While processing {internal_link_tag}')
self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n"
f"{anchor_tags}\n"
f" While processing {internal_link_tag}")
anchor_tag = anchor_tags[0]
assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
# if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors:
if anchor_tag.attrs["id"] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs['id']
del internal_link_tag.attrs['href']
del anchor_tag.attrs["id"]
del internal_link_tag.attrs["href"]
else:
internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}')
internal_link_tag.attrs["converter-mark"] = "bad-link"
self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found."
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
def build_one_chapter(self, nav_point: NavPoint):
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases:
id wraps all chapter content,
id wraps chapter's content + subchapters' content
id wraps chapter's content + subchapters' content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
and id of the next chapter/subchapter
Parameters
----------
@@ -496,7 +500,7 @@ class EpubConverter:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser')
new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags:
new_tree.append(tag)
self.href_chapter_id2soup_html[(
@@ -504,16 +508,30 @@ class EpubConverter:
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
self.build_one_chapter(sub_node)
self.detect_one_chapter(sub_node)
def define_chapters_content(self):
    """
    Function defines chapters content, starts from top level chapters

    Top level nav points are read from self.adjacency_list[-1]; each of them
    is passed to self.detect_one_chapter, but only when
    self.id_anchor_exist_in_nav_points is truthy.

    Returns
    -------
    None

    """
    top_level_nav_points = self.adjacency_list[-1]
    if not self.id_anchor_exist_in_nav_points:
        return
    for point in top_level_nav_points:
        self.detect_one_chapter(point)
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
Function prepares styles and tags for the json structure
Parameters
----------
nav_point: NavPoint
lvl: int
level of chapter
Returns
-------
ChapterItem
built chapter
"""
title = nav_point.title
if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(
@@ -526,7 +544,7 @@ class EpubConverter:
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file_path.stem
if hasattr(self.file_path, 'stem') else 'book_id')
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
@@ -534,15 +552,16 @@ class EpubConverter:
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! items that are not EpubHtmlItems won't be added to a chapter
# if it doesn't have subchapters
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node_to_livecarta_chapter_item(
sub_chapter_item = self.html_node_to_livecarta_chapter_item(
sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item)
if self.logger:
indent = ' ' * lvl
self.logger.log(f'{indent}Chapter: {title} is prepared.')
indent = " " * lvl
self.logger.log(f"{indent}Chapter: {title} is prepared.")
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self) -> dict:
@@ -550,12 +569,13 @@ class EpubConverter:
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
for nav_point in top_level_nav_points:
chapter = self.node_to_livecarta_chapter_item(nav_point)
# loop through top-level chapters
for tl_nav_point in top_level_nav_points:
chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point)
top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
self.logger.log('End conversion.')
self.logger.log(f"Anchors found: {len(self.internal_anchors)}.")
self.logger.log("End conversion.")
return {
"content": top_level_dict_chapters,
@@ -564,12 +584,12 @@ class EpubConverter:
if __name__ == "__main__":
    # Manual run: convert one epub and dump the resulting dict as json.
    epub_file_path = "../../epub/9781614382264.epub"
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])

    json_converter = EpubConverter(epub_file_path, logger=logger_object)
    content_dict = json_converter.convert_to_dict()

    # replace() rewrites every "epub" occurrence, so both the directory and
    # the extension change: ../../epub/<name>.epub -> ../../json/<name>.json
    with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
        json.dump(content_dict, f_json, ensure_ascii=False)