Annotations in Epub converter

This commit is contained in:
Kiryl
2022-08-03 14:39:13 +03:00
parent 7453029295
commit 78e3ad8911
16 changed files with 259 additions and 192 deletions

View File

@@ -9,8 +9,8 @@ from pathlib import Path
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
from bs4 import BeautifulSoup, NavigableString, Tag
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.epub_converter.css_processor import CSSPreprocessor
@@ -39,7 +39,8 @@ class EpubConverter:
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
self.adjacency_list: Dict[Union[NavPoint, -1],
Union[List[NavPoint], None]] = {}
# list to offset Chapter_i on 1st level
self.offset_sub_nodes = []
@@ -70,7 +71,8 @@ class EpubConverter:
BeautifulSoup] = self.build_href2soup_content()
self.logger.log("CSS inline style processing.")
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
self.css_processor.process_inline_styles_in_html_soup(
self.html_href2html_body_soup)
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles fusion(inline+file).")
@@ -107,7 +109,6 @@ class EpubConverter:
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
# for now just for HTML objects, as it is the simplest chapter
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content()
@@ -116,7 +117,7 @@ class EpubConverter:
nodes[item.file_name] = soup
return nodes
def build_html_and_css_relations(self) -> tuple[dict, dict]:
def build_html_and_css_relations(self) -> Tuple[Dict[str, List[str]], Dict[str, str]]:
"""
Function is designed to get 2 dictionaries:
The first is html_href2css_href. It is created to connect the href of an html file to css files (hrefs of them
@@ -130,8 +131,8 @@ class EpubConverter:
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
html_href2css_href: Dict[str, List[str]] = defaultdict(list)
css_href2css_content: Dict[str, str] = {}
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
@@ -213,7 +214,9 @@ class EpubConverter:
html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
def build_adjacency_list_from_toc(self,
element: Union[Link, Tuple[Section, List], List[Union[Link, Tuple]]],
lvl: int = 0) -> NavPoint:
"""
Function
self.adjacency_list is built based on the nested TOC structure obtained from self.ebooklib.toc
@@ -304,7 +307,7 @@ class EpubConverter:
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added: list):
def add_not_added_files_to_adjacency_list(self, not_added: List[str]):
"""Function adds files that were not added to the adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
@@ -315,7 +318,7 @@ class EpubConverter:
def label_subchapters_with_lc_tag(self):
for html_href in self.html_href2html_body_soup:
ids, soup = self.html_href2subchapters_ids[html_href], \
self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href]
for i in ids:
tag = soup.find(id=i)
tmp_tag = soup.new_tag("lc_tmp")
@@ -345,10 +348,13 @@ class EpubConverter:
mark.parent.unwrap()
@staticmethod
def create_unique_id(href, id_):
def create_unique_id(href: str, id_: str) -> str:
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
def match_href_to_path_from_toc(self,
cur_file_path: str,
href_in_link: str,
internal_link_tag: Tag) -> Union[None, str]:
"""
Function used to find the full path to the file that is parsed from the link tag
TOC: a/b/c.xhtml
@@ -387,7 +393,7 @@ class EpubConverter:
return full_path[0]
@staticmethod
def create_new_anchor_span(soup, id_):
def create_new_anchor_span(soup: BeautifulSoup, id_: str) -> Tag:
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
@@ -415,7 +421,8 @@ class EpubConverter:
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
new_id = self.create_unique_id(
toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
def process_file_anchor():
@@ -427,11 +434,13 @@ class EpubConverter:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
@@ -442,7 +451,8 @@ class EpubConverter:
soup = self.html_href2html_body_soup[toc_href]
# process_file_element_anchor
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
"#")
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
else path.normpath(toc_href).replace("\\", "/")
@@ -452,7 +462,8 @@ class EpubConverter:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
anchor_soup.find_all(
attrs={"id": a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
@@ -487,7 +498,9 @@ class EpubConverter:
process_file_element_anchor()
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
def get_tags_between_chapter_marks(first_id: str,
href: str,
html_soup: BeautifulSoup) -> List[Union[Tag, NavigableString]]:
"""
Get tags between LiveCarta chapter marks
Parameters
@@ -568,7 +581,7 @@ class EpubConverter:
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl: int = 1) -> ChapterItem:
"""
Function prepare style, tags to json structure
Parameters
@@ -584,18 +597,18 @@ class EpubConverter:
built chapter
"""
title = nav_point.title
title: str = nav_point.title
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
indent = " " * lvl
indent: str = " " * lvl
self.logger.log(indent + f"Chapter: {title} is processing.")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
self.logger.log(indent + "Process title.")
title_preprocessed = self.html_processor.prepare_title(title)
title_preprocessed: str = self.html_processor.prepare_title(title)
self.logger.log(indent + "Process content.")
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content(
title_preprocessed, content, remove_title_from_chapter=is_chapter)
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
self.img_href2img_bytes,
@@ -613,7 +626,7 @@ class EpubConverter:
sub_nodes.append(sub_chapter_item)
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
def convert_to_dict(self) -> dict:
def convert_to_dict(self) -> Dict[str, List[Dict[str, Union[List, str]]]]:
"""Function which converts the list of html nodes to the appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -633,7 +646,7 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../books/epub/9780763774134.epub"
epub_file_path = "../../books/epub/9781119646044.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])