forked from LiveCarta/BookConverter
Annotations in Epub converter
This commit is contained in:
@@ -9,8 +9,8 @@ from pathlib import Path
|
||||
from itertools import chain
|
||||
from premailer import transform
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Union, List
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from typing import List, Tuple, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.epub_converter.css_processor import CSSPreprocessor
|
||||
@@ -39,7 +39,8 @@ class EpubConverter:
|
||||
|
||||
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||
# key = -1 for top level NavPoints
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1],
|
||||
Union[List[NavPoint], None]] = {}
|
||||
|
||||
# list to offset Chapter_i on 1st level
|
||||
self.offset_sub_nodes = []
|
||||
@@ -70,7 +71,8 @@ class EpubConverter:
|
||||
BeautifulSoup] = self.build_href2soup_content()
|
||||
|
||||
self.logger.log("CSS inline style processing.")
|
||||
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
|
||||
self.css_processor.process_inline_styles_in_html_soup(
|
||||
self.html_href2html_body_soup)
|
||||
self.logger.log("CSS files processing.")
|
||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||
self.logger.log("CSS styles fusion(inline+file).")
|
||||
@@ -107,7 +109,6 @@ class EpubConverter:
|
||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||
# using EpubElements
|
||||
# for now just for HTML objects, as it is the simplest chapter
|
||||
|
||||
nodes = dict()
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
html_body_text = item.get_body_content()
|
||||
@@ -116,7 +117,7 @@ class EpubConverter:
|
||||
nodes[item.file_name] = soup
|
||||
return nodes
|
||||
|
||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||
def build_html_and_css_relations(self) -> Tuple[Dict[str, List[str]], Dict[str, str]]:
|
||||
"""
|
||||
Function is designed to get 2 dictionaries:
|
||||
The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them
|
||||
@@ -130,8 +131,8 @@ class EpubConverter:
|
||||
|
||||
"""
|
||||
# dictionary: href of html to related css files
|
||||
html_href2css_href: defaultdict = defaultdict(list)
|
||||
css_href2css_content: dict = {}
|
||||
html_href2css_href: Dict[str, List[str]] = defaultdict(list)
|
||||
css_href2css_content: Dict[str, str] = {}
|
||||
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
html_content = item.content
|
||||
@@ -213,7 +214,9 @@ class EpubConverter:
|
||||
html_content, css)
|
||||
self.html_href2html_body_soup[html_href] = html_content
|
||||
|
||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||
def build_adjacency_list_from_toc(self,
|
||||
element: Union[Link, Tuple[Section, List], List[Union[Link, Tuple]]],
|
||||
lvl: int = 0) -> NavPoint:
|
||||
"""
|
||||
Function
|
||||
self.adjacency_list is built from the nested TOC structure obtained from self.ebooklib.toc
|
||||
@@ -304,7 +307,7 @@ class EpubConverter:
|
||||
self.adjacency_list[-1].append(nav_point)
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
|
||||
def add_not_added_files_to_adjacency_list(self, not_added: list):
|
||||
def add_not_added_files_to_adjacency_list(self, not_added: List[str]):
|
||||
"""Function add files that not added to adjacency list"""
|
||||
for i, file in enumerate(not_added):
|
||||
nav_point = NavPoint(
|
||||
@@ -315,7 +318,7 @@ class EpubConverter:
|
||||
def label_subchapters_with_lc_tag(self):
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
ids, soup = self.html_href2subchapters_ids[html_href], \
|
||||
self.html_href2html_body_soup[html_href]
|
||||
self.html_href2html_body_soup[html_href]
|
||||
for i in ids:
|
||||
tag = soup.find(id=i)
|
||||
tmp_tag = soup.new_tag("lc_tmp")
|
||||
@@ -345,10 +348,13 @@ class EpubConverter:
|
||||
mark.parent.unwrap()
|
||||
|
||||
@staticmethod
|
||||
def create_unique_id(href, id_):
|
||||
def create_unique_id(href: str, id_: str) -> str:
|
||||
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
|
||||
|
||||
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
|
||||
def match_href_to_path_from_toc(self,
|
||||
cur_file_path: str,
|
||||
href_in_link: str,
|
||||
internal_link_tag: Tag) -> Union[None, str]:
|
||||
"""
|
||||
Function used to find the full path to the file referenced by the href parsed from a link tag
|
||||
TOC: a/b/c.xhtml
|
||||
@@ -387,7 +393,7 @@ class EpubConverter:
|
||||
return full_path[0]
|
||||
|
||||
@staticmethod
|
||||
def create_new_anchor_span(soup, id_):
|
||||
def create_new_anchor_span(soup: BeautifulSoup, id_: str) -> Tag:
|
||||
new_anchor_span = soup.new_tag("span")
|
||||
new_anchor_span.attrs["id"] = id_
|
||||
new_anchor_span.attrs["class"] = "link-anchor"
|
||||
@@ -415,7 +421,8 @@ class EpubConverter:
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
|
||||
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
|
||||
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
|
||||
new_id = self.create_unique_id(
|
||||
toc_href, tag.attrs["id"])
|
||||
tag.attrs["id"] = new_id
|
||||
|
||||
def process_file_anchor():
|
||||
@@ -427,11 +434,13 @@ class EpubConverter:
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
||||
toc_href, a_tag_href, internal_link_tag)
|
||||
if a_tag_href_matched_to_toc:
|
||||
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
|
||||
new_id = self.create_unique_id(
|
||||
a_tag_href_matched_to_toc, "")
|
||||
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
||||
if new_id not in self.internal_anchors:
|
||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||
new_anchor_span = self.create_new_anchor_span(
|
||||
soup, new_id)
|
||||
# insert a new span to the beginning of the file
|
||||
anchor_soup.insert(0, new_anchor_span)
|
||||
self.internal_anchors.add(new_id)
|
||||
@@ -442,7 +451,8 @@ class EpubConverter:
|
||||
soup = self.html_href2html_body_soup[toc_href]
|
||||
# process_file_element_anchor
|
||||
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
|
||||
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
|
||||
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
|
||||
"#")
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
||||
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
|
||||
else path.normpath(toc_href).replace("\\", "/")
|
||||
@@ -452,7 +462,8 @@ class EpubConverter:
|
||||
|
||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
|
||||
anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
|
||||
anchor_soup.find_all(
|
||||
attrs={"id": a_tag_id}) # if link is a footnote
|
||||
if anchor_tags:
|
||||
if len(anchor_tags) > 1:
|
||||
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
|
||||
@@ -487,7 +498,9 @@ class EpubConverter:
|
||||
process_file_element_anchor()
|
||||
|
||||
@staticmethod
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
def get_tags_between_chapter_marks(first_id: str,
|
||||
href: str,
|
||||
html_soup: BeautifulSoup) -> List[Union[Tag, NavigableString]]:
|
||||
"""
|
||||
Get tags between LiveCarta chapter marks
|
||||
Parameters
|
||||
@@ -568,7 +581,7 @@ class EpubConverter:
|
||||
for tl_nav_point in top_level_nav_points:
|
||||
self.detect_one_chapter(tl_nav_point)
|
||||
|
||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl: int = 1) -> ChapterItem:
|
||||
"""
|
||||
Function prepares the style and tags for the json structure
|
||||
Parameters
|
||||
@@ -584,18 +597,18 @@ class EpubConverter:
|
||||
built chapter
|
||||
|
||||
"""
|
||||
title = nav_point.title
|
||||
title: str = nav_point.title
|
||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
|
||||
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
|
||||
|
||||
indent = " " * lvl
|
||||
indent: str = " " * lvl
|
||||
self.logger.log(indent + f"Chapter: {title} is processing.")
|
||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
self.logger.log(indent + "Process title.")
|
||||
title_preprocessed = self.html_processor.prepare_title(title)
|
||||
title_preprocessed: str = self.html_processor.prepare_title(title)
|
||||
self.logger.log(indent + "Process content.")
|
||||
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content(
|
||||
title_preprocessed, content, remove_title_from_chapter=is_chapter)
|
||||
|
||||
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
|
||||
self.img_href2img_bytes,
|
||||
@@ -613,7 +626,7 @@ class EpubConverter:
|
||||
sub_nodes.append(sub_chapter_item)
|
||||
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
|
||||
|
||||
def convert_to_dict(self) -> dict:
|
||||
def convert_to_dict(self) -> Dict[str, List[Dict[str, Union[List, str]]]]:
|
||||
"""Function which convert list of html nodes to appropriate json structure"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
top_level_chapters = []
|
||||
@@ -633,7 +646,7 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../books/epub/9780763774134.epub"
|
||||
epub_file_path = "../../books/epub/9781119646044.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user