forked from LiveCarta/BookConverter
Wrote documentation for every func/class in .py
This commit is contained in:
@@ -20,7 +20,7 @@ from src.livecarta_config import LiveCartaConfig
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
|
||||
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
|
||||
update_src_links_in_images, preprocess_footnotes
|
||||
update_images_src_links, preprocess_footnotes
|
||||
|
||||
|
||||
class EpubConverter:
|
||||
@@ -48,7 +48,7 @@ class EpubConverter:
|
||||
# flag to be updated while ebooklib.toc is parsed
|
||||
self.id_anchor_exist_in_nav_points = False
|
||||
self.img_href2img_bytes = {} # file path to bytes
|
||||
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
|
||||
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
|
||||
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||
self.noterefs: List[Tag] = [] # start of the footnote
|
||||
self.footnotes: List[Tag] = [] # end of the footnote
|
||||
@@ -124,12 +124,12 @@ class EpubConverter:
|
||||
return css_content
|
||||
|
||||
def build_html_and_css_relations(self):
|
||||
'''
|
||||
"""
|
||||
This function is designed to get 2 dictionaries:
|
||||
The first is css_href2css_content. It is created to connect href of css to content of css
|
||||
The second is html_href2css_href. It is created to connect the href of an html file to the css files (by their hrefs) which are used in this html
|
||||
...2... = key2value
|
||||
'''
|
||||
"""
|
||||
|
||||
# dictionary: href of html to related css files
|
||||
html_href2css_href: defaultdict = defaultdict(list)
|
||||
@@ -159,10 +159,10 @@ class EpubConverter:
|
||||
return html_href2css_href, css_href2css_content,
|
||||
|
||||
def add_css_styles_to_html_soup(self):
|
||||
'''
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
And add to html_inline_style css_style_content
|
||||
'''
|
||||
"""
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
if self.html_href2css_href.get(html_href):
|
||||
css = ''
|
||||
@@ -179,6 +179,7 @@ class EpubConverter:
|
||||
|
||||
return links
|
||||
|
||||
# t_nodes = []
|
||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
||||
"""
|
||||
self.adjacency_list is built from the nested TOC structure obtained from self.ebooklib.toc
|
||||
@@ -211,25 +212,31 @@ class EpubConverter:
|
||||
|
||||
sub_nodes = []
|
||||
for i in second:
|
||||
# if 'chapter' in (i.title.lower() if isinstance(i, Link) else i[0].title.lower()):
|
||||
# self.t_nodes.append(self.build_adjacency_list_from_toc(i, lvl))
|
||||
# else:
|
||||
sub_nodes.append(
|
||||
self.build_adjacency_list_from_toc(i, lvl + 1))
|
||||
|
||||
self.adjacency_list[nav_point] = sub_nodes
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
return nav_point
|
||||
|
||||
elif isinstance(element, list) and (lvl == 0):
|
||||
sub_nodes = []
|
||||
nodes = []
|
||||
for i in element:
|
||||
sub_nodes.append(
|
||||
nodes.append(
|
||||
self.build_adjacency_list_from_toc(i, lvl + 1))
|
||||
|
||||
self.adjacency_list[-1] = sub_nodes
|
||||
# for j in self.t_nodes:
|
||||
# nodes.append(j)
|
||||
# self.t_nodes = []
|
||||
#
|
||||
# self.adjacency_list[-1] = nodes
|
||||
|
||||
else:
|
||||
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
|
||||
|
||||
def is_toc_empty(self):
|
||||
""" Function checks whether the toc is empty """
|
||||
# there is no toc in ebook or no top chapters
|
||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||
return True
|
||||
@@ -247,6 +254,7 @@ class EpubConverter:
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
|
||||
def add_not_added_files_to_adjacency_list(self, not_added):
|
||||
""" Function adds files that were not added to the adjacency list """
|
||||
for i, file in enumerate(not_added):
|
||||
nav_point = NavPoint(
|
||||
Section(f'To check #{i}, filename: {file}', file))
|
||||
@@ -315,6 +323,11 @@ class EpubConverter:
|
||||
return full_path[0]
|
||||
|
||||
def process_internal_links(self):
|
||||
"""
|
||||
Function that
|
||||
- processes internal links in a book
|
||||
- makes ids unique
|
||||
"""
|
||||
# 1. rebuild ids to be unique in all documents
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
|
||||
@@ -429,6 +442,7 @@ class EpubConverter:
|
||||
self.build_one_chapter(sub_node)
|
||||
|
||||
def define_chapters_content(self):
|
||||
""" Function builds chapters content, starting from the top level chapters """
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
for point in top_level_nav_points:
|
||||
@@ -441,12 +455,12 @@ class EpubConverter:
|
||||
nav_point.href, nav_point.id)]
|
||||
else:
|
||||
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
|
||||
self.old_image_path2aws_path = update_src_links_in_images(content,
|
||||
self.img_href2img_bytes,
|
||||
path_to_html=nav_point.href,
|
||||
access=self.access,
|
||||
path2aws_path=self.old_image_path2aws_path,
|
||||
book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
|
||||
self.book_image_src_path2aws_path = update_images_src_links(content,
|
||||
self.img_href2img_bytes,
|
||||
path_to_html=nav_point.href,
|
||||
access=self.access,
|
||||
path2aws_path=self.book_image_src_path2aws_path,
|
||||
book_id=self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
|
||||
|
||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
title_preprocessed = prepare_title(title)
|
||||
@@ -466,6 +480,7 @@ class EpubConverter:
|
||||
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
||||
|
||||
def convert_to_dict(self):
|
||||
""" Function which converts the list of html nodes to the appropriate json structure. """
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
top_level_chapters = []
|
||||
|
||||
@@ -491,7 +506,7 @@ if __name__ == "__main__":
|
||||
|
||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||
|
||||
json_converter = EpubConverter('../../epub/9781641051217.epub',
|
||||
json_converter = EpubConverter('../../epub/9781614382263.epub',
|
||||
logger=logger_object)
|
||||
tmp = json_converter.convert_to_dict()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user