Wrote documentation for every function/class in the .py files

This commit is contained in:
Kiryl
2021-12-10 10:53:40 +03:00
parent ef3502cd0a
commit 4b1109e6b4
13 changed files with 198 additions and 172 deletions

View File

@@ -20,7 +20,7 @@ from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
update_src_links_in_images, preprocess_footnotes
update_images_src_links, preprocess_footnotes
class EpubConverter:
@@ -48,7 +48,7 @@ class EpubConverter:
# flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
@@ -124,12 +124,12 @@ class EpubConverter:
return css_content
def build_html_and_css_relations(self):
'''
"""
This function is designed to get 2 dictionaries:
The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value
'''
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
@@ -159,10 +159,10 @@ class EpubConverter:
return html_href2css_href, css_href2css_content,
def add_css_styles_to_html_soup(self):
'''
"""
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
'''
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
css = ''
@@ -179,6 +179,7 @@ class EpubConverter:
return links
# t_nodes = []
def build_adjacency_list_from_toc(self, element, lvl=0):
"""
self.adjacency_list is built from the nested TOC structure obtained from self.ebooklib_book.toc
@@ -211,25 +212,31 @@ class EpubConverter:
sub_nodes = []
for i in second:
# if 'chapter' in (i.title.lower() if isinstance(i, Link) else i[0].title.lower()):
# self.t_nodes.append(self.build_adjacency_list_from_toc(i, lvl))
# else:
sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[nav_point] = sub_nodes
self.hrefs_added_to_toc.add(nav_point.href)
return nav_point
elif isinstance(element, list) and (lvl == 0):
sub_nodes = []
nodes = []
for i in element:
sub_nodes.append(
nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[-1] = sub_nodes
# for j in self.t_nodes:
# nodes.append(j)
# self.t_nodes = []
#
# self.adjacency_list[-1] = nodes
else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self):
""" Function checks whether the TOC is empty """
# there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True
@@ -247,6 +254,7 @@ class EpubConverter:
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
""" Function adds files that were not added to the adjacency list """
for i, file in enumerate(not_added):
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
@@ -315,6 +323,11 @@ class EpubConverter:
return full_path[0]
def process_internal_links(self):
"""
Function
- processes internal links in a book
- makes ids unique
"""
# 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
@@ -429,6 +442,7 @@ class EpubConverter:
self.build_one_chapter(sub_node)
def define_chapters_content(self):
""" Function builds chapters content, starting from the top-level chapters """
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
@@ -441,12 +455,12 @@ class EpubConverter:
nav_point.href, nav_point.id)]
else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.old_image_path2aws_path = update_src_links_in_images(content,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.old_image_path2aws_path,
book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
self.book_image_src_path2aws_path = update_images_src_links(content,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
@@ -466,6 +480,7 @@ class EpubConverter:
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self):
""" Function which converts the list of html nodes to an appropriate json structure. """
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -491,7 +506,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../../epub/9781641051217.epub',
json_converter = EpubConverter('../../epub/9781614382263.epub',
logger=logger_object)
tmp = json_converter.convert_to_dict()