Function annotations

This commit is contained in:
Kiryl
2022-04-29 17:44:07 +03:00
parent 8de1d0d042
commit 37533e9b67
5 changed files with 187 additions and 130 deletions

View File

@@ -1,7 +1,6 @@
import re
import json
import codecs
import logging
import os
from os.path import dirname, normpath, join
from itertools import chain
@@ -51,7 +50,8 @@ class EpubConverter:
# flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
# file path from <a> to generated aws path
self.book_image_src_path2aws_path = {}
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
@@ -116,7 +116,6 @@ class EpubConverter:
return nodes
def get_css_content(self, css_href, html_href):
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(
@@ -132,8 +131,8 @@ class EpubConverter:
The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value
"""
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
@@ -165,6 +164,7 @@ class EpubConverter:
"""
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
@@ -191,8 +191,8 @@ class EpubConverter:
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
:param lvl: level of depth
"""
"""
if isinstance(element, Link):
nav_point = NavPoint(element)
if nav_point.id:
@@ -215,7 +215,8 @@ class EpubConverter:
sub_nodes = []
for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl))
self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl))
else:
sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl + 1))
@@ -239,8 +240,8 @@ class EpubConverter:
else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self):
""" Function checks is toc empty """
def is_toc_empty(self) -> bool:
"""Function checks whether toc is empty"""
# there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True
@@ -258,7 +259,7 @@ class EpubConverter:
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
""" Function add files that not added to adjacency list """
"""Function add files that not added to adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
@@ -295,19 +296,26 @@ class EpubConverter:
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
"""
Function used to find full path to file that is parsed from tag link
TOC: a/b/c.xhtml
b/c.xhtml -> a/b/c.xhtml
c.xhtml -> a/b/c.xhtml
Parameters
----------
cur_file_path: str
path to current file with tag link
href_in_link: str
filename got from tag link, like file1.xhtml
internal_link_tag: Tag
tag object that is parsed now
Used to find full path to file that is parsed from tag link
Returns
-------
full_path[0]: str
prepared content
:param cur_file_path: path to current file with tag link
:param href_in_link: filename got from tag link, like file1.xhtml
:param internal_link_tag: tag object that is parsed now
:return:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(
@@ -331,6 +339,12 @@ class EpubConverter:
Function
- processing internal links in a book
- make ids unique
Steps
----------
1. rebuild ids to be unique in all documents
2a. process anchor which is a whole xhtml file
2b. process anchor which is an element in xhtml file
"""
# 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc:
@@ -344,7 +358,7 @@ class EpubConverter:
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id
# 2.a) process anchor which is a whole xhtml file
# 2a. process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
for toc_href in self.hrefs_added_to_toc:
@@ -367,7 +381,7 @@ class EpubConverter:
del internal_link_tag.attrs['href']
# 2.b) process anchor which is an element in xhtml file
# 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
@@ -418,9 +432,9 @@ class EpubConverter:
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}')
def build_one_chapter(self, nav_point):
def build_one_chapter(self, nav_point: NavPoint):
"""
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases:
id wraps all chapter content,
@@ -429,7 +443,13 @@ class EpubConverter:
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
and id of the next chapter/subchapter
Parameters
----------
nav_point: NavPoint
Returns
-------
None
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
@@ -446,7 +466,7 @@ class EpubConverter:
self.build_one_chapter(sub_node)
def define_chapters_content(self):
""" Function build chapters content starts from top level chapters """
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
@@ -483,8 +503,8 @@ class EpubConverter:
self.logger.log(f'{indent}Chapter: {title} is prepared.')
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self):
""" Function which convert list of html nodes to appropriate json structure. """
def convert_to_dict(self) -> dict:
"""Function which converts list of html nodes to appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -502,7 +522,7 @@ class EpubConverter:
if __name__ == "__main__":
filename = '9781641051217'
filename = '9781614382264'
logger_object = BookLogger(name='epub', book_id=filename)
json_converter = EpubConverter(f'../../epub/{filename}.epub',