forked from LiveCarta/BookConverter
Function annotations
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
import re
|
||||
import json
|
||||
import codecs
|
||||
import logging
|
||||
import os
|
||||
from os.path import dirname, normpath, join
|
||||
from itertools import chain
|
||||
@@ -51,7 +50,8 @@ class EpubConverter:
|
||||
# flag to be updated while ebooklib.toc is parsed
|
||||
self.id_anchor_exist_in_nav_points = False
|
||||
self.img_href2img_bytes = {} # file path to bytes
|
||||
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
|
||||
# file path from <a> to generated aws path
|
||||
self.book_image_src_path2aws_path = {}
|
||||
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||
self.noterefs: List[Tag] = [] # start of the footnote
|
||||
self.footnotes: List[Tag] = [] # end of the footnote
|
||||
@@ -116,7 +116,6 @@ class EpubConverter:
|
||||
return nodes
|
||||
|
||||
def get_css_content(self, css_href, html_href):
|
||||
|
||||
path_to_css_from_html = css_href
|
||||
html_folder = dirname(html_href)
|
||||
path_to_css_from_root = normpath(
|
||||
@@ -132,8 +131,8 @@ class EpubConverter:
|
||||
The first is css_href2css_content. It is created to connect href of css to content of css
|
||||
The second is html_href2css_href. It is created to connect the href of an html file to the css files (their hrefs) which are used in this html
|
||||
...2... = key2value
|
||||
"""
|
||||
|
||||
"""
|
||||
# dictionary: href of html to related css files
|
||||
html_href2css_href: defaultdict = defaultdict(list)
|
||||
css_href2css_content: dict = {}
|
||||
@@ -165,6 +164,7 @@ class EpubConverter:
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
And add to html_inline_style css_style_content
|
||||
|
||||
"""
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
if self.html_href2css_href.get(html_href):
|
||||
@@ -191,8 +191,8 @@ class EpubConverter:
|
||||
|
||||
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
|
||||
:param lvl: level of depth
|
||||
"""
|
||||
|
||||
"""
|
||||
if isinstance(element, Link):
|
||||
nav_point = NavPoint(element)
|
||||
if nav_point.id:
|
||||
@@ -215,7 +215,8 @@ class EpubConverter:
|
||||
sub_nodes = []
|
||||
for elem in second:
|
||||
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
|
||||
self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl))
|
||||
self.offset_sub_nodes.append(
|
||||
self.build_adjacency_list_from_toc(elem, lvl))
|
||||
else:
|
||||
sub_nodes.append(
|
||||
self.build_adjacency_list_from_toc(elem, lvl + 1))
|
||||
@@ -239,8 +240,8 @@ class EpubConverter:
|
||||
else:
|
||||
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
|
||||
|
||||
def is_toc_empty(self):
|
||||
""" Function checks is toc empty """
|
||||
def is_toc_empty(self) -> bool:
|
||||
"""Function checks is toc empty"""
|
||||
# there is no toc in ebook or no top chapters
|
||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||
return True
|
||||
@@ -258,7 +259,7 @@ class EpubConverter:
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
|
||||
def add_not_added_files_to_adjacency_list(self, not_added):
|
||||
""" Function add files that not added to adjacency list """
|
||||
"""Function add files that not added to adjacency list"""
|
||||
for i, file in enumerate(not_added):
|
||||
nav_point = NavPoint(
|
||||
Section(f'To check #{i}, filename: {file}', file))
|
||||
@@ -295,19 +296,26 @@ class EpubConverter:
|
||||
new_anchor_span.string = "\xa0"
|
||||
return new_anchor_span
|
||||
|
||||
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
|
||||
"""
|
||||
Function used to find full path to file that is parsed from tag link
|
||||
TOC: a/b/c.xhtml
|
||||
|
||||
b/c.xhtml -> a/b/c.xhtml
|
||||
c.xhtml -> a/b/c.xhtml
|
||||
Parameters
|
||||
----------
|
||||
cur_file_path: str
|
||||
path to current file with tag link
|
||||
href_in_link: str
|
||||
filename got from tag link, like file1.xhtml
|
||||
internal_link_tag: Tag
|
||||
tag object that is parsed now
|
||||
|
||||
Used to find full path to file that is parsed from tag link
|
||||
Returns
|
||||
-------
|
||||
full_path[0]: str
|
||||
prepared content
|
||||
|
||||
:param cur_file_path: path to current file with tag link
|
||||
:param href_in_link: filename got from tag link, like file1.xhtml
|
||||
:param internal_link_tag: tag object that is parsed now
|
||||
:return:
|
||||
"""
|
||||
dir_name = os.path.dirname(cur_file_path)
|
||||
normed_path = os.path.normpath(os.path.join(
|
||||
@@ -331,6 +339,12 @@ class EpubConverter:
|
||||
Function
|
||||
- processing internal links in a book
|
||||
- make ids unique
|
||||
Steps
|
||||
----------
|
||||
1. rebuild ids to be unique in all documents
|
||||
2a. process anchor which is a whole xhtml file
|
||||
2b. process anchor which is an element in xhtml file
|
||||
|
||||
"""
|
||||
# 1. rebuild ids to be unique in all documents
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
@@ -344,7 +358,7 @@ class EpubConverter:
|
||||
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
|
||||
tag.attrs['id'] = new_id
|
||||
|
||||
# 2.a) process anchor which is a whole xhtml file
|
||||
# 2a. process anchor which is a whole xhtml file
|
||||
internal_link_reg1 = re.compile(
|
||||
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
@@ -367,7 +381,7 @@ class EpubConverter:
|
||||
|
||||
del internal_link_tag.attrs['href']
|
||||
|
||||
# 2.b) process anchor which is an element in xhtml file
|
||||
# 2b. process anchor which is an element in xhtml file
|
||||
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
soup = self.html_href2html_body_soup[toc_href]
|
||||
@@ -418,9 +432,9 @@ class EpubConverter:
|
||||
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
||||
f' Old id={a_tag_id}')
|
||||
|
||||
def build_one_chapter(self, nav_point):
|
||||
def build_one_chapter(self, nav_point: NavPoint):
|
||||
"""
|
||||
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||
|
||||
3 cases:
|
||||
id wraps all chapter content,
|
||||
@@ -429,7 +443,13 @@ class EpubConverter:
|
||||
|
||||
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
|
||||
and id of the next chapter/subchapter
|
||||
Parameters
|
||||
----------
|
||||
nav_point: NavPoint
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
"""
|
||||
if nav_point.id:
|
||||
soup = self.html_href2html_body_soup[nav_point.href]
|
||||
@@ -446,7 +466,7 @@ class EpubConverter:
|
||||
self.build_one_chapter(sub_node)
|
||||
|
||||
def define_chapters_content(self):
|
||||
""" Function build chapters content starts from top level chapters """
|
||||
"""Function build chapters content, starts from top level chapters"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
for point in top_level_nav_points:
|
||||
@@ -483,8 +503,8 @@ class EpubConverter:
|
||||
self.logger.log(f'{indent}Chapter: {title} is prepared.')
|
||||
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
||||
|
||||
def convert_to_dict(self):
|
||||
""" Function which convert list of html nodes to appropriate json structure. """
|
||||
def convert_to_dict(self) -> dict:
|
||||
"""Function which convert list of html nodes to appropriate json structure"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
top_level_chapters = []
|
||||
|
||||
@@ -502,7 +522,7 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
filename = '9781641051217'
|
||||
filename = '9781614382264'
|
||||
logger_object = BookLogger(name='epub', book_id=filename)
|
||||
|
||||
json_converter = EpubConverter(f'../../epub/{filename}.epub',
|
||||
|
||||
Reference in New Issue
Block a user