Add processor (no stem in file)

This commit is contained in:
Kiryl
2021-12-01 16:08:19 +03:00
parent ad6be84c4b
commit ef3502cd0a
2 changed files with 16 additions and 10 deletions

View File

@@ -30,9 +30,11 @@ class EpubConverter:
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file)
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.html_href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
self.hrefs_added_to_toc = set() # enumerate all file paths that were added to TOC
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
# enumerate all subchapter id for each file
self.html_href2subchapter_ids = defaultdict(list)
self.hrefs_added_to_toc = set() # enumerate all file paths that were added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
@@ -43,7 +45,8 @@ class EpubConverter:
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.internal_anchors = set()
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
# flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is
@@ -191,7 +194,8 @@ class EpubConverter:
nav_point = NavPoint(element)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
self.html_href2subchapter_ids[nav_point.href].append(
nav_point.id)
self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href)
return nav_point
@@ -202,7 +206,8 @@ class EpubConverter:
nav_point = NavPoint(first)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
self.html_href2subchapter_ids[nav_point.href].append(
nav_point.id)
sub_nodes = []
for i in second:
@@ -263,7 +268,8 @@ class EpubConverter:
# go to line structure
for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(
soup)
@staticmethod
def create_unique_id(href, id_):
@@ -440,12 +446,12 @@ class EpubConverter:
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.old_image_path2aws_path,
book_id=self.file.stem or 'book_id')
book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! items that are not EpubHtmlItems won't be added to the chapter
if self.adjacency_list.get(nav_point):

View File

@@ -56,7 +56,7 @@ def update_src_links_in_images(body_tag: Tag,
path2aws_path[path_to_img_from_root] = new_folder
else:
new_folder = save_image_locally(
path_to_img_from_root, img_content, book_id)
path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder)
if img.attrs.get('width'):