Make todos & refactor code

This commit is contained in:
Kiryl
2021-11-02 12:06:34 +03:00
parent 8c37482616
commit 479695e185
5 changed files with 314 additions and 242 deletions

View File

@@ -1,27 +1,28 @@
import os
import re
import json
import codecs
import logging
import os
from os.path import dirname, normpath, join
from itertools import chain
from collections import defaultdict
from typing import Dict, Union, List
from os.path import dirname, normpath, join
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup, Tag
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
update_src_links_in_images, preprocess_footnotes
class EpubConverter:
def __init__(self, file, access=None, logger=None):
self.file = file
@@ -29,9 +30,9 @@ class EpubConverter:
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file)
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
        self.hrefs_added_to_toc = set()  # enumerate all file paths that were added to TOC
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.html_href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
        self.hrefs_added_to_toc = set()  # enumerate all file paths that were added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
@@ -42,8 +43,8 @@ class EpubConverter:
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.internal_anchors = set()
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.href2img_bytes = {} # file path to bytes
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.img_href2img_bytes = {} # file path to bytes
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
@@ -54,11 +55,11 @@ class EpubConverter:
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.href2img_bytes[file_name] = content
self.img_href2img_bytes[file_name] = content
self.logger.log('HTML files reading.')
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
self.logger.log('CSS files processing.')
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
@@ -84,12 +85,14 @@ class EpubConverter:
# build simple toc from spine if needed
if self.is_toc_empty():
self.build_adjacency_list_from_spine()
not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.')
self.label_chapters_ids_with_tmp_id()
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
# used only after parsed toc, ids from toc needed
self.process_html_soup_structure_to_line()
self.process_internal_links()
self.logger.log(f'Building chapters content.')
self.define_chapters_content()
@@ -110,7 +113,8 @@ class EpubConverter:
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.'
css_content: str = css_obj.get_content().decode()
@@ -124,14 +128,16 @@ class EpubConverter:
...2... = key2value
'''
html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
html_href = item.file_name
soup_html_content = BeautifulSoup(html_content, features='lxml')
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file
# check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
continue
css_href = tag.attrs.get('href')
@@ -144,7 +150,8 @@ class EpubConverter:
for i, tag in enumerate(soup_html_content.find_all('style')):
css_content = tag.string
html_href2css_href[html_href].append(f'href{i}')
css_href2css_content[f'href{i}'] = build_css_content(css_content)
css_href2css_content[f'href{i}'] = build_css_content(
css_content)
return html_href2css_href, css_href2css_content,
@@ -153,14 +160,14 @@ class EpubConverter:
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
'''
for href in self.html_href2html_body_soup:
if self.html_href2css_href.get(href):
css =''
for key in self.html_href2css_href[href]:
css += self.css_href2css_content[key]
content: BeautifulSoup = self.html_href2html_body_soup[href]
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
css = ''
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
content: BeautifulSoup = self.html_href2html_body_soup[html_href]
content = convert_html_soup_with_css_style(content, css)
self.html_href2html_body_soup[href] = content
self.html_href2html_body_soup[html_href] = content
def build_manifest_id2html_href(self):
links = dict()
@@ -173,18 +180,18 @@ class EpubConverter:
"""
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
key = -1 if root, value = None if leaf
        key = -1 if root (top-level chapters),
        value = None if leaf (lowest-level chapters)
:param element: [Link, tuple, list] - element that appears in TOC( usually parsed from nav.ncx)
:param lvl: level of depth
        :param element: [Link, tuple, list] - element that appears in TOC (usually parsed from nav.ncx)
:param lvl: level of depth
"""
if isinstance(element, Link):
# todo: check if link exists
nav_point = NavPoint(element)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href)
return nav_point
@@ -195,11 +202,12 @@ class EpubConverter:
nav_point = NavPoint(first)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
sub_nodes = []
for i in second:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[nav_point] = sub_nodes
self.hrefs_added_to_toc.add(nav_point.href)
@@ -208,39 +216,43 @@ class EpubConverter:
elif isinstance(element, list) and (lvl == 0):
sub_nodes = []
for i in element:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[-1] = sub_nodes
else:
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self):
# there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True
return False
def build_adjacency_list_from_spine(self):
manifest_id2href = self.build_manifest_id2html_href()
manifest_id2html_href = self.build_manifest_id2html_href()
self.adjacency_list = {
-1: []
}
for id_, _ in self.ebooklib_book.spine:
nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
nav_point = NavPoint(
Section(manifest_id2html_href[id_], manifest_id2html_href[id_]))
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
for i, file in enumerate(not_added):
nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_tmp_id(self):
for href in self.html_href2html_body_soup:
ids = self.href2subchapter_ids[href]
for html_href in self.html_href2html_body_soup:
ids = self.html_href2subchapter_ids[html_href]
for i in ids:
soup = self.html_href2html_body_soup[href]
soup = self.html_href2html_body_soup[html_href]
tag = soup.find(id=i)
new_h = soup.new_tag('tmp')
new_h.attrs['class'] = 'converter-chapter-mark'
@@ -249,9 +261,9 @@ class EpubConverter:
def process_html_soup_structure_to_line(self):
# go to line structure
for href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[href]
self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
@staticmethod
def create_unique_id(href, id_):
@@ -280,8 +292,10 @@ class EpubConverter:
:return:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
normed_path = os.path.normpath(os.path.join(
dir_name, href_in_link)).replace('\\', '/')
full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.')
@@ -291,7 +305,7 @@ class EpubConverter:
if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.')
return full_path[0]
def process_internal_links(self):
@@ -308,13 +322,15 @@ class EpubConverter:
tag.attrs['id'] = new_id
# 2.a) process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)')
internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href']
# find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
@@ -322,7 +338,8 @@ class EpubConverter:
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
# insert a new span to the begin of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href']
@@ -332,20 +349,26 @@ class EpubConverter:
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
'#')
# find full path
if a_tag_href:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag)
else:
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
a_tag_href_matched_to_toc = os.path.normpath(
toc_href).replace('\\', '/')
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote
anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
anchor_tags = anchor_tags or anchor_soup.find_all(
attrs={'id': a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
@@ -359,7 +382,8 @@ class EpubConverter:
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(soup, new_id)
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs['id']
@@ -386,11 +410,13 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
chapter_tags = get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser')
for tag in chapter_tags:
new_tree.append(tag)
self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
@@ -405,25 +431,27 @@ class EpubConverter:
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
title = nav_point.title
if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
content: BeautifulSoup = self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)]
else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.old_image_path2aws_path = update_src_links_in_images(content,
self.href2img_bytes,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.old_image_path2aws_path)
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
remove_title_from_chapter=is_chapter)
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won;t be added to chapter
# warning! not EpubHtmlItems won't be added to chapter
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
sub_chapter_item = self.node_to_livecarta_chapter_item(
sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item)
if self.logger:
@@ -451,16 +479,16 @@ class EpubConverter:
if __name__ == "__main__":
logger = logging.getLogger('epub')
file_handler = logging.StreamHandler()
logger.addHandler(file_handler)
file_handler = logging.FileHandler('../epub.log', mode='w+')
stream_handler = logging.StreamHandler()
logger.addHandler(stream_handler)
file_handler = logging.FileHandler('../../epub.log', mode='w+')
logger.addHandler(file_handler)
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../../epub/Cook.epub',
json_converter = EpubConverter('../../epub/9781634259804.epub',
logger=logger_object)
tmp = json_converter.convert_to_dict()
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False)
with codecs.open('../../json/tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False)