Function annotations

This commit is contained in:
Kiryl
2022-04-29 17:44:07 +03:00
parent 8de1d0d042
commit 37533e9b67
5 changed files with 187 additions and 130 deletions

View File

@@ -38,7 +38,7 @@ class NavPoint:
def flatten(x): def flatten(x):
"""magic function from stackoverflow for list flattening""" """Magic function from stackoverflow for list flattening"""
atom = lambda i: not isinstance(i, list) atom = lambda i: not isinstance(i, list)
nil = lambda i: not i nil = lambda i: not i
car = lambda i: i[0] car = lambda i: i[0]

View File

@@ -28,24 +28,27 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_tag_values(value): def convert_tag_values(value: str) -> str:
"""Function 1. converts values of tags from em/%/pt to px """
2. find closest font-size px Function
- converts values of tags from em/%/pt to px
- find closest font-size px
Parameters Parameters
---------- ----------
value: str value: str
Returns Returns
------- -------
converted value: str value: str
"""
"""
def find_closest_size(value): def find_closest_size(value):
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr)) possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
last_possible_size_index = sizes_pr.index(possible_sizes[-1]) last_possible_size_index = sizes_pr.index(possible_sizes[-1])
return sizes_px[last_possible_size_index] return sizes_px[last_possible_size_index]
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') font_size_regexp = re.compile(
r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
has_style_attrs = re.search(font_size_regexp, value) has_style_attrs = re.search(font_size_regexp, value)
if has_style_attrs: if has_style_attrs:
if has_style_attrs.group(1): if has_style_attrs.group(1):
@@ -61,8 +64,7 @@ def convert_tag_values(value):
return value return value
"""
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit livecarta css style convention. Style properties that can be used to fit livecarta css style convention.
If property has empty list, it means that any value can be converted. If property has empty list, it means that any value can be converted.
@@ -164,17 +166,20 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
def check_style_to_be_tag(style) -> List[tuple]: def check_style_to_be_tag(style) -> List[tuple]:
"""Function search style properties that can be converted to tags. """
Function searches style properties that can be converted to tags.
It searches for them and prepare list of properties to be removed from style string It searches for them and prepare list of properties to be removed from style string
Parameters Parameters
---------- ----------
style: str style: str
<tag style="..."> <tag style="...">
Returns Returns
------- -------
properties to remove: list to_remove: list
""" properties to remove
"""
to_remove = [] to_remove = []
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style: if f'{k[0]}:{k[1]}' in style:
@@ -203,7 +208,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type):
def build_css_content(css_content): def build_css_content(css_content):
""" Build css content with livecarta convention """ """Build css content with livecarta convention"""
sheet = cssutils.parseString(css_content, validate=False) sheet = cssutils.parseString(css_content, validate=False)
for css_rule in sheet: for css_rule in sheet:
@@ -227,7 +232,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def remove_white_if_no_bgcolor(style_, tag): def remove_white_if_no_bgcolor(style_, tag):
""" Function remove white color if there is no text bg color """ """Function remove text white color if there is no bg color"""
if 'background' in style_: if 'background' in style_:
return style_ return style_
@@ -264,9 +269,11 @@ class TagStyleConverter:
item = item.split(':') item = item.split(':')
if item[0] in ['text-indent', 'margin-left', 'margin']: if item[0] in ['text-indent', 'margin-left', 'margin']:
if len(item[1].split(' ')) == 3: if len(item[1].split(' ')) == 3:
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value item[1] = convert_tag_values(item[1].split(
' ')[-2]) # split returns middle value
else: else:
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value item[1] = convert_tag_values(item[1].split(
' ')[-1]) # split returns last value
clean_style += item[0] + ': ' + item[1] + '; ' clean_style += item[0] + ': ' + item[1] + '; '
margin_left_regexp = re.compile( margin_left_regexp = re.compile(
@@ -360,7 +367,7 @@ class TagStyleConverter:
s = f'{attr}:{value};' s = f'{attr}:{value};'
self.style = self.style.replace(s, '') self.style = self.style.replace(s, '')
self.style = self.style.strip() self.style = self.style.strip()
if i == 0: if not i:
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)] attr, value)]
new_tags.append(self.tag_with_inline_style) new_tags.append(self.tag_with_inline_style)
@@ -388,7 +395,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_p_to_save_style_attrs(tag): def wrap_span_in_p_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in p -> span """ """Function designed to save style attrs that cannot be in p -> span"""
if tag.name == 'p' and tag.attrs.get('style'): if tag.name == 'p' and tag.attrs.get('style'):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
@@ -402,7 +409,6 @@ class TagStyleConverter:
if has_p_style_attrs: if has_p_style_attrs:
p_style += item + ';' p_style += item + ';'
initial_style = initial_style.replace(item + ';', '') initial_style = initial_style.replace(item + ';', '')
# here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top' # here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
'-' + attr not in initial_style) for attr in styles_cant_be_in_p] '-' + attr not in initial_style) for attr in styles_cant_be_in_p]
@@ -410,30 +416,30 @@ class TagStyleConverter:
# if find styles that cannot be in <p> -> wrap them in span # if find styles that cannot be in <p> -> wrap them in span
tag.name = 'span' tag.name = 'span'
p_tag = BeautifulSoup(features='lxml').new_tag('p') p_tag = BeautifulSoup(features='lxml').new_tag('p')
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, initial_style) has_p_style_attr = re.search(p_attrs_regexp, initial_style)
span_style = initial_style if not has_li_style_attr else initial_style.replace( span_style = initial_style if not has_p_style_attr else initial_style.replace(
has_li_style_attr.group(1), '') has_p_style_attr.group(1), '')
p_tag.attrs['style'] = p_style p_tag.attrs['style'] = p_style
tag.attrs['style'] = span_style tag.attrs['style'] = span_style
tag.wrap(p_tag) tag.wrap(p_tag)
else: tag.attrs['style'] = p_style else:
tag.attrs['style'] = p_style
@staticmethod @staticmethod
def wrap_span_in_li_to_save_style_attrs(tag): def wrap_span_in_li_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in li -> span """ """Function designed to save style attrs that cannot be in li -> span"""
if tag.name == 'li' and tag.attrs.get('style'): if tag.name == 'li' and tag.attrs.get('style'):
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
attr not in ['text-align', 'list-style-type']] attr not in ['text-align', 'list-style-type']]
styles_to_be_saved = [attr in tag.attrs.get( styles_to_be_saved_in_span = [attr in tag.attrs.get(
'style') for attr in styles_cant_be_in_li] 'style') for attr in styles_cant_be_in_li]
if any(styles_to_be_saved): if any(styles_to_be_saved_in_span):
tag.name = 'span' tag.name = 'span'
li_tag = BeautifulSoup(features='lxml').new_tag('li') li_tag = BeautifulSoup(features='lxml').new_tag('li')
span_style = tag.attrs['style'] span_style = tag.attrs['style']
li_style = '' li_style = ''
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
re.compile(r'(list-style-type:(\w+);)')]: re.compile(r'(list-style-type:(\w+);)')]:
has_li_style_attrs = re.search( has_li_style_attrs = re.search(
@@ -442,39 +448,38 @@ class TagStyleConverter:
li_style += has_li_style_attrs.group(1) li_style += has_li_style_attrs.group(1)
span_style = span_style.replace( span_style = span_style.replace(
has_li_style_attrs.group(1), '') has_li_style_attrs.group(1), '')
li_tag.attrs['style'] = li_style li_tag.attrs['style'] = li_style
tag.attrs['style'] = span_style tag.attrs['style'] = span_style
tag.wrap(li_tag) tag.wrap(li_tag)
@staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
    """
    Function designed to save style attrs that cannot be in ul/ol -> span

    The tag is renamed to <span> and wrapped in a fresh list tag of the
    same kind (<ul> or <ol>) it originally was. `list-style-type` is moved
    to the new wrapper, every other property stays on the span.
    Changes are made inplace.

    Parameters
    ----------
    tag: Tag, soup object
        tag with inline style; anything that is not a styled ul/ol
        is left untouched

    Returns
    -------
    None

    """
    if tag.name not in ['ul', 'ol'] or not tag.attrs.get('style'):
        return

    span_style = tag.attrs['style']
    # every livecarta property except list-style-type has to live on a span
    if not any(attr in span_style for attr in LIVECARTA_STYLE_ATTRS
               if attr != 'list-style-type'):
        return

    # remember the list kind BEFORE renaming: reading tag.name after the
    # rename would build a <span> wrapper instead of <ul>/<ol>
    list_tag_name = tag.name
    tag.name = 'span'
    oul_tag = BeautifulSoup(features='lxml').new_tag(list_tag_name)

    has_uol_style_attrs = re.search(r'(list-style-type:(\w+);)', span_style)
    if has_uol_style_attrs:
        # move list-style-type from the span to the list wrapper; when the
        # property is absent the wrapper simply gets no style attribute
        oul_style = has_uol_style_attrs.group(1)
        span_style = span_style.replace(oul_style, '')
        oul_tag.attrs['style'] = oul_style

    tag.attrs['style'] = span_style
    tag.wrap(oul_tag)
@staticmethod @staticmethod
def wrap_span_in_h_to_save_style_attrs(tag): def wrap_span_in_h_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in h -> span """ """Function designed to save style attrs that cannot be in h -> span"""
h_regexp = re.compile('(^h[1-9]$)') h_regexp = re.compile('(^h[1-9]$)')
if re.search(h_regexp, tag.name) and tag.attrs.get('style'): if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
@@ -482,10 +487,10 @@ class TagStyleConverter:
tag.name = 'span' tag.name = 'span'
tag.wrap(h_tag) tag.wrap(h_tag)
style = tag.attrs['style'] style = tag.attrs['style']
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, style) has_h_style_attr = re.search(h_attrs_regexp, style)
tag.attrs['style'] = style if not has_li_style_attr else style.replace( tag.attrs['style'] = style if not has_h_style_attr else style.replace(
has_li_style_attr.group(1), '') has_h_style_attr.group(1), '')
def convert_initial_tag(self): def convert_initial_tag(self):
self.tag_with_inline_style = self.change_attrs_with_corresponding_tags() self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
@@ -496,8 +501,8 @@ class TagStyleConverter:
return self.tag_with_inline_style return self.tag_with_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
""" Function adds styles from .css to inline style """ """Function adds styles from .css to inline style"""
css_text = css_text.replace( css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '') '@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []

View File

@@ -1,7 +1,6 @@
import re import re
import json import json
import codecs import codecs
import logging
import os import os
from os.path import dirname, normpath, join from os.path import dirname, normpath, join
from itertools import chain from itertools import chain
@@ -51,7 +50,8 @@ class EpubConverter:
# flag to be updated while ebooklib.toc is parsed # flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes self.img_href2img_bytes = {} # file path to bytes
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path # file path from <a> to generated aws path
self.book_image_src_path2aws_path = {}
self.footnotes_contents: List[str] = [] # to be sent on server as is self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote self.footnotes: List[Tag] = [] # end of the footnote
@@ -116,7 +116,6 @@ class EpubConverter:
return nodes return nodes
def get_css_content(self, css_href, html_href): def get_css_content(self, css_href, html_href):
path_to_css_from_html = css_href path_to_css_from_html = css_href
html_folder = dirname(html_href) html_folder = dirname(html_href)
path_to_css_from_root = normpath( path_to_css_from_root = normpath(
@@ -132,8 +131,8 @@ class EpubConverter:
The first is css_href2css_content. It is created to connect href of css to content of css The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value ...2... = key2value
"""
"""
# dictionary: href of html to related css files # dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list) html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {} css_href2css_content: dict = {}
@@ -165,6 +164,7 @@ class EpubConverter:
""" """
This function is designed to update html_href2html_body_soup This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content And add to html_inline_style css_style_content
""" """
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href): if self.html_href2css_href.get(html_href):
@@ -191,8 +191,8 @@ class EpubConverter:
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx) :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
:param lvl: level of depth :param lvl: level of depth
"""
"""
if isinstance(element, Link): if isinstance(element, Link):
nav_point = NavPoint(element) nav_point = NavPoint(element)
if nav_point.id: if nav_point.id:
@@ -215,7 +215,8 @@ class EpubConverter:
sub_nodes = [] sub_nodes = []
for elem in second: for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl)) self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl))
else: else:
sub_nodes.append( sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl + 1)) self.build_adjacency_list_from_toc(elem, lvl + 1))
@@ -239,8 +240,8 @@ class EpubConverter:
else: else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self): def is_toc_empty(self) -> bool:
""" Function checks is toc empty """ """Function checks is toc empty"""
# there is no toc in ebook or no top chapters # there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True return True
@@ -258,7 +259,7 @@ class EpubConverter:
self.hrefs_added_to_toc.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added): def add_not_added_files_to_adjacency_list(self, not_added):
""" Function add files that not added to adjacency list """ """Function add files that not added to adjacency list"""
for i, file in enumerate(not_added): for i, file in enumerate(not_added):
nav_point = NavPoint( nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file)) Section(f'To check #{i}, filename: {file}', file))
@@ -295,19 +296,26 @@ class EpubConverter:
new_anchor_span.string = "\xa0" new_anchor_span.string = "\xa0"
return new_anchor_span return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
""" """
Function used to find full path to file that is parsed from tag link
TOC: a/b/c.xhtml TOC: a/b/c.xhtml
b/c.xhtml -> a/b/c.xhtml b/c.xhtml -> a/b/c.xhtml
c.xhtml -> a/b/c.xhtml c.xhtml -> a/b/c.xhtml
Parameters
----------
cur_file_path: str
path to current file with tag link
href_in_link: str
filename got from tag link, like file1.xhtml
internal_link_tag: Tag
tag object that is parsed now
Used to find full path to file that is parsed from tag link Returns
-------
full_path[0]: str
prepared content
:param cur_file_path: path to current file with tag link
:param href_in_link: filename got from tag link, like file1.xhtml
:param internal_link_tag: tag object that is parsed now
:return:
""" """
dir_name = os.path.dirname(cur_file_path) dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join( normed_path = os.path.normpath(os.path.join(
@@ -331,6 +339,12 @@ class EpubConverter:
Function Function
- processing internal links in a book - processing internal links in a book
- make ids unique - make ids unique
Steps
----------
1. rebuild ids to be unique in all documents
2a. process anchor which is a whole xhtml file
2b. process anchor which is an element in xhtml file
""" """
# 1. rebuild ids to be unique in all documents # 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
@@ -344,7 +358,7 @@ class EpubConverter:
new_id = self.create_unique_id(toc_href, tag.attrs['id']) new_id = self.create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id tag.attrs['id'] = new_id
# 2.a) process anchor which is a whole xhtml file # 2a. process anchor which is a whole xhtml file
internal_link_reg1 = re.compile( internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)') r'(^(?!https?://).+\.(htm|html|xhtml)$)')
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
@@ -367,7 +381,7 @@ class EpubConverter:
del internal_link_tag.attrs['href'] del internal_link_tag.attrs['href']
# 2.b) process anchor which is an element in xhtml file # 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)') internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href] soup = self.html_href2html_body_soup[toc_href]
@@ -418,9 +432,9 @@ class EpubConverter:
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}') f' Old id={a_tag_id}')
def build_one_chapter(self, nav_point): def build_one_chapter(self, nav_point: NavPoint):
""" """
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases: 3 cases:
id wraps all chapter content, id wraps all chapter content,
@@ -429,7 +443,13 @@ class EpubConverter:
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
and id of the next chapter/subchapter and id of the next chapter/subchapter
Parameters
----------
nav_point: NavPoint
Returns
-------
None
""" """
if nav_point.id: if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href] soup = self.html_href2html_body_soup[nav_point.href]
@@ -446,7 +466,7 @@ class EpubConverter:
self.build_one_chapter(sub_node) self.build_one_chapter(sub_node)
def define_chapters_content(self): def define_chapters_content(self):
""" Function build chapters content starts from top level chapters """ """Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points: for point in top_level_nav_points:
@@ -483,8 +503,8 @@ class EpubConverter:
self.logger.log(f'{indent}Chapter: {title} is prepared.') self.logger.log(f'{indent}Chapter: {title} is prepared.')
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self): def convert_to_dict(self) -> dict:
""" Function which convert list of html nodes to appropriate json structure. """ """Function which convert list of html nodes to appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = [] top_level_chapters = []
@@ -502,7 +522,7 @@ class EpubConverter:
if __name__ == "__main__": if __name__ == "__main__":
filename = '9781641051217' filename = '9781614382264'
logger_object = BookLogger(name='epub', book_id=filename) logger_object = BookLogger(name='epub', book_id=filename)
json_converter = EpubConverter(f'../../epub/{filename}.epub', json_converter = EpubConverter(f'../../epub/{filename}.epub',

View File

@@ -2,7 +2,7 @@ from src.book_solver import BookSolver
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
class EpubBook(BookSolver): class EpubBook(BookSolver):
""" Class of .epub type book - child of BookSolver """ """Class of .epub type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, main_logger=None): def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)

View File

@@ -9,7 +9,7 @@ from src.access import Access
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id): def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images locally""" """Function saves all images locally"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join( new_path = pathlib.Path(os.path.join(
@@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id):
return new_img_path return new_img_path
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """
    Function saves an image to Amazon web service

    Parameters
    ----------
    access: Access
        object whose send_image method performs the upload
    img_file_path: str
        path of the image, passed to send_image as is
    img_content: bytes
        raw content of the image
    book_id: str
        id of the book, passed to send_image as doc_id

    Returns
    -------
    whatever access.send_image returns (used by callers as the link path)

    """
    return access.send_image(
        img_file_path,
        doc_id=book_id,
        img_content=img_content,
    )
def update_images_src_links(body_tag: Tag, def update_images_src_links(body_tag: BeautifulSoup,
href2img_content: dict, href2img_content: dict,
path_to_html, path_to_html: str,
access=None, access=None,
path2aws_path=None, path2aws_path: dict = None,
book_id=None): book_id: str = None) -> dict:
"""Function makes dictionary image_src_path -> Amazon web service_path""" """Function makes dictionary image_src_path -> Amazon web service_path"""
img_tags = body_tag.find_all('img') img_tags = body_tag.find_all('img')
@@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup):
table.attrs['border'] = '1' table.attrs['border'] = '1'
def process_lists(body_tag): def process_lists(body_tag: BeautifulSoup):
""" """
Function to process tags <li>. Function
Unwrap <p> tags. - process tags <li>.
""" - unwrap <p> tags.
li_tags = body_tag.find_all("li") Parameters
----------
body_tag: Tag, soup object
Returns
-------
None
"""
li_tags = body_tag.find_all("li")
for li_tag in li_tags: for li_tag in li_tags:
if li_tag.p: if li_tag.p:
li_tag.attrs.update(li_tag.p.attrs) li_tag.attrs.update(li_tag.p.attrs)
@@ -113,7 +122,7 @@ def process_lists(body_tag):
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
"""Function inserts span before tag to be removed(aren't supported by livecarta)""" """Function inserts span before tag aren't supported by livecarta"""
new_tag = main_tag.new_tag("span") new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or '' new_tag.attrs['id'] = id_ or ''
new_tag.attrs['class'] = class_ or '' new_tag.attrs['class'] = class_ or ''
@@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
tag.insert_before(new_tag) tag.insert_before(new_tag)
def clean_headings_content(content: Tag, title: str): def clean_headings_content(content: BeautifulSoup, title: str):
def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag): def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
if tag_to_be_removed.attrs.get('id'): if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag, insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed, tag_to_be_removed,
@@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p> <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside> <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
""" """
footnotes = [] footnotes = []
noterefs_tags = source_html_tag.find_all( noterefs_tags = source_html_tag.find_all(
@@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
return footnotes, new_noterefs_tags, new_footnotes_tags return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag): def unwrap_structural_tags(body_tag: BeautifulSoup):
"""Main function that works with structure of html. Make changes inplace. """
Main function that works with structure of html. Make changes inplace.
Parameters
----------
body_tag: Tag, soup object
Steps
----------
1. Extracts tags that are not needed 1. Extracts tags that are not needed
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
This tag must have a body_tag as a parent. This tag must have a body_tag as a parent.
Otherwise, it is wrapped with some tags. Like: Otherwise, it is wrapped with some tags. Like:
<p> <span id='123', class='converter-chapter-mark'> </span> </p> <p> <span id='123', class='converter-chapter-mark'> </span> </p>
3. Headings that are not supported by livecarta converts to <p> 3. Headings that are not supported by livecarta converts to <p>
4. Wrapping NavigableString 4. Wrapping NavigableString
:param body_tag: Tag, soup object
:return: None Returns
-------
None
""" """
def preserve_class_in_aside_tag(tag_): def preserve_class_in_aside_tag(tag_):
@@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag):
if not tag_.parent.attrs.get('class'): if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class tag_.parent.attrs['class'] = tag_class
def preserve_class_in_section_tag(tag_) -> bool: def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
""" """
to save css style inherited from class, copy class to child <p> Function saves css style inherited from class, copies class to child <p>
returns True, if <section> could be unwrapped returns True, if <section> could be unwrapped
Parameters
----------
tag_: Tag, soup object
Returns
-------
None
""" """
# this is for Wiley books with boxes # this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance( tag_class = tag_.attrs['class'] if not isinstance(
@@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag):
class_=tag_to_be_removed.attrs.get('class')) class_=tag_to_be_removed.attrs.get('class'))
def replace_div_tag_with_table(): def replace_div_tag_with_table():
"""Function replace <div> with <table>: """
Function replace <div> with <table>:
1. Convert div with certain classes to tables 1. Convert div with certain classes to tables
2. Add background color to div with background-color 2. Add background color to div with background-color
""" """
for div in body_tag.find_all("div"): for div in body_tag.find_all("div"):
if div.attrs.get('class'): if div.attrs.get('class'):
@@ -431,22 +458,22 @@ def unwrap_structural_tags(body_tag):
return body_tag return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup): def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""After processing on a first_id that corresponds to current chapter, """After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted from initial html_soup all tags from current chapter are extracted
Parameters Parameters
---------- ----------
first_id : first_id:
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
href : href:
Name of current chapter's file Name of current chapter's file
html_soup : html_soup: Tag, soup object
Soup object of current file Soup object of current file
Returns Returns
------- -------
tags : list [Tag, NavigableString] tags: list [Tag, NavigableString]
Chapter's tags Chapter's tags
""" """
@@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str:
return text return text
def wrap_preformatted_span_with_table(chapter_tag: BeautifulSoup, span_tag: Tag) -> Tag:
    """
    Function wraps <span> with <table>

    Builds a <table><tbody><tr><td> skeleton and nests span_tag inside the <td>.

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Soup object used as a factory for the new tags. It has to expose
        ``new_tag``, which is a method of BeautifulSoup and not of a plain Tag
    span_tag: Tag
        <span> to be wrapped

    Returns
    -------
    table: Tag
        Outermost <table> tag that now contains span_tag

    """
    table = chapter_tag.new_tag("table")
    tbody = chapter_tag.new_tag("tbody")
    tr = chapter_tag.new_tag("tr")
    td = chapter_tag.new_tag("td")

    table.attrs['border'] = '1px #ccc;'
    table.attrs['style'] = 'width:100%;'
    td.attrs['bgcolor'] = '#f5f5f5'

    # wrap from the inside out: span -> td -> tr -> tbody -> table
    span_tag.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    return table
def preprocess_pre_tags(chapter_tag): def preprocess_pre_tags(chapter_tag: BeautifulSoup):
"""Function preprocessing <pre> tags """
Function preprocessing <pre> tags
Parameters Parameters
---------- ----------
chapter_tag: BeautifulSoup chapter_tag: Tag, soup object
Steps Steps
---------- ----------
1. cleaning \n 1. Process NavigableString
2. heading removal 2. Process Tags and their children
3. processing tags
4. class removal
"""
"""
for pre in chapter_tag.find_all("pre"): for pre in chapter_tag.find_all("pre"):
new_tag = BeautifulSoup(features='lxml').new_tag("span") new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy() new_tag.attrs = pre.attrs.copy()
@@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
"font-size: 14px; white-space: nowrap;" "font-size: 14px; white-space: nowrap;"
pre.replace_with(new_tag) pre.replace_with(new_tag)
table = wrap_preformatted_span_with_table(chapter_tag, new_tag) table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
# add <p> to save brs
p_for_br = chapter_tag.new_tag("p") p_for_br = chapter_tag.new_tag("p")
p_for_br.string = "\xa0" p_for_br.string = "\xa0"
table.insert_after(p_for_br) table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag: Tag): def preprocess_code_tags(chapter_tag: BeautifulSoup):
"""Function that
- transform <code>, <kdb>, <var> tags into span
- add code style to this tags
""" """
Function
- transform <code>, <kbd>, <var> tags into span
- add code style to this tags
Parameters
----------
chapter_tag: Tag, soup object
Returns
-------
None
"""
for code in chapter_tag.find_all(re.compile("code|kbd|var")): for code in chapter_tag.find_all(re.compile("code|kbd|var")):
code.name = "span" code.name = "span"
if code.parent.name == "pre": if code.parent.name == "pre":
@@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag):
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;' code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title(title_of_chapter: str) -> str: def prepare_title(title_of_chapter: str) -> str:
"""Function finalise processing/cleaning title""" """Function finalise processing/cleaning title"""
title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = BeautifulSoup(title_of_chapter, features='lxml').string
@@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str:
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
"""Function finalise processing/cleaning content """
Function finalise processing/cleaning content
Parameters Parameters
---------- ----------
title_str: str title_str: str
content_tag: BeautifulSoup content_tag: Tag, soup object
remove_title_from_chapter: bool remove_title_from_chapter: bool
Steps Steps
---------- ----------
1. cleaning \n 1. find \n
2. heading removal 2. heading removal
3. processing tags 3. processing tags
4. class removal 4. class removal
@@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
------- -------
content_tag: str content_tag: str
prepared content prepared content
"""
# 0. cleaning \n """
# 1. find \n
to_remove = [] to_remove = []
for child in content_tag.contents: for child in content_tag.contents:
if isinstance(child, NavigableString): if isinstance(child, NavigableString):
@@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
if s == '': if s == '':
to_remove.append(child) to_remove.append(child)
# 1. heading removal # 2. heading removal
if remove_title_from_chapter: if remove_title_from_chapter:
clean_headings_content(content_tag, title_str) clean_headings_content(content_tag, title_str)
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>) # 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
process_lists(content_tag) process_lists(content_tag)
preprocess_table(content_tag) preprocess_table(content_tag)
preprocess_code_tags(content_tag) preprocess_code_tags(content_tag)
preprocess_pre_tags(content_tag) preprocess_pre_tags(content_tag)
preprocess_block_tags(content_tag) preprocess_block_tags(content_tag)
# 3. class removal # 4. class removal
for tag in content_tag.find_all(recursive=True): for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']): 'footnote-element']):