forked from LiveCarta/BookConverter
Function annotations
This commit is contained in:
@@ -38,7 +38,7 @@ class NavPoint:
|
||||
|
||||
|
||||
def flatten(x):
|
||||
"""magic function from stackoverflow for list flattening"""
|
||||
"""Magic function from stackoverflow for list flattening"""
|
||||
atom = lambda i: not isinstance(i, list)
|
||||
nil = lambda i: not i
|
||||
car = lambda i: i[0]
|
||||
|
||||
@@ -28,24 +28,27 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
|
||||
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||
|
||||
|
||||
def convert_tag_values(value):
|
||||
"""Function 1. converts values of tags from em/%/pt to px
|
||||
2. find closest font-size px
|
||||
def convert_tag_values(value: str) -> str:
|
||||
"""
|
||||
Function
|
||||
- converts values of tags from em/%/pt to px
|
||||
- find closest font-size px
|
||||
Parameters
|
||||
----------
|
||||
value: str
|
||||
|
||||
Returns
|
||||
-------
|
||||
converted value: str
|
||||
"""
|
||||
value: str
|
||||
|
||||
"""
|
||||
def find_closest_size(value):
|
||||
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
||||
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
||||
return sizes_px[last_possible_size_index]
|
||||
|
||||
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
|
||||
font_size_regexp = re.compile(
|
||||
r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
|
||||
has_style_attrs = re.search(font_size_regexp, value)
|
||||
if has_style_attrs:
|
||||
if has_style_attrs.group(1):
|
||||
@@ -61,8 +64,7 @@ def convert_tag_values(value):
|
||||
return value
|
||||
|
||||
|
||||
|
||||
"""
|
||||
"""
|
||||
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||
Style properties that can be used to fit livecarta css style convention.
|
||||
If property has empty list, it means that any value can be converted.
|
||||
@@ -164,17 +166,20 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
||||
|
||||
|
||||
def check_style_to_be_tag(style) -> List[tuple]:
|
||||
"""Function search style properties that can be converted to tags.
|
||||
"""
|
||||
Function searches style properties that can be converted to tags.
|
||||
It searches for them and prepare list of properties to be removed from style string
|
||||
Parameters
|
||||
----------
|
||||
style: str
|
||||
<tag style="...">
|
||||
|
||||
Returns
|
||||
-------
|
||||
properties to remove: list
|
||||
"""
|
||||
to_remove: list
|
||||
properties to remove
|
||||
|
||||
"""
|
||||
to_remove = []
|
||||
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
||||
if f'{k[0]}:{k[1]}' in style:
|
||||
@@ -203,7 +208,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type):
|
||||
|
||||
|
||||
def build_css_content(css_content):
|
||||
""" Build css content with livecarta convention """
|
||||
"""Build css content with livecarta convention"""
|
||||
sheet = cssutils.parseString(css_content, validate=False)
|
||||
|
||||
for css_rule in sheet:
|
||||
@@ -227,7 +232,7 @@ class TagStyleConverter:
|
||||
|
||||
@staticmethod
|
||||
def remove_white_if_no_bgcolor(style_, tag):
|
||||
""" Function remove white color if there is no text bg color """
|
||||
"""Function remove text white color if there is no bg color"""
|
||||
if 'background' in style_:
|
||||
return style_
|
||||
|
||||
@@ -264,9 +269,11 @@ class TagStyleConverter:
|
||||
item = item.split(':')
|
||||
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
||||
if len(item[1].split(' ')) == 3:
|
||||
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
|
||||
item[1] = convert_tag_values(item[1].split(
|
||||
' ')[-2]) # split returns middle value
|
||||
else:
|
||||
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
|
||||
item[1] = convert_tag_values(item[1].split(
|
||||
' ')[-1]) # split returns last value
|
||||
clean_style += item[0] + ': ' + item[1] + '; '
|
||||
|
||||
margin_left_regexp = re.compile(
|
||||
@@ -360,7 +367,7 @@ class TagStyleConverter:
|
||||
s = f'{attr}:{value};'
|
||||
self.style = self.style.replace(s, '')
|
||||
self.style = self.style.strip()
|
||||
if i == 0:
|
||||
if not i:
|
||||
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
|
||||
attr, value)]
|
||||
new_tags.append(self.tag_with_inline_style)
|
||||
@@ -388,7 +395,7 @@ class TagStyleConverter:
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_p_to_save_style_attrs(tag):
|
||||
""" Function designed to save style attrs that cannot be in p -> span """
|
||||
"""Function designed to save style attrs that cannot be in p -> span"""
|
||||
if tag.name == 'p' and tag.attrs.get('style'):
|
||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
|
||||
@@ -402,7 +409,6 @@ class TagStyleConverter:
|
||||
if has_p_style_attrs:
|
||||
p_style += item + ';'
|
||||
initial_style = initial_style.replace(item + ';', '')
|
||||
|
||||
# here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
|
||||
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
|
||||
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
|
||||
@@ -410,30 +416,30 @@ class TagStyleConverter:
|
||||
# if find styles that cannot be in <p> -> wrap them in span
|
||||
tag.name = 'span'
|
||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
||||
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||
has_li_style_attr = re.search(li_attrs_regexp, initial_style)
|
||||
span_style = initial_style if not has_li_style_attr else initial_style.replace(
|
||||
has_li_style_attr.group(1), '')
|
||||
p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||
has_p_style_attr = re.search(p_attrs_regexp, initial_style)
|
||||
span_style = initial_style if not has_p_style_attr else initial_style.replace(
|
||||
has_p_style_attr.group(1), '')
|
||||
p_tag.attrs['style'] = p_style
|
||||
tag.attrs['style'] = span_style
|
||||
tag.wrap(p_tag)
|
||||
else: tag.attrs['style'] = p_style
|
||||
else:
|
||||
tag.attrs['style'] = p_style
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_li_to_save_style_attrs(tag):
|
||||
""" Function designed to save style attrs that cannot be in li -> span """
|
||||
"""Function designed to save style attrs that cannot be in li -> span"""
|
||||
if tag.name == 'li' and tag.attrs.get('style'):
|
||||
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
||||
attr not in ['text-align', 'list-style-type']]
|
||||
|
||||
styles_to_be_saved = [attr in tag.attrs.get(
|
||||
styles_to_be_saved_in_span = [attr in tag.attrs.get(
|
||||
'style') for attr in styles_cant_be_in_li]
|
||||
if any(styles_to_be_saved):
|
||||
if any(styles_to_be_saved_in_span):
|
||||
tag.name = 'span'
|
||||
li_tag = BeautifulSoup(features='lxml').new_tag('li')
|
||||
span_style = tag.attrs['style']
|
||||
li_style = ''
|
||||
|
||||
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
|
||||
re.compile(r'(list-style-type:(\w+);)')]:
|
||||
has_li_style_attrs = re.search(
|
||||
@@ -442,39 +448,38 @@ class TagStyleConverter:
|
||||
li_style += has_li_style_attrs.group(1)
|
||||
span_style = span_style.replace(
|
||||
has_li_style_attrs.group(1), '')
|
||||
|
||||
li_tag.attrs['style'] = li_style
|
||||
tag.attrs['style'] = span_style
|
||||
tag.wrap(li_tag)
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
|
||||
""" Function designed to save style attrs that cannot be in ul/ol -> span """
|
||||
"""Function designed to save style attrs that cannot be in ul/ol -> span"""
|
||||
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
|
||||
styles_cant_be_in_ul_ol = [
|
||||
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
||||
|
||||
check = [attr in tag.attrs.get('style')
|
||||
for attr in styles_cant_be_in_ul_ol]
|
||||
if any(check):
|
||||
styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
|
||||
for attr in styles_cant_be_in_ul_ol]
|
||||
if any(styles_to_be_saved_in_span):
|
||||
tag.name = 'span'
|
||||
li_tag = BeautifulSoup(features='lxml').new_tag('ul')
|
||||
oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
|
||||
span_style = tag.attrs['style']
|
||||
|
||||
possible_li_attrs_regexp = re.compile(
|
||||
possible_uol_attrs_regexp = re.compile(
|
||||
r'(list-style-type:(\w+);)')
|
||||
has_li_style_attrs = re.search(
|
||||
possible_li_attrs_regexp, span_style)
|
||||
if has_li_style_attrs and has_li_style_attrs.group(1):
|
||||
oul_style = has_li_style_attrs.group(1)
|
||||
has_uol_style_attrs = re.search(
|
||||
possible_uol_attrs_regexp, span_style)
|
||||
if has_uol_style_attrs and has_uol_style_attrs.group(1):
|
||||
oul_style = has_uol_style_attrs.group(1)
|
||||
span_style = span_style.replace(oul_style, '')
|
||||
li_tag.attrs['style'] = oul_style
|
||||
oul_tag.attrs['style'] = oul_style
|
||||
tag.attrs['style'] = span_style
|
||||
tag.wrap(li_tag)
|
||||
tag.wrap(oul_tag)
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_h_to_save_style_attrs(tag):
|
||||
""" Function designed to save style attrs that cannot be in h -> span """
|
||||
"""Function designed to save style attrs that cannot be in h -> span"""
|
||||
h_regexp = re.compile('(^h[1-9]$)')
|
||||
|
||||
if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
|
||||
@@ -482,10 +487,10 @@ class TagStyleConverter:
|
||||
tag.name = 'span'
|
||||
tag.wrap(h_tag)
|
||||
style = tag.attrs['style']
|
||||
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||
has_li_style_attr = re.search(li_attrs_regexp, style)
|
||||
tag.attrs['style'] = style if not has_li_style_attr else style.replace(
|
||||
has_li_style_attr.group(1), '')
|
||||
h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||
has_h_style_attr = re.search(h_attrs_regexp, style)
|
||||
tag.attrs['style'] = style if not has_h_style_attr else style.replace(
|
||||
has_h_style_attr.group(1), '')
|
||||
|
||||
def convert_initial_tag(self):
|
||||
self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
|
||||
@@ -496,8 +501,8 @@ class TagStyleConverter:
|
||||
return self.tag_with_inline_style
|
||||
|
||||
|
||||
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
||||
""" Function adds styles from .css to inline style """
|
||||
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
||||
"""Function adds styles from .css to inline style"""
|
||||
css_text = css_text.replace(
|
||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||
livecarta_tmp_ids = []
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import re
|
||||
import json
|
||||
import codecs
|
||||
import logging
|
||||
import os
|
||||
from os.path import dirname, normpath, join
|
||||
from itertools import chain
|
||||
@@ -51,7 +50,8 @@ class EpubConverter:
|
||||
# flag to be updated while ebooklib.toc is parsed
|
||||
self.id_anchor_exist_in_nav_points = False
|
||||
self.img_href2img_bytes = {} # file path to bytes
|
||||
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
|
||||
# file path from <a> to generated aws path
|
||||
self.book_image_src_path2aws_path = {}
|
||||
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||
self.noterefs: List[Tag] = [] # start of the footnote
|
||||
self.footnotes: List[Tag] = [] # end of the footnote
|
||||
@@ -116,7 +116,6 @@ class EpubConverter:
|
||||
return nodes
|
||||
|
||||
def get_css_content(self, css_href, html_href):
|
||||
|
||||
path_to_css_from_html = css_href
|
||||
html_folder = dirname(html_href)
|
||||
path_to_css_from_root = normpath(
|
||||
@@ -132,8 +131,8 @@ class EpubConverter:
|
||||
The first is css_href2css_content. It is created to connect href of css to content of css
|
||||
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
|
||||
...2... = key2value
|
||||
"""
|
||||
|
||||
"""
|
||||
# dictionary: href of html to related css files
|
||||
html_href2css_href: defaultdict = defaultdict(list)
|
||||
css_href2css_content: dict = {}
|
||||
@@ -165,6 +164,7 @@ class EpubConverter:
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
And add to html_inline_style css_style_content
|
||||
|
||||
"""
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
if self.html_href2css_href.get(html_href):
|
||||
@@ -191,8 +191,8 @@ class EpubConverter:
|
||||
|
||||
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
|
||||
:param lvl: level of depth
|
||||
"""
|
||||
|
||||
"""
|
||||
if isinstance(element, Link):
|
||||
nav_point = NavPoint(element)
|
||||
if nav_point.id:
|
||||
@@ -215,7 +215,8 @@ class EpubConverter:
|
||||
sub_nodes = []
|
||||
for elem in second:
|
||||
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
|
||||
self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl))
|
||||
self.offset_sub_nodes.append(
|
||||
self.build_adjacency_list_from_toc(elem, lvl))
|
||||
else:
|
||||
sub_nodes.append(
|
||||
self.build_adjacency_list_from_toc(elem, lvl + 1))
|
||||
@@ -239,8 +240,8 @@ class EpubConverter:
|
||||
else:
|
||||
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
|
||||
|
||||
def is_toc_empty(self):
|
||||
""" Function checks is toc empty """
|
||||
def is_toc_empty(self) -> bool:
|
||||
"""Function checks is toc empty"""
|
||||
# there is no toc in ebook or no top chapters
|
||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||
return True
|
||||
@@ -258,7 +259,7 @@ class EpubConverter:
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
|
||||
def add_not_added_files_to_adjacency_list(self, not_added):
|
||||
""" Function add files that not added to adjacency list """
|
||||
"""Function add files that not added to adjacency list"""
|
||||
for i, file in enumerate(not_added):
|
||||
nav_point = NavPoint(
|
||||
Section(f'To check #{i}, filename: {file}', file))
|
||||
@@ -295,19 +296,26 @@ class EpubConverter:
|
||||
new_anchor_span.string = "\xa0"
|
||||
return new_anchor_span
|
||||
|
||||
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
|
||||
"""
|
||||
Function used to find full path to file that is parsed from tag link
|
||||
TOC: a/b/c.xhtml
|
||||
|
||||
b/c.xhtml -> a/b/c.xhtml
|
||||
c.xhtml -> a/b/c.xhtml
|
||||
Parameters
|
||||
----------
|
||||
cur_file_path: str
|
||||
path to current file with tag link
|
||||
href_in_link: str
|
||||
filename got from tag link, like file1.xhtml
|
||||
internal_link_tag: Tag
|
||||
tag object that is parsed now
|
||||
|
||||
Used to find full path to file that is parsed from tag link
|
||||
Returns
|
||||
-------
|
||||
full_path[0]: s
|
||||
prepared content
|
||||
|
||||
:param cur_file_path: path to current file with tag link
|
||||
:param href_in_link: filename got from tag link, like file1.xhtml
|
||||
:param internal_link_tag: tag object that is parsed now
|
||||
:return:
|
||||
"""
|
||||
dir_name = os.path.dirname(cur_file_path)
|
||||
normed_path = os.path.normpath(os.path.join(
|
||||
@@ -331,6 +339,12 @@ class EpubConverter:
|
||||
Function
|
||||
- processing internal links in a book
|
||||
- make ids unique
|
||||
Steps
|
||||
----------
|
||||
1. rebuild ids to be unique in all documents
|
||||
2a. process anchor which is a whole xhtml file
|
||||
2b. process anchor which is an element in xhtml file
|
||||
|
||||
"""
|
||||
# 1. rebuild ids to be unique in all documents
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
@@ -344,7 +358,7 @@ class EpubConverter:
|
||||
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
|
||||
tag.attrs['id'] = new_id
|
||||
|
||||
# 2.a) process anchor which is a whole xhtml file
|
||||
# 2a. process anchor which is a whole xhtml file
|
||||
internal_link_reg1 = re.compile(
|
||||
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
@@ -367,7 +381,7 @@ class EpubConverter:
|
||||
|
||||
del internal_link_tag.attrs['href']
|
||||
|
||||
# 2.b) process anchor which is an element in xhtml file
|
||||
# 2b. process anchor which is an element in xhtml file
|
||||
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
soup = self.html_href2html_body_soup[toc_href]
|
||||
@@ -418,9 +432,9 @@ class EpubConverter:
|
||||
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
||||
f' Old id={a_tag_id}')
|
||||
|
||||
def build_one_chapter(self, nav_point):
|
||||
def build_one_chapter(self, nav_point: NavPoint):
|
||||
"""
|
||||
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||
|
||||
3 cases:
|
||||
id wraps all chapter content,
|
||||
@@ -429,7 +443,13 @@ class EpubConverter:
|
||||
|
||||
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
|
||||
and id of the next chapter/subchapter
|
||||
Parameters
|
||||
----------
|
||||
nav_point: NavPoint
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
"""
|
||||
if nav_point.id:
|
||||
soup = self.html_href2html_body_soup[nav_point.href]
|
||||
@@ -446,7 +466,7 @@ class EpubConverter:
|
||||
self.build_one_chapter(sub_node)
|
||||
|
||||
def define_chapters_content(self):
|
||||
""" Function build chapters content starts from top level chapters """
|
||||
"""Function build chapters content, starts from top level chapters"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
for point in top_level_nav_points:
|
||||
@@ -483,8 +503,8 @@ class EpubConverter:
|
||||
self.logger.log(f'{indent}Chapter: {title} is prepared.')
|
||||
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
||||
|
||||
def convert_to_dict(self):
|
||||
""" Function which convert list of html nodes to appropriate json structure. """
|
||||
def convert_to_dict(self) -> dict:
|
||||
"""Function which convert list of html nodes to appropriate json structure"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
top_level_chapters = []
|
||||
|
||||
@@ -502,7 +522,7 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
filename = '9781641051217'
|
||||
filename = '9781614382264'
|
||||
logger_object = BookLogger(name='epub', book_id=filename)
|
||||
|
||||
json_converter = EpubConverter(f'../../epub/{filename}.epub',
|
||||
|
||||
@@ -2,7 +2,7 @@ from src.book_solver import BookSolver
|
||||
from src.epub_converter.epub_converter import EpubConverter
|
||||
|
||||
class EpubBook(BookSolver):
|
||||
""" Class of .epub type book - child of BookSolver """
|
||||
"""Class of .epub type book - child of BookSolver"""
|
||||
|
||||
def __init__(self, book_id=0, access=None, main_logger=None):
|
||||
super().__init__(book_id, access, main_logger)
|
||||
|
||||
@@ -9,7 +9,7 @@ from src.access import Access
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
def save_image_locally(img_file_path, img_content, book_id):
|
||||
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
|
||||
"""Function saves all images locally"""
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(
|
||||
@@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id):
|
||||
return new_img_path
|
||||
|
||||
|
||||
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
|
||||
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
|
||||
"""Function saves all images to Amazon web service"""
|
||||
link_path = access.send_image(
|
||||
img_file_path, doc_id=book_id, img_content=img_content)
|
||||
return link_path
|
||||
|
||||
|
||||
def update_images_src_links(body_tag: Tag,
|
||||
def update_images_src_links(body_tag: BeautifulSoup,
|
||||
href2img_content: dict,
|
||||
path_to_html,
|
||||
path_to_html: str,
|
||||
access=None,
|
||||
path2aws_path=None,
|
||||
book_id=None):
|
||||
path2aws_path: dict = None,
|
||||
book_id: str = None) -> dict:
|
||||
"""Function makes dictionary image_src_path -> Amazon web service_path"""
|
||||
img_tags = body_tag.find_all('img')
|
||||
|
||||
@@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup):
|
||||
table.attrs['border'] = '1'
|
||||
|
||||
|
||||
def process_lists(body_tag):
|
||||
def process_lists(body_tag: BeautifulSoup):
|
||||
"""
|
||||
Function to process tags <li>.
|
||||
Unwrap <p> tags.
|
||||
"""
|
||||
li_tags = body_tag.find_all("li")
|
||||
Function
|
||||
- process tags <li>.
|
||||
- unwrap <p> tags.
|
||||
Parameters
|
||||
----------
|
||||
body_tag: Tag, soup object
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
|
||||
li_tags = body_tag.find_all("li")
|
||||
for li_tag in li_tags:
|
||||
if li_tag.p:
|
||||
li_tag.attrs.update(li_tag.p.attrs)
|
||||
@@ -113,7 +122,7 @@ def process_lists(body_tag):
|
||||
|
||||
|
||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
"""Function inserts span before tag to be removed(aren't supported by livecarta)"""
|
||||
"""Function inserts span before tag aren't supported by livecarta"""
|
||||
new_tag = main_tag.new_tag("span")
|
||||
new_tag.attrs['id'] = id_ or ''
|
||||
new_tag.attrs['class'] = class_ or ''
|
||||
@@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
tag.insert_before(new_tag)
|
||||
|
||||
|
||||
def clean_headings_content(content: Tag, title: str):
|
||||
def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
|
||||
def clean_headings_content(content: BeautifulSoup, title: str):
|
||||
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
|
||||
if tag_to_be_removed.attrs.get('id'):
|
||||
insert_span_with_attrs_before_tag(body_tag,
|
||||
tag_to_be_removed,
|
||||
@@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
|
||||
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
||||
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
||||
|
||||
"""
|
||||
footnotes = []
|
||||
noterefs_tags = source_html_tag.find_all(
|
||||
@@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
return footnotes, new_noterefs_tags, new_footnotes_tags
|
||||
|
||||
|
||||
def unwrap_structural_tags(body_tag):
|
||||
"""Main function that works with structure of html. Make changes inplace.
|
||||
def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
"""
|
||||
Main function that works with structure of html. Make changes inplace.
|
||||
Parameters
|
||||
----------
|
||||
body_tag: Tag, soup object
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. Extracts tags that are not needed
|
||||
|
||||
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
|
||||
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
|
||||
This tag must have a body_tag as a parent.
|
||||
Otherwise, it is wrapped with some tags. Like:
|
||||
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
|
||||
|
||||
3. Headings that are not supported by livecarta converts to <p>
|
||||
4. Wrapping NavigableString
|
||||
:param body_tag: Tag, soup object
|
||||
:return: None
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
|
||||
def preserve_class_in_aside_tag(tag_):
|
||||
@@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag):
|
||||
if not tag_.parent.attrs.get('class'):
|
||||
tag_.parent.attrs['class'] = tag_class
|
||||
|
||||
def preserve_class_in_section_tag(tag_) -> bool:
|
||||
def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
|
||||
"""
|
||||
to save css style inherited from class, copy class to child <p>
|
||||
Function saves css style inherited from class, copies class to child <p>
|
||||
returns True, if <section> could be unwrapped
|
||||
Parameters
|
||||
----------
|
||||
tag_: Tag, soup object
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
# this is for Wiley books with boxes
|
||||
tag_class = tag_.attrs['class'] if not isinstance(
|
||||
@@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag):
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
|
||||
def replace_div_tag_with_table():
|
||||
"""Function replace <div> with <table>:
|
||||
"""
|
||||
Function replace <div> with <table>:
|
||||
1. Convert div with certain classes to tables
|
||||
2. Add background color to div with background-color
|
||||
|
||||
"""
|
||||
for div in body_tag.find_all("div"):
|
||||
if div.attrs.get('class'):
|
||||
@@ -431,22 +458,22 @@ def unwrap_structural_tags(body_tag):
|
||||
return body_tag
|
||||
|
||||
|
||||
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
"""After processing on a first_id that corresponds to current chapter,
|
||||
from initial html_soup all tags from current chapter are extracted
|
||||
|
||||
Parameters
|
||||
----------
|
||||
first_id :
|
||||
first_id:
|
||||
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
||||
href :
|
||||
href:
|
||||
Name of current chapter's file
|
||||
html_soup :
|
||||
html_soup: Tag, soup object
|
||||
Soup object of current file
|
||||
|
||||
Returns
|
||||
-------
|
||||
tags : list [Tag, NavigableString]
|
||||
tags: list [Tag, NavigableString]
|
||||
Chapter's tags
|
||||
|
||||
"""
|
||||
@@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def wrap_preformatted_span_with_table(main_tag, old_tag):
|
||||
def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
|
||||
"""Function wraps <span> with <table>"""
|
||||
table = main_tag.new_tag("table")
|
||||
table.attrs['border'] = '1px #ccc;'
|
||||
table.attrs['style'] = 'width:100%;'
|
||||
tbody = main_tag.new_tag("tbody")
|
||||
tr = main_tag.new_tag("tr")
|
||||
td = main_tag.new_tag("td")
|
||||
table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
|
||||
"tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;'
|
||||
td.attrs['bgcolor'] = '#f5f5f5'
|
||||
# td.attrs['border-radius'] = '4px'
|
||||
old_tag.wrap(td)
|
||||
span_tag.wrap(td)
|
||||
td.wrap(tr)
|
||||
tr.wrap(tbody)
|
||||
tbody.wrap(table)
|
||||
return table
|
||||
|
||||
|
||||
def preprocess_pre_tags(chapter_tag):
|
||||
"""Function preprocessing <pre> tags
|
||||
def preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function preprocessing <pre> tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
chapter_tag: Tag, soup object
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. cleaning \n
|
||||
2. heading removal
|
||||
3. processing tags
|
||||
4. class removal
|
||||
"""
|
||||
1. Process NavigableString
|
||||
2. Process Tags and their children
|
||||
|
||||
"""
|
||||
for pre in chapter_tag.find_all("pre"):
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
||||
new_tag.attrs = pre.attrs.copy()
|
||||
@@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
|
||||
"font-size: 14px; white-space: nowrap;"
|
||||
pre.replace_with(new_tag)
|
||||
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
|
||||
# add <p> to save brs
|
||||
p_for_br = chapter_tag.new_tag("p")
|
||||
p_for_br.string = "\xa0"
|
||||
table.insert_after(p_for_br)
|
||||
|
||||
|
||||
def preprocess_code_tags(chapter_tag: Tag):
|
||||
"""Function that
|
||||
- transform <code>, <kdb>, <var> tags into span
|
||||
- add code style to this tags
|
||||
def preprocess_code_tags(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function
|
||||
- transform <code>, <kdb>, <var> tags into span
|
||||
- add code style to this tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: Tag, soup object
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
for code in chapter_tag.find_all(re.compile("code|kbd|var")):
|
||||
code.name = "span"
|
||||
if code.parent.name == "pre":
|
||||
@@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag):
|
||||
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
||||
|
||||
|
||||
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
"""Function finalise processing/cleaning title"""
|
||||
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
||||
@@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str:
|
||||
|
||||
|
||||
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
||||
"""Function finalise processing/cleaning content
|
||||
"""
|
||||
Function finalise processing/cleaning content
|
||||
Parameters
|
||||
----------
|
||||
title_str: str
|
||||
|
||||
content_tag: BeautifulSoup
|
||||
content_tag: Tag, soup object
|
||||
|
||||
remove_title_from_chapter: bool
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. cleaning \n
|
||||
1. find \n
|
||||
2. heading removal
|
||||
3. processing tags
|
||||
4. class removal
|
||||
@@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
-------
|
||||
content_tag: str
|
||||
prepared content
|
||||
"""
|
||||
|
||||
# 0. cleaning \n
|
||||
"""
|
||||
# 1. find \n
|
||||
to_remove = []
|
||||
for child in content_tag.contents:
|
||||
if isinstance(child, NavigableString):
|
||||
@@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
if s == '':
|
||||
to_remove.append(child)
|
||||
|
||||
# 1. heading removal
|
||||
# 2. heading removal
|
||||
if remove_title_from_chapter:
|
||||
clean_headings_content(content_tag, title_str)
|
||||
|
||||
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||
process_lists(content_tag)
|
||||
preprocess_table(content_tag)
|
||||
preprocess_code_tags(content_tag)
|
||||
preprocess_pre_tags(content_tag)
|
||||
preprocess_block_tags(content_tag)
|
||||
|
||||
# 3. class removal
|
||||
# 4. class removal
|
||||
for tag in content_tag.find_all(recursive=True):
|
||||
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
|
||||
'footnote-element']):
|
||||
|
||||
Reference in New Issue
Block a user