forked from LiveCarta/BookConverter
LAW-5444
This commit is contained in:
@@ -14,11 +14,11 @@ from src.livecarta_config import LiveCartaConfig
|
|||||||
cssutils.log.setLevel(CRITICAL)
|
cssutils.log.setLevel(CRITICAL)
|
||||||
|
|
||||||
|
|
||||||
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
|
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
|
||||||
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
|
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
|
||||||
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
|
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
|
||||||
|
|
||||||
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
|
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
|
||||||
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
|
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
|
||||||
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
|
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
|
||||||
'48px', '49px', '50px', '64px', '72px']
|
'48px', '49px', '50px', '64px', '72px']
|
||||||
@@ -28,61 +28,42 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
|
|||||||
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||||
|
|
||||||
|
|
||||||
def convert_font_size(value):
|
def convert_tag_values(value):
|
||||||
""" Function converts font-size in mapping """
|
"""Function 1. converts values of tags from em/%/pt to px
|
||||||
if 'pt' in value:
|
2. finds the closest font-size in px
|
||||||
if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
|
Parameters
|
||||||
return ''
|
----------
|
||||||
else:
|
value: str
|
||||||
return value.replace('pt', 'px')
|
|
||||||
|
|
||||||
if value == '100%':
|
Returns
|
||||||
return ''
|
-------
|
||||||
try:
|
converted value: str
|
||||||
if '%' in value:
|
"""
|
||||||
value = float(value.replace('%', ''))
|
|
||||||
value = value / 100.0
|
|
||||||
elif 'em' in value:
|
|
||||||
value = float(value.replace('em', ''))
|
|
||||||
else:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
if value > 5:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
def find_closest_size(value):
|
||||||
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
||||||
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
||||||
return sizes_px[last_possible_size_index]
|
return sizes_px[last_possible_size_index]
|
||||||
|
|
||||||
except ValueError:
|
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
|
||||||
return ''
|
has_style_attrs = re.search(font_size_regexp, value)
|
||||||
|
|
||||||
|
|
||||||
def convert_indents(value):
|
|
||||||
""" Function converts text-indent and margin-left values to px """
|
|
||||||
# 30px = 3.2% = 1.25em = 23pt
|
|
||||||
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)')
|
|
||||||
has_style_attrs = re.search(text_indent_regexp, value)
|
|
||||||
if has_style_attrs:
|
if has_style_attrs:
|
||||||
if has_style_attrs.group(1):
|
if has_style_attrs.group(1):
|
||||||
value = value.replace(has_style_attrs.group(1),
|
value = float(value.replace('%', '')) / 100.0
|
||||||
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
|
return find_closest_size(value)
|
||||||
'px')
|
elif has_style_attrs.group(3):
|
||||||
|
value = float(value.replace('em', ''))
|
||||||
elif has_style_attrs.group(2):
|
return find_closest_size(value)
|
||||||
value = value.replace(has_style_attrs.group(2),
|
elif has_style_attrs.group(5):
|
||||||
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
|
return value.replace('pt', 'px')
|
||||||
'px')
|
else:
|
||||||
|
return ''
|
||||||
elif has_style_attrs.group(4):
|
|
||||||
value = value.replace(has_style_attrs.group(4),
|
|
||||||
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px')
|
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
LIVECARTA_STYLE_ATTRS = { css property: value }
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||||
Style properties that can be used to fit livecarta css style convention.
|
Style properties that can be used to fit livecarta css style convention.
|
||||||
If a property has an empty list, it means that any value can be converted.
|
If a property has an empty list, it means that any value can be converted.
|
||||||
If a property has a non-empty list, it means that only certain property-value combinations can be transformed.
|
If a property has a non-empty list, it means that only certain property-value combinations can be transformed.
|
||||||
@@ -115,7 +96,8 @@ LIVECARTA_STYLE_ATTRS = {
|
|||||||
'list-style-type': [],
|
'list-style-type': [],
|
||||||
'list-style-image': [],
|
'list-style-image': [],
|
||||||
'margin-left': [],
|
'margin-left': [],
|
||||||
'margin-top': []
|
'margin-top': [],
|
||||||
|
'margin': [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -132,18 +114,18 @@ def get_text_color(x):
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||||
|
|
||||||
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
|
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
|
||||||
to suit livecarta style convention.
|
to suit livecarta style convention.
|
||||||
"""
|
"""
|
||||||
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||||
'text-indent': convert_indents,
|
'text-indent': convert_tag_values,
|
||||||
'font-variant': lambda x: x,
|
'font-variant': lambda x: x,
|
||||||
'text-align': lambda x: x,
|
'text-align': lambda x: x,
|
||||||
'font': lambda x: '',
|
'font': lambda x: '',
|
||||||
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||||
'font-size': convert_font_size,
|
'font-size': convert_tag_values,
|
||||||
'color': get_text_color,
|
'color': get_text_color,
|
||||||
'background-color': get_bg_color,
|
'background-color': get_bg_color,
|
||||||
'background': get_bg_color,
|
'background': get_bg_color,
|
||||||
@@ -156,8 +138,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
|||||||
'border-bottom': lambda x: x if x != '0' else '',
|
'border-bottom': lambda x: x if x != '0' else '',
|
||||||
'list-style-type': lambda x: x if x in list_types else 'disc',
|
'list-style-type': lambda x: x if x in list_types else 'disc',
|
||||||
'list-style-image': lambda x: 'disc',
|
'list-style-image': lambda x: 'disc',
|
||||||
'margin-left': convert_indents,
|
'margin-left': convert_tag_values,
|
||||||
'margin-top': convert_indents
|
'margin-top': convert_tag_values,
|
||||||
|
'margin': convert_tag_values,
|
||||||
}
|
}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -181,10 +164,17 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
|||||||
|
|
||||||
|
|
||||||
def check_style_to_be_tag(style) -> List[tuple]:
|
def check_style_to_be_tag(style) -> List[tuple]:
|
||||||
"""
|
"""Function searches for style properties that can be converted to tags.
|
||||||
Some css style properties converts to tags.
|
It searches for them and prepares a list of properties to be removed from the style string
|
||||||
Search for them and prepare list of properties to be removed from style string
|
Parameters
|
||||||
|
----------
|
||||||
|
style: str
|
||||||
|
<tag style="...">
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
properties to remove: list
|
||||||
"""
|
"""
|
||||||
|
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
||||||
if f'{k[0]}:{k[1]}' in style:
|
if f'{k[0]}:{k[1]}' in style:
|
||||||
@@ -267,37 +257,40 @@ class TagStyleConverter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def process_indents_to_px(split_style: dict) -> str:
|
def process_indents_to_px(split_style: dict) -> str:
|
||||||
""" Function cleans using convert_indents() style string and returns new clean_style """
|
"""Function cleans style string using convert_tag_values() and returns new clean_style"""
|
||||||
split_style = [k + ":" + v for k, v in split_style.items()]
|
split_style = [k + ":" + v for k, v in split_style.items()]
|
||||||
clean_style = ''
|
clean_style = ''
|
||||||
for item in split_style:
|
for item in split_style:
|
||||||
item = item.split(':')
|
item = item.split(':')
|
||||||
if item[0] in ['text-indent', 'margin-left']:
|
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
||||||
item[1] = convert_indents(item[1])
|
if len(item[1].split(' ')) == 3:
|
||||||
|
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
|
||||||
|
else:
|
||||||
|
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
|
||||||
clean_style += item[0] + ': ' + item[1] + '; '
|
clean_style += item[0] + ': ' + item[1] + '; '
|
||||||
|
|
||||||
margin_left_regexp = re.compile(
|
margin_left_regexp = re.compile(
|
||||||
r'(margin-left: *(-*\w+);*)')
|
r'((margin-left|margin): *(-*\w+);*)')
|
||||||
text_indent_regexp = re.compile(
|
text_indent_regexp = re.compile(
|
||||||
r'(text-indent: *(-*\w+);*)')
|
r'(text-indent: *(-*\w+);*)')
|
||||||
|
|
||||||
has_margin_left = re.search(margin_left_regexp, clean_style)
|
has_margin = re.search(margin_left_regexp, clean_style)
|
||||||
has_text_indent = re.search(text_indent_regexp, clean_style)
|
has_text_indent = re.search(text_indent_regexp, clean_style)
|
||||||
# formula_of_indent: indent = abs(margin_left - text_indent)
|
# formula_of_indent: indent = abs(margin - text_indent)
|
||||||
if has_margin_left:
|
if has_margin:
|
||||||
num_ml = abs(int("0" + "".join(
|
num_m = abs(int("0" + "".join(
|
||||||
filter(str.isdigit, str(has_margin_left.group(2))))))
|
filter(str.isdigit, str(has_margin.group(3))))))
|
||||||
|
|
||||||
if has_text_indent:
|
if has_text_indent:
|
||||||
num_ti = abs(int("0" + "".join(
|
num_ti = abs(int("0" + "".join(
|
||||||
filter(str.isdigit, str(has_text_indent.group(2))))))
|
filter(str.isdigit, str(has_text_indent.group(2))))))
|
||||||
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||||
str(abs(num_ml - num_ti)) + 'px; ')
|
str(abs(num_m - num_ti)) + 'px; ')
|
||||||
clean_style = clean_style.replace(has_margin_left.group(1), '')
|
clean_style = clean_style.replace(has_margin.group(1), '')
|
||||||
return clean_style
|
return clean_style
|
||||||
|
|
||||||
clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' +
|
clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' +
|
||||||
str(abs(num_ml)) + 'px; ')
|
str(abs(num_m)) + 'px; ')
|
||||||
return clean_style
|
return clean_style
|
||||||
|
|
||||||
elif has_text_indent:
|
elif has_text_indent:
|
||||||
@@ -309,7 +302,7 @@ class TagStyleConverter:
|
|||||||
|
|
||||||
def preprocess_style(self):
|
def preprocess_style(self):
|
||||||
def remove_extra_spaces(style: str) -> dict:
|
def remove_extra_spaces(style: str) -> dict:
|
||||||
""" Function to remove extra spaces in style to process clean_style """
|
"""Function to remove extra spaces in style to process clean_style"""
|
||||||
# replace all spaces between '; & letter' to ';'
|
# replace all spaces between '; & letter' to ';'
|
||||||
style = re.sub(r"; *", ";", style)
|
style = re.sub(r"; *", ";", style)
|
||||||
split_style: List = style.split(';')
|
split_style: List = style.split(';')
|
||||||
@@ -509,7 +502,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
|||||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||||
livecarta_tmp_ids = []
|
livecarta_tmp_ids = []
|
||||||
could_have_style_in_livecarta_regexp = re.compile(
|
could_have_style_in_livecarta_regexp = re.compile(
|
||||||
'(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
||||||
tags_with_possible_style_attr = html_soup.find_all(
|
tags_with_possible_style_attr = html_soup.find_all(
|
||||||
could_have_style_in_livecarta_regexp)
|
could_have_style_in_livecarta_regexp)
|
||||||
for i, x in enumerate(tags_with_possible_style_attr):
|
for i, x in enumerate(tags_with_possible_style_attr):
|
||||||
|
|||||||
@@ -275,7 +275,7 @@ def unwrap_structural_tags(body_tag):
|
|||||||
:return: None
|
:return: None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _preserve_class_in_aside_tag(tag_):
|
def preserve_class_in_aside_tag(tag_):
|
||||||
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
||||||
# this is for Wiley books with boxes
|
# this is for Wiley books with boxes
|
||||||
tag_class = tag_.attrs['class'] if not isinstance(
|
tag_class = tag_.attrs['class'] if not isinstance(
|
||||||
@@ -561,8 +561,8 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
spans = pre.find_all("span")
|
spans = pre.find_all("span")
|
||||||
# if in <pre> there are multiple <span>, we need to add <br> after each content
|
# if in <pre> there are multiple <span>, we need to add <br> after each content
|
||||||
to_add_br = len(spans) > 1
|
to_add_br = len(spans) > 1
|
||||||
|
copy_contents = pre.contents[:]
|
||||||
for child in pre.children:
|
for child in copy_contents:
|
||||||
if isinstance(child, NavigableString):
|
if isinstance(child, NavigableString):
|
||||||
cleaned_text = prepare_formatted(str(child))
|
cleaned_text = prepare_formatted(str(child))
|
||||||
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
|
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
|
||||||
@@ -573,8 +573,8 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
else:
|
else:
|
||||||
for sub_child in child.children:
|
for sub_child in child.children:
|
||||||
if isinstance(sub_child, NavigableString):
|
if isinstance(sub_child, NavigableString):
|
||||||
cleaned_text2 = prepare_formatted(str(sub_child))
|
cleaned_text = prepare_formatted(str(sub_child))
|
||||||
sub_child.replace_with(NavigableString(cleaned_text2))
|
sub_child.replace_with(NavigableString(cleaned_text))
|
||||||
else:
|
else:
|
||||||
sub_child.string = prepare_formatted(sub_child.text)
|
sub_child.string = prepare_formatted(sub_child.text)
|
||||||
cleaned_tag = child.extract()
|
cleaned_tag = child.extract()
|
||||||
@@ -594,11 +594,15 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
|
|
||||||
def preprocess_code_tags(chapter_tag):
|
def preprocess_code_tags(chapter_tag):
|
||||||
"""Function that emulates style of <code>, <kbd>, <var>"""
|
"""Function that emulates style of <code>, <kbd>, <var>"""
|
||||||
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
|
for parent_tag in chapter_tag.find_all(re.compile("pre|p")):
|
||||||
code.name = 'span'
|
for code in parent_tag.find_all(re.compile("code|kbd|var")):
|
||||||
if code.parent.name == "pre":
|
# if code.name == "code":
|
||||||
continue
|
# parent_tag.name = "pre"
|
||||||
code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
|
code.name = "span"
|
||||||
|
if parent_tag.name == "pre":
|
||||||
|
continue
|
||||||
|
# if tags aren't in pre
|
||||||
|
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
||||||
|
|
||||||
|
|
||||||
def prepare_title(title_of_chapter: str) -> str:
|
def prepare_title(title_of_chapter: str) -> str:
|
||||||
@@ -614,11 +618,11 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
"""Function finalise processing/cleaning content
|
"""Function finalise processing/cleaning content
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
title_str : str
|
title_str: str
|
||||||
|
|
||||||
content_tag : BeautifulSoup
|
content_tag: BeautifulSoup
|
||||||
|
|
||||||
remove_title_from_chapter : bool
|
remove_title_from_chapter: bool
|
||||||
|
|
||||||
Steps
|
Steps
|
||||||
----------
|
----------
|
||||||
@@ -629,10 +633,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
str
|
prepared content: str
|
||||||
Prepared content
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# 0. cleaning \n
|
# 0. cleaning \n
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for child in content_tag.contents:
|
for child in content_tag.contents:
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class LiveCartaConfig:
|
|||||||
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
|
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
|
||||||
"Verdana": "verdana,geneva,sans-serif",
|
"Verdana": "verdana,geneva,sans-serif",
|
||||||
"monospace": "courier new,courier,monospace",
|
"monospace": "courier new,courier,monospace",
|
||||||
"sans-serif": "arial,helvetica,sans-serif",
|
"sans-serif": "arial,helvetica,sans-serif"
|
||||||
}
|
}
|
||||||
|
|
||||||
COLORS_MAP = {
|
COLORS_MAP = {
|
||||||
|
|||||||
Reference in New Issue
Block a user