This commit is contained in:
Kiryl
2022-04-28 16:26:49 +03:00
parent 46064bf247
commit c10190662b
3 changed files with 82 additions and 86 deletions

View File

@@ -14,11 +14,11 @@ from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL)
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
'48px', '49px', '50px', '64px', '72px']
@@ -28,61 +28,42 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_font_size(value):
""" Function converts font-size in mapping """
if 'pt' in value:
if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
return ''
else:
return value.replace('pt', 'px')
def convert_tag_values(value):
"""Function 1. converts values of tags from em/%/pt to px
2. finds the closest font-size in px
Parameters
----------
value: str
if value == '100%':
return ''
try:
if '%' in value:
value = float(value.replace('%', ''))
value = value / 100.0
elif 'em' in value:
value = float(value.replace('em', ''))
else:
return ''
if value > 5:
return ''
Returns
-------
converted value: str
"""
def find_closest_size(value):
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
return sizes_px[last_possible_size_index]
except ValueError:
return ''
def convert_indents(value):
""" Function converts text-indent and margin-left values to px """
# 30px = 3.2% = 1.25em = 23pt
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)')
has_style_attrs = re.search(text_indent_regexp, value)
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
has_style_attrs = re.search(font_size_regexp, value)
if has_style_attrs:
if has_style_attrs.group(1):
value = value.replace(has_style_attrs.group(1),
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
'px')
elif has_style_attrs.group(2):
value = value.replace(has_style_attrs.group(2),
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
'px')
elif has_style_attrs.group(4):
value = value.replace(has_style_attrs.group(4),
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px')
value = float(value.replace('%', '')) / 100.0
return find_closest_size(value)
elif has_style_attrs.group(3):
value = float(value.replace('em', ''))
return find_closest_size(value)
elif has_style_attrs.group(5):
return value.replace('pt', 'px')
else:
return ''
return value
"""
LIVECARTA_STYLE_ATTRS = { css property: value }
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit livecarta css style convention.
If a property has an empty list, it means that any value can be converted.
If a property has a non-empty list, it means that only certain property-value combinations can be transformed.
@@ -115,7 +96,8 @@ LIVECARTA_STYLE_ATTRS = {
'list-style-type': [],
'list-style-image': [],
'margin-left': [],
'margin-top': []
'margin-top': [],
'margin': [],
}
@@ -132,18 +114,18 @@ def get_text_color(x):
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention.
"""
LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents,
'text-indent': convert_tag_values,
'font-variant': lambda x: x,
'text-align': lambda x: x,
'font': lambda x: '',
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
'font-size': convert_font_size,
'font-size': convert_tag_values,
'color': get_text_color,
'background-color': get_bg_color,
'background': get_bg_color,
@@ -156,8 +138,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
'border-bottom': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in list_types else 'disc',
'list-style-image': lambda x: 'disc',
'margin-left': convert_indents,
'margin-top': convert_indents
'margin-left': convert_tag_values,
'margin-top': convert_tag_values,
'margin': convert_tag_values,
}
"""
@@ -181,10 +164,17 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
def check_style_to_be_tag(style) -> List[tuple]:
"""
Some css style properties converts to tags.
Search for them and prepare list of properties to be removed from style string
"""Function searches for style properties that can be converted to tags.
It finds them and prepares a list of properties to be removed from the style string.
Parameters
----------
style: str
<tag style="...">
Returns
-------
properties to remove: list
"""
to_remove = []
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style:
@@ -267,37 +257,40 @@ class TagStyleConverter:
@staticmethod
def process_indents_to_px(split_style: dict) -> str:
""" Function cleans using convert_indents() style string and returns new clean_style """
"""Function cleans style string using convert_tag_values() and returns new clean_style"""
split_style = [k + ":" + v for k, v in split_style.items()]
clean_style = ''
for item in split_style:
item = item.split(':')
if item[0] in ['text-indent', 'margin-left']:
item[1] = convert_indents(item[1])
if item[0] in ['text-indent', 'margin-left', 'margin']:
if len(item[1].split(' ')) == 3:
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
else:
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
clean_style += item[0] + ': ' + item[1] + '; '
margin_left_regexp = re.compile(
r'(margin-left: *(-*\w+);*)')
r'((margin-left|margin): *(-*\w+);*)')
text_indent_regexp = re.compile(
r'(text-indent: *(-*\w+);*)')
has_margin_left = re.search(margin_left_regexp, clean_style)
has_margin = re.search(margin_left_regexp, clean_style)
has_text_indent = re.search(text_indent_regexp, clean_style)
# formula_of_indent: indent = abs(margin_left - text_indent)
if has_margin_left:
num_ml = abs(int("0" + "".join(
filter(str.isdigit, str(has_margin_left.group(2))))))
# formula_of_indent: indent = abs(margin - text_indent)
if has_margin:
num_m = abs(int("0" + "".join(
filter(str.isdigit, str(has_margin.group(3))))))
if has_text_indent:
num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2))))))
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
clean_style = clean_style.replace(has_margin_left.group(1), '')
str(abs(num_m - num_ti)) + 'px; ')
clean_style = clean_style.replace(has_margin.group(1), '')
return clean_style
clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' +
str(abs(num_ml)) + 'px; ')
clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' +
str(abs(num_m)) + 'px; ')
return clean_style
elif has_text_indent:
@@ -309,7 +302,7 @@ class TagStyleConverter:
def preprocess_style(self):
def remove_extra_spaces(style: str) -> dict:
""" Function to remove extra spaces in style to process clean_style """
"""Function to remove extra spaces in style to process clean_style"""
# replace all spaces between '; & letter' to ';'
style = re.sub(r"; *", ";", style)
split_style: List = style.split(';')
@@ -509,7 +502,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
'@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = []
could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
tags_with_possible_style_attr = html_soup.find_all(
could_have_style_in_livecarta_regexp)
for i, x in enumerate(tags_with_possible_style_attr):

View File

@@ -275,7 +275,7 @@ def unwrap_structural_tags(body_tag):
:return: None
"""
def _preserve_class_in_aside_tag(tag_):
def preserve_class_in_aside_tag(tag_):
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(
@@ -561,8 +561,8 @@ def preprocess_pre_tags(chapter_tag):
spans = pre.find_all("span")
# if in <pre> there are multiple <span>, we need to add <br> after each content
to_add_br = len(spans) > 1
for child in pre.children:
copy_contents = pre.contents[:]
for child in copy_contents:
if isinstance(child, NavigableString):
cleaned_text = prepare_formatted(str(child))
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
@@ -573,8 +573,8 @@ def preprocess_pre_tags(chapter_tag):
else:
for sub_child in child.children:
if isinstance(sub_child, NavigableString):
cleaned_text2 = prepare_formatted(str(sub_child))
sub_child.replace_with(NavigableString(cleaned_text2))
cleaned_text = prepare_formatted(str(sub_child))
sub_child.replace_with(NavigableString(cleaned_text))
else:
sub_child.string = prepare_formatted(sub_child.text)
cleaned_tag = child.extract()
@@ -594,11 +594,15 @@ def preprocess_pre_tags(chapter_tag):
def preprocess_code_tags(chapter_tag):
"""Function that emulates style of <code>, <kbd>, <var>"""
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
code.name = 'span'
if code.parent.name == "pre":
continue
code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
for parent_tag in chapter_tag.find_all(re.compile("pre|p")):
for code in parent_tag.find_all(re.compile("code|kbd|var")):
# if code.name == "code":
# parent_tag.name = "pre"
code.name = "span"
if parent_tag.name == "pre":
continue
# if tags aren't in pre
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title(title_of_chapter: str) -> str:
@@ -614,11 +618,11 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
"""Function finalises processing/cleaning of content
Parameters
----------
title_str : str
title_str: str
content_tag : BeautifulSoup
content_tag: BeautifulSoup
remove_title_from_chapter : bool
remove_title_from_chapter: bool
Steps
----------
@@ -629,10 +633,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
Returns
-------
str
Prepared content
prepared content: str
"""
# 0. cleaning \n
to_remove = []
for child in content_tag.contents:

View File

@@ -22,7 +22,7 @@ class LiveCartaConfig:
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
"Verdana": "verdana,geneva,sans-serif",
"monospace": "courier new,courier,monospace",
"sans-serif": "arial,helvetica,sans-serif",
"sans-serif": "arial,helvetica,sans-serif"
}
COLORS_MAP = {