This commit is contained in:
Kiryl
2022-04-28 16:26:49 +03:00
parent 46064bf247
commit c10190662b
3 changed files with 82 additions and 86 deletions

View File

@@ -14,11 +14,11 @@ from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL) cssutils.log.setLevel(CRITICAL)
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px', sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
'48px', '49px', '50px', '64px', '72px'] '48px', '49px', '50px', '64px', '72px']
@@ -28,61 +28,42 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_font_size(value): def convert_tag_values(value):
""" Function converts font-size in mapping """ """Function 1. converts values of tags from em/%/pt to px
if 'pt' in value: 2. find closest font-size px
if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: Parameters
return '' ----------
else: value: str
return value.replace('pt', 'px')
if value == '100%': Returns
return '' -------
try: converted value: str
if '%' in value: """
value = float(value.replace('%', ''))
value = value / 100.0
elif 'em' in value:
value = float(value.replace('em', ''))
else:
return ''
if value > 5:
return ''
def find_closest_size(value):
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr)) possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
last_possible_size_index = sizes_pr.index(possible_sizes[-1]) last_possible_size_index = sizes_pr.index(possible_sizes[-1])
return sizes_px[last_possible_size_index] return sizes_px[last_possible_size_index]
except ValueError: font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
return '' has_style_attrs = re.search(font_size_regexp, value)
def convert_indents(value):
""" Function converts text-indent and margin-left values to px """
# 30px = 3.2% = 1.25em = 23pt
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)')
has_style_attrs = re.search(text_indent_regexp, value)
if has_style_attrs: if has_style_attrs:
if has_style_attrs.group(1): if has_style_attrs.group(1):
value = value.replace(has_style_attrs.group(1), value = float(value.replace('%', '')) / 100.0
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) + return find_closest_size(value)
'px') elif has_style_attrs.group(3):
value = float(value.replace('em', ''))
elif has_style_attrs.group(2): return find_closest_size(value)
value = value.replace(has_style_attrs.group(2), elif has_style_attrs.group(5):
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) + return value.replace('pt', 'px')
'px') else:
return ''
elif has_style_attrs.group(4):
value = value.replace(has_style_attrs.group(4),
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px')
return value return value
"""
LIVECARTA_STYLE_ATTRS = { css property: value }
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit livecarta css style convention. Style properties that can be used to fit livecarta css style convention.
If property has empty list, it means that any value can be converted. If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed. If property has not empty list, it means that only certain property-value combinations can be transformed.
@@ -115,7 +96,8 @@ LIVECARTA_STYLE_ATTRS = {
'list-style-type': [], 'list-style-type': [],
'list-style-image': [], 'list-style-image': [],
'margin-left': [], 'margin-left': [],
'margin-top': [] 'margin-top': [],
'margin': [],
} }
@@ -132,18 +114,18 @@ def get_text_color(x):
""" """
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention. to suit livecarta style convention.
""" """
LIVECARTA_STYLE_ATTRS_MAPPING = { LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents, 'text-indent': convert_tag_values,
'font-variant': lambda x: x, 'font-variant': lambda x: x,
'text-align': lambda x: x, 'text-align': lambda x: x,
'font': lambda x: '', 'font': lambda x: '',
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()), 'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
'font-size': convert_font_size, 'font-size': convert_tag_values,
'color': get_text_color, 'color': get_text_color,
'background-color': get_bg_color, 'background-color': get_bg_color,
'background': get_bg_color, 'background': get_bg_color,
@@ -156,8 +138,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
'border-bottom': lambda x: x if x != '0' else '', 'border-bottom': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-type': lambda x: x if x in list_types else 'disc',
'list-style-image': lambda x: 'disc', 'list-style-image': lambda x: 'disc',
'margin-left': convert_indents, 'margin-left': convert_tag_values,
'margin-top': convert_indents 'margin-top': convert_tag_values,
'margin': convert_tag_values,
} }
""" """
@@ -181,10 +164,17 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
def check_style_to_be_tag(style) -> List[tuple]: def check_style_to_be_tag(style) -> List[tuple]:
"""Function search style properties that can be converted to tags.
It searches for them and prepare list of properties to be removed from style string
Parameters
----------
style: str
<tag style="...">
Returns
-------
properties to remove: list
""" """
Some css style properties converts to tags.
Search for them and prepare list of properties to be removed from style string
"""
to_remove = [] to_remove = []
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style: if f'{k[0]}:{k[1]}' in style:
@@ -267,37 +257,40 @@ class TagStyleConverter:
@staticmethod @staticmethod
def process_indents_to_px(split_style: dict) -> str: def process_indents_to_px(split_style: dict) -> str:
""" Function cleans using convert_indents() style string and returns new clean_style """ """Function cleans style string using convert_tag_values() and returns new clean_style"""
split_style = [k + ":" + v for k, v in split_style.items()] split_style = [k + ":" + v for k, v in split_style.items()]
clean_style = '' clean_style = ''
for item in split_style: for item in split_style:
item = item.split(':') item = item.split(':')
if item[0] in ['text-indent', 'margin-left']: if item[0] in ['text-indent', 'margin-left', 'margin']:
item[1] = convert_indents(item[1]) if len(item[1].split(' ')) == 3:
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
else:
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
clean_style += item[0] + ': ' + item[1] + '; ' clean_style += item[0] + ': ' + item[1] + '; '
margin_left_regexp = re.compile( margin_left_regexp = re.compile(
r'(margin-left: *(-*\w+);*)') r'((margin-left|margin): *(-*\w+);*)')
text_indent_regexp = re.compile( text_indent_regexp = re.compile(
r'(text-indent: *(-*\w+);*)') r'(text-indent: *(-*\w+);*)')
has_margin_left = re.search(margin_left_regexp, clean_style) has_margin = re.search(margin_left_regexp, clean_style)
has_text_indent = re.search(text_indent_regexp, clean_style) has_text_indent = re.search(text_indent_regexp, clean_style)
# formula_of_indent: indent = abs(margin_left - text_indent) # formula_of_indent: indent = abs(margin - text_indent)
if has_margin_left: if has_margin:
num_ml = abs(int("0" + "".join( num_m = abs(int("0" + "".join(
filter(str.isdigit, str(has_margin_left.group(2)))))) filter(str.isdigit, str(has_margin.group(3))))))
if has_text_indent: if has_text_indent:
num_ti = abs(int("0" + "".join( num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2)))))) filter(str.isdigit, str(has_text_indent.group(2))))))
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ') str(abs(num_m - num_ti)) + 'px; ')
clean_style = clean_style.replace(has_margin_left.group(1), '') clean_style = clean_style.replace(has_margin.group(1), '')
return clean_style return clean_style
clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' +
str(abs(num_ml)) + 'px; ') str(abs(num_m)) + 'px; ')
return clean_style return clean_style
elif has_text_indent: elif has_text_indent:
@@ -309,7 +302,7 @@ class TagStyleConverter:
def preprocess_style(self): def preprocess_style(self):
def remove_extra_spaces(style: str) -> dict: def remove_extra_spaces(style: str) -> dict:
""" Function to remove extra spaces in style to process clean_style """ """Function to remove extra spaces in style to process clean_style"""
# replace all spaces between '; & letter' to ';' # replace all spaces between '; & letter' to ';'
style = re.sub(r"; *", ";", style) style = re.sub(r"; *", ";", style)
split_style: List = style.split(';') split_style: List = style.split(';')
@@ -509,7 +502,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
'@namespace epub "http://www.idpf.org/2007/ops";', '') '@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []
could_have_style_in_livecarta_regexp = re.compile( could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
tags_with_possible_style_attr = html_soup.find_all( tags_with_possible_style_attr = html_soup.find_all(
could_have_style_in_livecarta_regexp) could_have_style_in_livecarta_regexp)
for i, x in enumerate(tags_with_possible_style_attr): for i, x in enumerate(tags_with_possible_style_attr):

View File

@@ -275,7 +275,7 @@ def unwrap_structural_tags(body_tag):
:return: None :return: None
""" """
def _preserve_class_in_aside_tag(tag_): def preserve_class_in_aside_tag(tag_):
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" """to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
# this is for Wiley books with boxes # this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance( tag_class = tag_.attrs['class'] if not isinstance(
@@ -561,8 +561,8 @@ def preprocess_pre_tags(chapter_tag):
spans = pre.find_all("span") spans = pre.find_all("span")
# if in <pre> there are multiple <span>, we need to add <br> after each content # if in <pre> there are multiple <span>, we need to add <br> after each content
to_add_br = len(spans) > 1 to_add_br = len(spans) > 1
copy_contents = pre.contents[:]
for child in pre.children: for child in copy_contents:
if isinstance(child, NavigableString): if isinstance(child, NavigableString):
cleaned_text = prepare_formatted(str(child)) cleaned_text = prepare_formatted(str(child))
sub_strings = re.split('\r\n|\n|\r', cleaned_text) sub_strings = re.split('\r\n|\n|\r', cleaned_text)
@@ -573,8 +573,8 @@ def preprocess_pre_tags(chapter_tag):
else: else:
for sub_child in child.children: for sub_child in child.children:
if isinstance(sub_child, NavigableString): if isinstance(sub_child, NavigableString):
cleaned_text2 = prepare_formatted(str(sub_child)) cleaned_text = prepare_formatted(str(sub_child))
sub_child.replace_with(NavigableString(cleaned_text2)) sub_child.replace_with(NavigableString(cleaned_text))
else: else:
sub_child.string = prepare_formatted(sub_child.text) sub_child.string = prepare_formatted(sub_child.text)
cleaned_tag = child.extract() cleaned_tag = child.extract()
@@ -594,11 +594,15 @@ def preprocess_pre_tags(chapter_tag):
def preprocess_code_tags(chapter_tag): def preprocess_code_tags(chapter_tag):
"""Function that emulates style of <code>, <kdb>, <var>""" """Function that emulates style of <code>, <kdb>, <var>"""
for code in chapter_tag.find_all(re.compile("code|kdb|var")): for parent_tag in chapter_tag.find_all(re.compile("pre|p")):
code.name = 'span' for code in parent_tag.find_all(re.compile("code|kbd|var")):
if code.parent.name == "pre": # if code.name == "code":
# parent_tag.name = "pre"
code.name = "span"
if parent_tag.name == "pre":
continue continue
code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;' # if tags aren't in pre
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title(title_of_chapter: str) -> str: def prepare_title(title_of_chapter: str) -> str:
@@ -614,11 +618,11 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
"""Function finalise processing/cleaning content """Function finalise processing/cleaning content
Parameters Parameters
---------- ----------
title_str : str title_str: str
content_tag : BeautifulSoup content_tag: BeautifulSoup
remove_title_from_chapter : bool remove_title_from_chapter: bool
Steps Steps
---------- ----------
@@ -629,10 +633,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
Returns Returns
------- -------
str prepared content: str
Prepared content
""" """
# 0. cleaning \n # 0. cleaning \n
to_remove = [] to_remove = []
for child in content_tag.contents: for child in content_tag.contents:

View File

@@ -22,7 +22,7 @@ class LiveCartaConfig:
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif", "Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
"Verdana": "verdana,geneva,sans-serif", "Verdana": "verdana,geneva,sans-serif",
"monospace": "courier new,courier,monospace", "monospace": "courier new,courier,monospace",
"sans-serif": "arial,helvetica,sans-serif", "sans-serif": "arial,helvetica,sans-serif"
} }
COLORS_MAP = { COLORS_MAP = {