forked from LiveCarta/BookConverter
LAW-5444
This commit is contained in:
@@ -14,11 +14,11 @@ from src.livecarta_config import LiveCartaConfig
|
||||
cssutils.log.setLevel(CRITICAL)
|
||||
|
||||
|
||||
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
|
||||
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
|
||||
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
|
||||
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
|
||||
|
||||
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
|
||||
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
|
||||
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
|
||||
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
|
||||
'48px', '49px', '50px', '64px', '72px']
|
||||
@@ -28,61 +28,42 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
|
||||
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||
|
||||
|
||||
def convert_font_size(value):
|
||||
""" Function converts font-size in mapping """
|
||||
if 'pt' in value:
|
||||
if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
|
||||
return ''
|
||||
else:
|
||||
return value.replace('pt', 'px')
|
||||
def convert_tag_values(value):
|
||||
"""Function 1. converts values of tags from em/%/pt to px
|
||||
2. find closest font-size px
|
||||
Parameters
|
||||
----------
|
||||
value: str
|
||||
|
||||
if value == '100%':
|
||||
return ''
|
||||
try:
|
||||
if '%' in value:
|
||||
value = float(value.replace('%', ''))
|
||||
value = value / 100.0
|
||||
elif 'em' in value:
|
||||
value = float(value.replace('em', ''))
|
||||
else:
|
||||
return ''
|
||||
|
||||
if value > 5:
|
||||
return ''
|
||||
Returns
|
||||
-------
|
||||
converted value: str
|
||||
"""
|
||||
|
||||
def find_closest_size(value):
|
||||
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
||||
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
||||
return sizes_px[last_possible_size_index]
|
||||
|
||||
except ValueError:
|
||||
return ''
|
||||
|
||||
|
||||
def convert_indents(value):
|
||||
""" Function converts text-indent and margin-left values to px """
|
||||
# 30px = 3.2% = 1.25em = 23pt
|
||||
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)')
|
||||
has_style_attrs = re.search(text_indent_regexp, value)
|
||||
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
|
||||
has_style_attrs = re.search(font_size_regexp, value)
|
||||
if has_style_attrs:
|
||||
if has_style_attrs.group(1):
|
||||
value = value.replace(has_style_attrs.group(1),
|
||||
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
|
||||
'px')
|
||||
|
||||
elif has_style_attrs.group(2):
|
||||
value = value.replace(has_style_attrs.group(2),
|
||||
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
|
||||
'px')
|
||||
|
||||
elif has_style_attrs.group(4):
|
||||
value = value.replace(has_style_attrs.group(4),
|
||||
str(abs(int("0" + "".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px')
|
||||
value = float(value.replace('%', '')) / 100.0
|
||||
return find_closest_size(value)
|
||||
elif has_style_attrs.group(3):
|
||||
value = float(value.replace('em', ''))
|
||||
return find_closest_size(value)
|
||||
elif has_style_attrs.group(5):
|
||||
return value.replace('pt', 'px')
|
||||
else:
|
||||
return ''
|
||||
return value
|
||||
|
||||
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||
|
||||
"""
|
||||
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||
Style properties that can be used to fit livecarta css style convention.
|
||||
If a property has an empty list, it means that any value can be converted.
|
||||
If a property has a non-empty list, it means that only certain property-value combinations can be transformed.
|
||||
@@ -115,7 +96,8 @@ LIVECARTA_STYLE_ATTRS = {
|
||||
'list-style-type': [],
|
||||
'list-style-image': [],
|
||||
'margin-left': [],
|
||||
'margin-top': []
|
||||
'margin-top': [],
|
||||
'margin': [],
|
||||
}
|
||||
|
||||
|
||||
@@ -132,18 +114,18 @@ def get_text_color(x):
|
||||
|
||||
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||
|
||||
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
|
||||
to suit livecarta style convention.
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
'text-indent': convert_indents,
|
||||
'text-indent': convert_tag_values,
|
||||
'font-variant': lambda x: x,
|
||||
'text-align': lambda x: x,
|
||||
'font': lambda x: '',
|
||||
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||
'font-size': convert_font_size,
|
||||
'font-size': convert_tag_values,
|
||||
'color': get_text_color,
|
||||
'background-color': get_bg_color,
|
||||
'background': get_bg_color,
|
||||
@@ -156,8 +138,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
'border-bottom': lambda x: x if x != '0' else '',
|
||||
'list-style-type': lambda x: x if x in list_types else 'disc',
|
||||
'list-style-image': lambda x: 'disc',
|
||||
'margin-left': convert_indents,
|
||||
'margin-top': convert_indents
|
||||
'margin-left': convert_tag_values,
|
||||
'margin-top': convert_tag_values,
|
||||
'margin': convert_tag_values,
|
||||
}
|
||||
|
||||
"""
|
||||
@@ -181,10 +164,17 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
||||
|
||||
|
||||
def check_style_to_be_tag(style) -> List[tuple]:
|
||||
"""
|
||||
Some css style properties converts to tags.
|
||||
Search for them and prepare list of properties to be removed from style string
|
||||
"""Function searches for style properties that can be converted to tags.
|
||||
It searches for them and prepares a list of properties to be removed from the style string
|
||||
Parameters
|
||||
----------
|
||||
style: str
|
||||
<tag style="...">
|
||||
Returns
|
||||
-------
|
||||
properties to remove: list
|
||||
"""
|
||||
|
||||
to_remove = []
|
||||
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
||||
if f'{k[0]}:{k[1]}' in style:
|
||||
@@ -267,37 +257,40 @@ class TagStyleConverter:
|
||||
|
||||
@staticmethod
|
||||
def process_indents_to_px(split_style: dict) -> str:
|
||||
""" Function cleans using convert_indents() style string and returns new clean_style """
|
||||
"""Function cleans style string using convert_tag_values() and returns new clean_style"""
|
||||
split_style = [k + ":" + v for k, v in split_style.items()]
|
||||
clean_style = ''
|
||||
for item in split_style:
|
||||
item = item.split(':')
|
||||
if item[0] in ['text-indent', 'margin-left']:
|
||||
item[1] = convert_indents(item[1])
|
||||
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
||||
if len(item[1].split(' ')) == 3:
|
||||
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
|
||||
else:
|
||||
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
|
||||
clean_style += item[0] + ': ' + item[1] + '; '
|
||||
|
||||
margin_left_regexp = re.compile(
|
||||
r'(margin-left: *(-*\w+);*)')
|
||||
r'((margin-left|margin): *(-*\w+);*)')
|
||||
text_indent_regexp = re.compile(
|
||||
r'(text-indent: *(-*\w+);*)')
|
||||
|
||||
has_margin_left = re.search(margin_left_regexp, clean_style)
|
||||
has_margin = re.search(margin_left_regexp, clean_style)
|
||||
has_text_indent = re.search(text_indent_regexp, clean_style)
|
||||
# formula_of_indent: indent = abs(margin_left - text_indent)
|
||||
if has_margin_left:
|
||||
num_ml = abs(int("0" + "".join(
|
||||
filter(str.isdigit, str(has_margin_left.group(2))))))
|
||||
# formula_of_indent: indent = abs(margin - text_indent)
|
||||
if has_margin:
|
||||
num_m = abs(int("0" + "".join(
|
||||
filter(str.isdigit, str(has_margin.group(3))))))
|
||||
|
||||
if has_text_indent:
|
||||
num_ti = abs(int("0" + "".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(2))))))
|
||||
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||
str(abs(num_ml - num_ti)) + 'px; ')
|
||||
clean_style = clean_style.replace(has_margin_left.group(1), '')
|
||||
str(abs(num_m - num_ti)) + 'px; ')
|
||||
clean_style = clean_style.replace(has_margin.group(1), '')
|
||||
return clean_style
|
||||
|
||||
clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' +
|
||||
str(abs(num_ml)) + 'px; ')
|
||||
clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' +
|
||||
str(abs(num_m)) + 'px; ')
|
||||
return clean_style
|
||||
|
||||
elif has_text_indent:
|
||||
@@ -309,7 +302,7 @@ class TagStyleConverter:
|
||||
|
||||
def preprocess_style(self):
|
||||
def remove_extra_spaces(style: str) -> dict:
|
||||
""" Function to remove extra spaces in style to process clean_style """
|
||||
"""Function to remove extra spaces in style to process clean_style"""
|
||||
# collapse '; ' (a semicolon followed by any number of spaces) to ';'
|
||||
style = re.sub(r"; *", ";", style)
|
||||
split_style: List = style.split(';')
|
||||
@@ -509,7 +502,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||
livecarta_tmp_ids = []
|
||||
could_have_style_in_livecarta_regexp = re.compile(
|
||||
'(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
||||
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
||||
tags_with_possible_style_attr = html_soup.find_all(
|
||||
could_have_style_in_livecarta_regexp)
|
||||
for i, x in enumerate(tags_with_possible_style_attr):
|
||||
|
||||
@@ -275,7 +275,7 @@ def unwrap_structural_tags(body_tag):
|
||||
:return: None
|
||||
"""
|
||||
|
||||
def _preserve_class_in_aside_tag(tag_):
|
||||
def preserve_class_in_aside_tag(tag_):
|
||||
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
||||
# this is for Wiley books with boxes
|
||||
tag_class = tag_.attrs['class'] if not isinstance(
|
||||
@@ -561,8 +561,8 @@ def preprocess_pre_tags(chapter_tag):
|
||||
spans = pre.find_all("span")
|
||||
# if in <pre> there are multiple <span>, we need to add <br> after each content
|
||||
to_add_br = len(spans) > 1
|
||||
|
||||
for child in pre.children:
|
||||
copy_contents = pre.contents[:]
|
||||
for child in copy_contents:
|
||||
if isinstance(child, NavigableString):
|
||||
cleaned_text = prepare_formatted(str(child))
|
||||
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
|
||||
@@ -573,8 +573,8 @@ def preprocess_pre_tags(chapter_tag):
|
||||
else:
|
||||
for sub_child in child.children:
|
||||
if isinstance(sub_child, NavigableString):
|
||||
cleaned_text2 = prepare_formatted(str(sub_child))
|
||||
sub_child.replace_with(NavigableString(cleaned_text2))
|
||||
cleaned_text = prepare_formatted(str(sub_child))
|
||||
sub_child.replace_with(NavigableString(cleaned_text))
|
||||
else:
|
||||
sub_child.string = prepare_formatted(sub_child.text)
|
||||
cleaned_tag = child.extract()
|
||||
@@ -594,11 +594,15 @@ def preprocess_pre_tags(chapter_tag):
|
||||
|
||||
def preprocess_code_tags(chapter_tag):
|
||||
"""Function that emulates style of <code>, <kbd>, <var>"""
|
||||
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
|
||||
code.name = 'span'
|
||||
if code.parent.name == "pre":
|
||||
continue
|
||||
code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
|
||||
for parent_tag in chapter_tag.find_all(re.compile("pre|p")):
|
||||
for code in parent_tag.find_all(re.compile("code|kbd|var")):
|
||||
# if code.name == "code":
|
||||
# parent_tag.name = "pre"
|
||||
code.name = "span"
|
||||
if parent_tag.name == "pre":
|
||||
continue
|
||||
# if tags aren't in pre
|
||||
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
||||
|
||||
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
@@ -614,11 +618,11 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
"""Function finalises processing/cleaning of content
|
||||
Parameters
|
||||
----------
|
||||
title_str : str
|
||||
title_str: str
|
||||
|
||||
content_tag : BeautifulSoup
|
||||
content_tag: BeautifulSoup
|
||||
|
||||
remove_title_from_chapter : bool
|
||||
remove_title_from_chapter: bool
|
||||
|
||||
Steps
|
||||
----------
|
||||
@@ -629,10 +633,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
Prepared content
|
||||
|
||||
prepared content: str
|
||||
"""
|
||||
|
||||
# 0. cleaning \n
|
||||
to_remove = []
|
||||
for child in content_tag.contents:
|
||||
|
||||
@@ -22,7 +22,7 @@ class LiveCartaConfig:
|
||||
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
|
||||
"Verdana": "verdana,geneva,sans-serif",
|
||||
"monospace": "courier new,courier,monospace",
|
||||
"sans-serif": "arial,helvetica,sans-serif",
|
||||
"sans-serif": "arial,helvetica,sans-serif"
|
||||
}
|
||||
|
||||
COLORS_MAP = {
|
||||
|
||||
Reference in New Issue
Block a user