Make preprocessing of inline css

This commit is contained in:
Kiryl
2022-06-06 16:36:14 +03:00
parent 0d1ec03f57
commit 002316f086
5 changed files with 669 additions and 581 deletions

View File

@@ -0,0 +1,238 @@
import re
import cssutils
from ebooklib import epub
from bs4 import BeautifulSoup
from itertools import takewhile
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
def get_text_color(x):
    """Return the hex form of colour *x*, or '' when the colour is black.

    The value is normalised with ``str2hex`` first; black in any of the
    spellings below is replaced with an empty string.
    """
    hex_color = str2hex(x)
    if hex_color in ('#000000', '#000', 'black'):
        return ''
    return hex_color
def get_bg_color(x):
    """Return the hex form of colour *x*, or '' when the colour is white.

    The value is normalised with ``str2hex`` first; white in any of the
    spellings below is replaced with an empty string.
    """
    hex_color = str2hex(x)
    return '' if hex_color in ('#ffffff', '#fff', 'white') else hex_color
def convert_tag_style_values(size_value: str) -> str:
    """
    Function
        - converts values of tags from em/%/pt to px
        - find closest font-size px
    Parameters
    ----------
    size_value: str
        single CSS token, e.g. '120%', '1.5em', '12pt'

    Returns
    -------
    size_value: str
        px value for %/em/pt input; any other input is returned unchanged

    """
    def find_closest_size(style_value: float) -> str:
        # breakpoints strictly below the requested relative size
        possible_sizes = list(
            takewhile(lambda x: style_value > x, LiveCartaConfig.sizes_pr))
        if not possible_sizes:
            # value is at or below the smallest breakpoint: use the first
            # bucket instead of failing on possible_sizes[-1]
            return LiveCartaConfig.sizes_px[0]
        last_possible_size_index = LiveCartaConfig.sizes_pr.index(
            possible_sizes[-1])
        return LiveCartaConfig.sizes_px[last_possible_size_index]

    font_size_regexp = re.compile(
        r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
    has_style_attrs = re.search(font_size_regexp, size_value)
    if not has_style_attrs:
        # not a %/em/pt length - nothing to convert
        return size_value
    if has_style_attrs.group(1):
        # percent is treated as a fraction of 1
        return find_closest_size(float(size_value.replace('%', '')) / 100.0)
    if has_style_attrs.group(3):
        return find_closest_size(float(size_value.replace('em', '')))
    # the pattern matched, so only the pt alternative is left
    return size_value.replace('pt', 'px')
def convert_indents_tag_values(size_value: str) -> str:
    """
    Function converts values of ['text-indent', 'margin-left', 'margin']
    Parameters
    ----------
    size_value: str
        one or several space separated values

    Returns
    -------
    size_value: str
        converted form of the single value picked from the input

    """
    parts = size_value.split(' ')
    # three values -> take the middle one, otherwise take the last one
    picked = parts[-2] if len(parts) == 3 else parts[-1]
    return convert_tag_style_values(picked)
# LIVECARTA_STYLE_ATTRS = { css property: allowed values }
# Style properties that can be used to fit livecarta css style convention.
# An empty list means that any value of the property can be converted.
# A non-empty list means that only the listed property-value combinations
# can be transformed.
_NON_DEFAULT_ALIGN_STYLES = [
    align for align in LiveCartaConfig.ALIGN_STYLES
    if align != LiveCartaConfig.DEFAULT_ALIGN_STYLE
]
_NON_DEFAULT_FONT_NAMES = [
    font for font in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE
    if font != LiveCartaConfig.DEFAULT_FONT_NAME
]

LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': _NON_DEFAULT_ALIGN_STYLES,
    'align': [],
    'font': [],
    'font-family': _NON_DEFAULT_FONT_NAMES,
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border-top': [],
    'border-bottom': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
    'margin-top': [],
    'margin': [],
}
# LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
# Warning: if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING
# should be updated to suit livecarta style convention.


def _keep_value(value):
    """Return the value untouched."""
    return value


def _drop_zero(value):
    """Return '' for a bare '0', any other value untouched."""
    return '' if value == '0' else value


def _to_livecarta_font(value):
    """Look the font up as written, then with a capitalised name."""
    table = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE
    return table.get(value) or table.get(value.capitalize())


def _to_list_style_type(value):
    """Keep list types known to LiveCartaConfig, fall back to 'disc'."""
    return value if value in LiveCartaConfig.list_types else 'disc'


LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_indents_tag_values,
    'font-variant': _keep_value,
    'text-align': _keep_value,
    'font': lambda _: '',
    'font-family': _to_livecarta_font,
    'font-size': convert_tag_style_values,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _drop_zero,
    'border-top-width': _drop_zero,
    'border-right-width': _drop_zero,
    'border-left-width': _drop_zero,
    'border-bottom-width': _drop_zero,
    'border-top': _drop_zero,
    'border-bottom': _drop_zero,
    'list-style-type': _to_list_style_type,
    'list-style-image': lambda _: 'disc',
    'margin-left': convert_indents_tag_values,
    'margin-top': convert_tag_style_values,
    'margin': convert_indents_tag_values
}
def update_inline_styles_to_livecarta_convention(split_style: list):
    """
    Convert every 'property:value' item to the livecarta convention.

    Parameters
    ----------
    split_style: list
        declarations of one inline style, each in the form 'property:value';
        the list is updated in place

    Returns
    -------
    split_style: list
        the same list; declarations that are not supported (unknown property,
        value outside the allowed list, empty value after conversion) are
        replaced with ''

    """
    for i, style in enumerate(split_style):
        # split on the first ':' only - values like url(http://...) contain ':'
        style_name, _, style_value = style.partition(':')
        style_name = style_name.strip()
        value_tokens = style_value.replace('\"', '').split()

        if style_name not in LIVECARTA_STYLE_ATTRS or not value_tokens:
            # property not in LIVECARTA_STYLE_ATTRS (or it has no value),
            # remove it and go on with the remaining declarations
            split_style[i] = ''
            continue

        cleaned_value = value_tokens[-1]
        allowed_values = LIVECARTA_STYLE_ATTRS[style_name]
        if allowed_values and cleaned_value not in allowed_values:
            # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
            split_style[i] = ''
            continue

        convert = LIVECARTA_STYLE_ATTRS_MAPPING.get(style_name)
        if convert is not None:
            # function that converts our data; it may give back None or ''
            style_value = convert(cleaned_value) or ''

        # an empty value means "no declaration", same as in the css file flow
        split_style[i] = style_name + ":" + style_value if style_value else ''
    return split_style
def build_inline_style_content(style: str) -> str:
    """
    Build inline style with livecarta convention

    Parameters
    ----------
    style: str
        raw content of a style="..." attribute

    Returns
    -------
    style: str
        supported declarations joined with '; '

    """
    split_style = []
    for declaration in style.split(';'):
        name, colon, value = declaration.partition(':')
        if not colon:
            # empty piece after the last ';' or a malformed declaration
            # without ':' - there is nothing to convert
            continue
        # drop the spaces around ':' and ';'
        split_style.append(name.strip() + ':' + value.strip())

    split_style = update_inline_styles_to_livecarta_convention(split_style)
    # removed declarations come back as '', keep them out of the result
    return "; ".join(filter(None, split_style))
def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule,
                                              style_type: cssutils.css.property.Property):
    """
    Bring one property of a css rule to the livecarta convention.

    The property is set to '' when it is not listed in LIVECARTA_STYLE_ATTRS or
    when its value is outside the allowed values; otherwise the value is passed
    through the mapping function of the property, if there is one.
    """
    prop_name = style_type.name
    if prop_name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
        return

    # for a comma separated value only the last item is checked
    last_value = style_type.value.replace('\"', '').split(', ')[-1]
    allowed_values = LIVECARTA_STYLE_ATTRS[prop_name]
    if allowed_values and last_value not in allowed_values:
        # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
        return

    converter = LIVECARTA_STYLE_ATTRS_MAPPING.get(prop_name)
    if converter is not None:
        # function that converts our data
        css_rule.style[prop_name] = converter(last_value)
def build_css_file_content(css_content: str) -> str:
    """
    Build css content with livecarta convention

    Parses the css text, converts every property of every style rule and
    returns the serialized sheet as str.
    """
    sheet = cssutils.parseString(css_content, validate=False)
    style_rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in style_rules:
        for declared_property in rule.style:
            update_css_styles_to_livecarta_convention(rule, declared_property)
    return sheet._getCssText().decode()
if __name__ == '__main__':
    # manual run against a local sample book
    epub_path = '../../epub/9781627222174.epub'
    book = epub.read_epub(epub_path)

    raw_css = book.get_item_with_href('css/epub.css').get_content().decode()
    css_cleaned = build_css_file_content(raw_css)

    chapter = book.get_item_with_href('pr01s05.xhtml')
    body_markup = chapter.get_body_content().decode()
    html_soup = BeautifulSoup(body_markup, features='lxml')

View File

@@ -1,557 +0,0 @@
import re
import cssutils
from typing import List
from ebooklib import epub
from logging import CRITICAL
from bs4 import BeautifulSoup
from premailer import transform
from itertools import takewhile
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL)
# Ascending breakpoints for relative sizes; sizes_px holds the px value
# stored at the same index.
sizes_pr = [
    -100, -1,
    0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94,
    1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44,
    1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94,
    2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44,
    2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94,
    3.0, 4.0, 5.0,
]

# '0px', '10px', then every size from 10px to 50px, then the two large sizes
sizes_px = (['0px', '10px']
            + [f'{px}px' for px in range(10, 51)]
            + ['64px', '72px'])

# values accepted for list-style-type
list_types = [
    'circle', 'disc', 'armenian', 'decimal', 'decimal-leading-zero',
    'georgian', 'lower-alpha', 'lower-latin', 'lower-roman',
    'upper-alpha', 'upper-latin', 'upper-roman', 'none',
]
def convert_tag_style_values(value: str) -> str:
    """
    Function
        - converts values of tags from em/%/pt to px
        - find closest font-size px
    Parameters
    ----------
    value: str
        single CSS token, e.g. '120%', '1.5em', '12pt'

    Returns
    -------
    value: str
        px value for %/em/pt input; any other input is returned unchanged

    """
    def find_closest_size(size_value: float) -> str:
        # breakpoints strictly below the requested relative size
        possible_sizes = list(takewhile(lambda x: size_value > x, sizes_pr))
        if not possible_sizes:
            # at or below the smallest breakpoint: possible_sizes[-1] would
            # raise IndexError, so fall back to the first bucket
            return sizes_px[0]
        return sizes_px[sizes_pr.index(possible_sizes[-1])]

    font_size_regexp = re.compile(
        r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
    has_style_attrs = re.search(font_size_regexp, value)
    if not has_style_attrs:
        # not a %/em/pt length - nothing to convert
        return value
    if has_style_attrs.group(1):
        # percent is treated as a fraction of 1
        return find_closest_size(float(value.replace('%', '')) / 100.0)
    if has_style_attrs.group(3):
        return find_closest_size(float(value.replace('em', '')))
    # the pattern matched, so only the pt alternative is left
    return value.replace('pt', 'px')
# LIVECARTA_STYLE_ATTRS = { css property: allowed values }
# Style properties that can be used to fit livecarta css style convention.
# An empty list means that any value of the property can be converted.
# A non-empty list means that only the listed property-value combinations
# can be transformed.
_ALLOWED_TEXT_ALIGN = [
    align for align in LiveCartaConfig.ALIGN_STYLES
    if align != LiveCartaConfig.DEFAULT_ALIGN_STYLE
]
_ALLOWED_FONT_FAMILY = [
    font for font in LiveCartaConfig.font_correspondence_table
    if font != LiveCartaConfig.DEFAULT_FONT_NAME
]

LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': _ALLOWED_TEXT_ALIGN,
    'align': [],
    'font': [],
    'font-family': _ALLOWED_FONT_FAMILY,
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border-top': [],
    'border-bottom': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
    'margin-top': [],
    'margin': [],
}
def get_bg_color(x):
    """Return the ``str2hex`` form of *x*, or '' when that form is white."""
    hex_color = str2hex(x)
    if hex_color in ('#ffffff', '#fff', 'white'):
        return ''
    return hex_color
def get_text_color(x):
    """Return the ``str2hex`` form of *x*, or '' when that form is black."""
    hex_color = str2hex(x)
    return '' if hex_color in ('#000000', '#000', 'black') else hex_color
# LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
# Warning: if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING
# should be updated to suit livecarta style convention.


def _same_value(value):
    """Return the value untouched."""
    return value


def _empty_if_zero(value):
    """Return '' for a bare '0', any other value untouched."""
    return '' if value == '0' else value


def _livecarta_font(value):
    """Look the font up as written, then with a capitalised name."""
    fonts = LiveCartaConfig.font_correspondence_table
    return fonts.get(value) or fonts.get(value.capitalize())


def _known_list_type(value):
    """Keep values present in list_types, fall back to 'disc'."""
    return value if value in list_types else 'disc'


LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_tag_style_values,
    'font-variant': _same_value,
    'text-align': _same_value,
    'font': lambda _: '',
    'font-family': _livecarta_font,
    'font-size': convert_tag_style_values,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _empty_if_zero,
    'border-top-width': _empty_if_zero,
    'border-right-width': _empty_if_zero,
    'border-left-width': _empty_if_zero,
    'border-bottom-width': _empty_if_zero,
    'border-top': _empty_if_zero,
    'border-bottom': _empty_if_zero,
    'list-style-type': _known_list_type,
    'list-style-image': lambda _: 'disc',
    'margin-left': convert_tag_style_values,
    'margin-top': convert_tag_style_values,
    'margin': convert_tag_style_values,
}
# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
# <p style="font-weight:600"> foo </p> -> <p><strong>foo</strong></p>
# The insertion order is kept on purpose: the dictionary is iterated when
# styles are matched.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    **{('font-weight', weight): 'strong'
       for weight in ('bold', '600', '700', '800', '900')},
    ('font-style', 'italic'): 'i',
    **{(decoration_property, decoration): tag_name
       for decoration_property in ('text-decoration', 'text-decoration-line')
       for decoration, tag_name in (('underline', 'u'), ('line-through', 's'))},
    ('vertical-align', 'super'): 'sup',
}
def check_style_to_be_tag(style: str) -> List[tuple]:
    """
    Function searches style properties that can be converted to tags.
    Parameters
    ----------
    style: str
        <tag style="...">

    Returns
    -------
    to_remove: list
        (property, value) pairs of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG whose
        'property:value' text occurs in style, in dictionary order

    """
    return [
        (prop, value)
        for prop, value in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if f'{prop}:{value}' in style
    ]
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
    """
    Bring one property of a css rule to the livecarta convention.

    The property is set to '' when it is not listed in LIVECARTA_STYLE_ATTRS or
    when its value is outside the allowed values; otherwise the value is passed
    through the mapping function of the property, if there is one.
    """
    prop_name = style_type.name
    if prop_name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
        return

    prop_value = style_type.value.replace('\"', '')  # value of style
    allowed_values = LIVECARTA_STYLE_ATTRS[prop_name]
    if allowed_values and prop_value not in allowed_values:
        # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
        return

    converter = LIVECARTA_STYLE_ATTRS_MAPPING.get(prop_name)
    if converter is not None:
        # function that converts our data
        css_rule.style[prop_name] = converter(prop_value)
def build_css_content(css_content):
    """
    Build css content with livecarta convention

    Parses the css text, converts every property of every style rule and
    returns the serialized sheet as str.
    """
    sheet = cssutils.parseString(css_content, validate=False)
    style_rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in style_rules:
        for declared_property in rule.style:
            update_css_style_types_to_livecarta_convention(
                rule, declared_property)
    return sheet._getCssText().decode()
class TagStyleConverter:
    """
    Converts the style of one tag to livecarta markup.

    The final style is computed from tag_with_ultimate_style (inline style +
    style parsed from css file) and is applied to tag_with_inline_style.
    """

    def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
        # tag with inline style to be updated with style attribute
        self.tag_with_inline_style = tag_with_inline_style
        self.tag_initial_name = tag_with_inline_style.name
        # tag with inline style + style parsed from css file
        self.tag_with_ultimate_style = tag_with_ultimate_style
        self.style = self.preprocess_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """Function remove text white color if there is no bg color"""
        if 'background' in style_:
            return style_

        # if text color is white, check that we have bg-color
        if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_):
            # if bg color is inherited, just return style as is
            for parent_tag in tag.parents:
                # white bg color not need to be checked as we do not write 'white bg color'
                tag_with_bg = ['span', 'td', 'tr', 'p']
                tag_will_be_saved = parent_tag.name in tag_with_bg
                has_bg = parent_tag.attrs.get('style') and (
                    'background' in parent_tag.attrs.get('style'))
                if has_bg and tag_will_be_saved:
                    return style_

            children = tag.find_all()
            for child in children:
                if child.attrs.get('style') and ('background' in child.attrs.get('style')):
                    tmp_style = child.attrs['style'] + '; color:#fff; '
                    child.attrs['style'] = tmp_style

            # for child with bg color we added white text color, so this tag doesn't need white color
            style_ = style_.replace('color:#fff;', '')
            style_ = style_.replace('color:#ffffff;', '')
            style_ = style_.replace('color:white;', '')
        return style_

    @staticmethod
    def process_indents_to_px(split_style: dict) -> str:
        """
        Function cleans style using convert_tag_style_values() and returns new clean_style

        Parameters
        ----------
        split_style: dict
            {property: value} of one tag

        Returns
        -------
        clean_style: str
            'property: value; ' pairs where margin/margin-left/text-indent are
            merged into a single text-indent in px

        """
        clean_style = ''
        for name, value in split_style.items():
            if name in ['text-indent', 'margin-left', 'margin']:
                parts = value.split(' ')
                # three values -> middle value, otherwise the last value
                value = convert_tag_style_values(
                    parts[-2] if len(parts) == 3 else parts[-1])
            clean_style += name + ': ' + value + '; '

        margin_left_regexp = re.compile(
            r'((margin-left|margin): *(-*\w+);*)')
        text_indent_regexp = re.compile(
            r'(text-indent: *(-*\w+);*)')

        has_margin = re.search(margin_left_regexp, clean_style)
        has_text_indent = re.search(text_indent_regexp, clean_style)
        # formula_of_indent: indent = abs(margin - text_indent)
        if has_margin:
            num_m = abs(int("0" + "".join(
                filter(str.isdigit, str(has_margin.group(3))))))

            if has_text_indent:
                num_ti = abs(int("0" + "".join(
                    filter(str.isdigit, str(has_text_indent.group(2))))))
                clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
                                                  str(abs(num_m - num_ti)) + 'px; ')
                clean_style = clean_style.replace(has_margin.group(1), '')
                return clean_style

            clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' +
                                              str(abs(num_m)) + 'px; ')
            return clean_style

        elif has_text_indent:
            clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
                                              str(abs(int("0" + "".join(
                                                  filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ')
            return clean_style
        return clean_style

    def preprocess_style(self):
        """
        Build the style string of the tag.

        Returns
        -------
        ultimate_style: str
            converted ultimate style followed by the converted inline
            declarations that are absent from the ultimate style

        """
        def remove_extra_spaces(style: str) -> dict:
            """Function to remove extra spaces in style to process clean_style"""
            # replace all spaces between '; & letter' to ';'
            style = re.sub(r"; *", ";", style)
            parsed_style = {}
            for declaration in style.split(';'):
                # only the first ':' separates property and value, the value
                # itself may contain ':' (e.g. url(http://...))
                key, colon, val = declaration.partition(':')
                if not colon:
                    # '' left by a trailing ';' or a fragment without ':'
                    continue
                # drop spaces between ':' and the value
                parsed_style[key] = val.lstrip()
            return parsed_style

        ultimate_style = (
            self.tag_with_ultimate_style.attrs.get('style') or '') + ';'
        ultimate_style = self.remove_white_if_no_bgcolor(
            ultimate_style, self.tag_with_ultimate_style)
        ultimate_style = ultimate_style.replace(
            'background:', 'background-color:')
        ultimate_style = ultimate_style.replace(
            'list-style-image', 'list-style-type')

        split_ultimate_style: dict = remove_extra_spaces(ultimate_style)
        ultimate_style: str = self.process_indents_to_px(split_ultimate_style)

        inline_style = self.tag_with_inline_style.attrs.get('style')
        if inline_style:
            split_inline_style: dict = remove_extra_spaces(inline_style)
            # repetition check - keep only inline declarations that are not
            # already present in the ultimate style
            inline_only_style = {
                key: val for key, val in split_inline_style.items()
                if key not in split_ultimate_style}
            if inline_only_style:
                # convert them and add to ultimate style
                ultimate_style += self.process_indents_to_px(inline_only_style)
        return ultimate_style

    def change_attrs_with_corresponding_tags(self):
        """
        Replace style declarations that have a tag equivalent with the tags.

        Returns
        -------
        top_tag: Tag
            self.tag_with_inline_style after the update

        """
        # adds <b>, <u>, <sup>, etc
        to_remove = check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            s = f'{attr}:{value};'
            self.style = self.style.replace(s, '')
            self.style = self.style.strip()
            if not i:
                # the first match renames the tag itself
                self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
                    attr, value)]
                new_tags.append(self.tag_with_inline_style)
            else:
                # every next match wraps the previous tag
                name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
                new_tag = BeautifulSoup(features='lxml').new_tag(name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_with_inline_style

        if new_tags:
            # move the attributes to a new outer tag with the initial name
            tmp_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            top_tag2 = BeautifulSoup(features='lxml').new_tag(
                self.tag_initial_name)
            top_tag2.attrs = tmp_attrs
            if self.style:
                top_tag2.attrs['style'] = self.style
            new_tags[-1].wrap(top_tag2)
        else:
            top_tag.attrs['style'] = self.style
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in p -> span"""
        if tag.name == 'p' and tag.attrs.get('style'):
            styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                                   if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
            p_style = ''
            initial_style = tag.attrs['style']
            split_style = initial_style.replace('; ', ';').split(';')
            possible_p_attrs_regexp = re.compile(
                r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
            for item in split_style:
                has_p_style_attrs = re.search(possible_p_attrs_regexp, item)
                if has_p_style_attrs:
                    p_style += item + ';'
                    initial_style = initial_style.replace(item + ';', '')
            # here check that this style is exactly the same.
            # Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
            styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
                '-' + attr not in initial_style) for attr in styles_cant_be_in_p]
            if any(styles_to_be_saved_in_span):
                # if find styles that cannot be in <p> -> wrap them in span
                tag.name = 'span'
                p_tag = BeautifulSoup(features='lxml').new_tag('p')
                p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
                has_p_style_attr = re.search(p_attrs_regexp, initial_style)
                span_style = initial_style if not has_p_style_attr else initial_style.replace(
                    has_p_style_attr.group(1), '')
                p_tag.attrs['style'] = p_style
                tag.attrs['style'] = span_style
                tag.wrap(p_tag)
            else:
                tag.attrs['style'] = p_style

    @staticmethod
    def wrap_span_in_li_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in li -> span"""
        if tag.name == 'li' and tag.attrs.get('style'):
            styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
                                    attr not in ['text-align', 'list-style-type']]

            styles_to_be_saved_in_span = [attr in tag.attrs.get(
                'style') for attr in styles_cant_be_in_li]
            if any(styles_to_be_saved_in_span):
                tag.name = 'span'
                li_tag = BeautifulSoup(features='lxml').new_tag('li')
                span_style = tag.attrs['style']
                li_style = ''
                # declarations that stay on <li> are moved out of the span style
                for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
                                                 re.compile(r'(list-style-type:(\w+);)')]:
                    has_li_style_attrs = re.search(
                        possible_li_attrs_regexp, span_style)
                    if has_li_style_attrs and has_li_style_attrs.group(1):
                        li_style += has_li_style_attrs.group(1)
                        span_style = span_style.replace(
                            has_li_style_attrs.group(1), '')
                li_tag.attrs['style'] = li_style
                tag.attrs['style'] = span_style
                tag.wrap(li_tag)

    @staticmethod
    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in ul/ol -> span"""
        if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
            styles_cant_be_in_ul_ol = [
                attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]

            styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
                                          for attr in styles_cant_be_in_ul_ol]
            if any(styles_to_be_saved_in_span):
                # the wrapper has to be created with the list tag name, so it
                # is read before the tag is renamed to span
                list_tag_name = tag.name
                tag.name = 'span'
                oul_tag = BeautifulSoup(
                    features='lxml').new_tag(list_tag_name)
                span_style = tag.attrs['style']

                possible_uol_attrs_regexp = re.compile(
                    r'(list-style-type:(\w+);)')
                has_uol_style_attrs = re.search(
                    possible_uol_attrs_regexp, span_style)
                if has_uol_style_attrs and has_uol_style_attrs.group(1):
                    # list-style-type stays on the list wrapper
                    oul_style = has_uol_style_attrs.group(1)
                    span_style = span_style.replace(oul_style, '')
                    oul_tag.attrs['style'] = oul_style
                tag.attrs['style'] = span_style
                tag.wrap(oul_tag)

    @staticmethod
    def wrap_span_in_h_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in h -> span"""
        h_regexp = re.compile('(^h[1-9]$)')

        if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
            h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
            tag.name = 'span'
            tag.wrap(h_tag)
            style = tag.attrs['style']
            h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
            has_h_style_attr = re.search(h_attrs_regexp, style)
            tag.attrs['style'] = style if not has_h_style_attr else style.replace(
                has_h_style_attr.group(1), '')

    def convert_initial_tag(self):
        """Apply the tag replacement and every wrapper, return the updated tag"""
        self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_p_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_li_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_h_to_save_style_attrs(self.tag_with_inline_style)
        return self.tag_with_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style

    Tags that may carry a style get a temporary livecarta_id attribute, so the
    same tag can be found both in html_soup and in the copy where the css was
    inlined. html_soup is updated in place and returned.
    """
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')

    livecarta_tmp_ids = []
    for tmp_id, candidate in enumerate(
            html_soup.find_all(could_have_style_in_livecarta_regexp)):
        candidate.attrs['livecarta_id'] = tmp_id
        livecarta_tmp_ids.append(tmp_id)

    # here we add css styles to inline style
    html_with_css_styles: str = transform(
        str(html_soup),
        css_text=css_text,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')

    # go through tags with possible style attrs
    for tmp_id in livecarta_tmp_ids:
        marker = {'livecarta_id': tmp_id}
        tag_with_initial_style = html_soup.find(attrs=marker)
        tag_with_ultimate_style = inline_soup.find(attrs=marker)
        # the temporary id must not stay in the resulting document
        del tag_with_initial_style.attrs['livecarta_id']
        if not tag_with_ultimate_style.attrs.get('style'):
            continue
        TagStyleConverter(
            tag_with_initial_style, tag_with_ultimate_style).convert_initial_tag()
    return html_soup
if __name__ == '__main__':
    # manual run against a local sample book
    epub_path = '../../epub/9781627222174.epub'
    book = epub.read_epub(epub_path)

    raw_css = book.get_item_with_href('css/epub.css').get_content().decode()
    css_cleaned = build_css_content(raw_css)

    chapter = book.get_item_with_href('pr01s05.xhtml')
    html_soup = BeautifulSoup(
        chapter.get_body_content().decode(), features='lxml')

    print(convert_html_soup_with_css_style(html_soup, css_cleaned))

View File

@@ -17,7 +17,8 @@ from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
@@ -68,6 +69,8 @@ class EpubConverter:
BeautifulSoup] = self.build_href2soup_content()
# TODO Presets
self.logger.log('Process CSS inline styles.')
self.process_inline_styles_in_html_soup()
self.logger.log('CSS files processing.')
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.')
@@ -106,7 +109,7 @@ class EpubConverter:
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
# for now just for HTML objects, as it is simplest chapter
# for now just for HTML objects, as it is the simplest chapter
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
@@ -122,6 +125,7 @@ class EpubConverter:
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
# if in css file we import another css
if "@import" in str(css_obj.content):
path_to_css_from_root = "css/" + \
re.search('"(.*)"', str(css_obj.content)).group(1)
@@ -131,12 +135,26 @@ class EpubConverter:
css_content: str = css_obj.get_content().decode()
return css_content
def process_inline_styles_in_html_soup(self):
    """
    This function is designed to convert inline html styles

    Every tag of self.html_href2html_body_soup that may carry a style and has
    a style attribute gets the attribute rebuilt by build_inline_style_content.
    """
    # both patterns are the same for every document, compile them once
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    any_style_value = re.compile('.*')

    for html_content in self.html_href2html_body_soup.values():
        styled_tags = html_content.find_all(
            could_have_style_in_livecarta_regexp,
            attrs={'style': any_style_value})
        for styled_tag in styled_tags:
            styled_tag.attrs['style'] = build_inline_style_content(
                styled_tag.attrs['style'])
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
Function is designed to get 2 dictionaries:
The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them
The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them
) which are used on this html
The second is css_href2css_content. It is created to connect href of css to content of css
...2... = key2value
Returns
----------
@@ -154,26 +172,27 @@ class EpubConverter:
soup_html_content = BeautifulSoup(html_content, features='lxml')
# check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language)
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
continue
css_href = tag.attrs.get('href')
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_content(
css_href2css_content[css_href] = build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all('style')):
css_content = tag.string
html_href2css_href[html_href].append(f'href{i}')
css_href2css_content[f'href{i}'] = build_css_content(
css_href2css_content[f'href{i}'] = build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
- add to html_inline_style css_style_content
"""
for html_href in self.html_href2html_body_soup:
@@ -181,9 +200,9 @@ class EpubConverter:
css = ''
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
content: BeautifulSoup = self.html_href2html_body_soup[html_href]
content = convert_html_soup_with_css_style(content, css)
self.html_href2html_body_soup[html_href] = content
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
"""
@@ -191,7 +210,7 @@ class EpubConverter:
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
key = -1 if root(top chapters),
value = None if leaf(least chapters)
value = None if leaf(the least chapters)
Parameters
----------
element: [Link, tuple, list]
@@ -299,8 +318,7 @@ class EpubConverter:
# go to line structure
for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(
soup)
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
@staticmethod
def create_unique_id(href, id_):
@@ -314,7 +332,7 @@ class EpubConverter:
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
"""
Function used to find full path to file that is parsed from tag link
TOC: a/b/c.xhtml
@@ -327,7 +345,7 @@ class EpubConverter:
href_in_link: str
filename got from tag link, like file1.xhtml
internal_link_tag: Tag
tag object that is parsed now
object that is parsed now
Returns
-------
@@ -362,6 +380,10 @@ class EpubConverter:
1. rebuild ids to be unique in all documents
2a. process anchor which is a whole xhtml file
2b. process anchor which is an element in xhtml file
Returns
-------
None
process links in html
"""
# 1. rebuild ids to be unique in all documents
@@ -393,14 +415,14 @@ class EpubConverter:
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
# insert a new span to the begin of the file
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href']
# 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
@@ -459,7 +481,7 @@ class EpubConverter:
id wraps chapter's content + subchapters' content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
and id of the next chapter/subchapter
Parameters
----------
@@ -504,7 +526,8 @@ class EpubConverter:
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file_path.stem if hasattr(self.file_path, 'stem') else 'book_id')
book_id=self.file_path.stem
if hasattr(self.file_path, 'stem') else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)

View File

@@ -0,0 +1,340 @@
import re
import cssutils
from typing import List
from logging import CRITICAL
from bs4 import BeautifulSoup
from premailer import transform
from src.livecarta_config import LiveCartaConfig
from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS
cssutils.log.setLevel(CRITICAL)
class TagStyleConverter:
    """
    Converts the inline style of a single tag into markup LiveCarta supports.

    The tag is expected to already carry the final inline style
    (initial inline style + style inlined from css files). The converter
      - normalises that style (white text, duplicates, indents),
      - turns properties such as ``font-weight:bold`` into tags (<strong>, ...),
      - moves style properties a block tag cannot hold into a nested <span>.
    """

    # a standalone white ``color`` declaration; the look-behind keeps
    # ``border-color`` / ``background-color`` from being treated as text color
    _WHITE_COLOR_REGEXP = re.compile(
        r'(?<![-\w])color:\s*(?:#ffffff|#fff|white)\s*(?:;|$)', re.IGNORECASE)
    # first unsigned number (integer or decimal) inside a css value
    _NUMBER_REGEXP = re.compile(r'\d+(?:\.\d+)?|\.\d+')
    _HEADER_NAME_REGEXP = re.compile('(^h[1-9]$)')

    def __init__(self, tag_inline_style):
        # tag with inline style + style parsed from css file
        self.tag_inline_style = tag_inline_style
        self.style = self.process_inline_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """
        Function removes white text color if there is no bg color

        Parameters
        ----------
        style_: str
            inline style of the tag
        tag: Tag
            tag the style belongs to (only inspected when the style has white text color)

        Returns
        -------
        style_: str
            style with 'background:' renamed to 'background-color:' or
            with the white text color removed

        """
        if 'background' in style_:
            return style_.replace('background:', 'background-color:')

        # nothing to do unless the text color itself is white
        if not TagStyleConverter._WHITE_COLOR_REGEXP.search(style_):
            return style_

        # if bg color is inherited from a tag that will be saved, return style as is;
        # white bg color does not need to be checked as we never write 'white bg color'
        tags_with_bg = ['span', 'td', 'tr', 'p']
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get('style')
            if parent_style and 'background' in parent_style \
                    and parent_tag.name in tags_with_bg:
                return style_

        # children with their own bg color keep the white text color explicitly
        for child in tag.find_all():
            child_style = child.attrs.get('style')
            if child_style and 'background' in child_style:
                child.attrs['style'] = child_style + '; color:#fff; '

        # white text without any background would be invisible -> drop it
        return TagStyleConverter._WHITE_COLOR_REGEXP.sub('', style_)

    @staticmethod
    def duplicate_styles_check(split_style: list) -> list:
        """
        Function removes duplicated properties from the list of declarations

        The last declaration of a property wins (as in css), the property keeps
        the position of its first occurrence. Items without ':' are skipped.

        Parameters
        ----------
        split_style: list
            list of 'property:value' strings

        Returns
        -------
        split_style: list
            list of unique 'property:value' strings

        """
        style_name2style_value = {}
        for declaration in split_style:
            # split on the first ':' only - values may contain ':' (e.g. url(http://...))
            name, separator, value = declaration.partition(':')
            name = name.strip()
            if not separator or not name:
                continue
            style_name2style_value[name] = value.strip()
        return [f'{name}:{value}' for name, value in style_name2style_value.items()]

    @staticmethod
    def _indent_size(css_value) -> int:
        """Return the whole-number magnitude of a css size ('-1.5em' -> 1, 'auto' -> 0)."""
        found = TagStyleConverter._NUMBER_REGEXP.search(str(css_value))
        return int(float(found.group())) if found else 0

    @staticmethod
    def indents_processing(split_style: list) -> str:
        """
        Function process indents from left using
        formula_of_indent: indent = abs(margin - text_indent)

        Parameters
        ----------
        split_style: list
            list of styles split by ';'

        Returns
        ----------
        processed_style:str
            processed style with counted indent

        """
        processed_style = ";".join(split_style)

        # '.' and '%' belong to the value so that '1.5em' / '10%' are consumed whole
        margin_left_regexp = re.compile(
            r'((margin-left|margin): *(-*[\w.%]+);*)')
        text_indent_regexp = re.compile(
            r'(text-indent: *(-*[\w.%]+);*)')

        has_margin = margin_left_regexp.search(processed_style)
        has_text_indent = text_indent_regexp.search(processed_style)
        size_of = TagStyleConverter._indent_size

        if has_margin:
            num_m = size_of(has_margin.group(3))
            if has_text_indent:
                num_ti = size_of(has_text_indent.group(2))
                processed_style = processed_style.replace(
                    has_text_indent.group(1),
                    'text-indent: ' + str(abs(num_m - num_ti)) + 'px; ')
                # margin is folded into text-indent, so it is dropped
                return processed_style.replace(has_margin.group(1), '')
            return processed_style.replace(
                has_margin.group(1), 'text-indent: ' + str(num_m) + 'px; ')

        if has_text_indent:
            return processed_style.replace(
                has_text_indent.group(1),
                'text-indent: ' + str(size_of(has_text_indent.group(2))) + 'px; ')
        return processed_style

    def process_inline_style(self):
        """
        Function processes final(css+initial inline) inline style
        Steps
        ----------
        1. Remove white color if tag doesn't have background color in style
        2. Create list of styles from inline style
        3. Duplicate styles check - if the tag had duplicate styles
        4. Processing indents

        Returns
        -------
        inline_style: str
            processed inline style

        """
        # a missing style attribute is treated as an empty style
        inline_style = (self.tag_inline_style.attrs.get('style') or '') + ';'
        # 1. Remove white color if tag doesn't have background color in style
        inline_style = self.remove_white_if_no_bgcolor(
            inline_style, self.tag_inline_style)
        inline_style = inline_style.replace(
            'list-style-image', 'list-style-type')
        # 2. Create list of styles from inline style
        # replace all spaces between '; & letter' to ';'
        style = re.sub(r"; *", ";", inline_style)
        # splitting by ';' produces empty strings (e.g. after the last ';') - drop them
        split_inline_style: list = list(filter(None, style.split(';')))
        # 3. Duplicate styles check - if the tag had duplicate styles
        split_inline_style = self.duplicate_styles_check(split_inline_style)
        # 4. Processing indents
        inline_style: str = self.indents_processing(split_inline_style)
        return inline_style

    @staticmethod
    def check_style_to_be_tag(style: str) -> List[tuple]:
        """
        Function searches style properties that can be converted to tag.
        It searches for them and prepare list of properties to be removed from style string

        Parameters
        ----------
        style: str
            <tag style="...">

        Returns
        -------
        to_remove: list
            (property, value) pairs to remove from the style

        """
        return [k for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
                if f'{k[0]}:{k[1]}' in style]

    @staticmethod
    def _extract_declaration(style: str, property_name: str):
        """
        Cut the ``property_name:value`` declaration out of ``style``.

        Returns
        -------
        (declaration, rest): tuple
            declaration - 'property:value;' or '' when the property is absent,
            rest - style without that declaration

        """
        # value may contain '-' (e.g. lower-alpha); the trailing ';' is optional
        # because the last declaration of a processed style has none
        regexp = re.compile(
            r'(?<![\w-])' + re.escape(property_name) + r':\s*[\w-]+\s*;?')
        found = regexp.search(style)
        if not found:
            return '', style
        declaration = found.group().strip().rstrip(';').rstrip() + ';'
        return declaration, style.replace(found.group(), '')

    def change_attrs_with_corresponding_tags(self, tag_initial_name: str):
        """
        Function adds <strong>, <u>, <sup>, ... instead of style properties

        The first convertible property renames the tag itself, every next one
        wraps the result in one more tag; finally everything is wrapped in a new
        tag named ``tag_initial_name`` that receives the original attributes
        and the remaining style.

        Parameters
        ----------
        tag_initial_name: str
            name the tag had before conversion

        Returns
        -------
        top_tag: Tag
            the (possibly renamed) initial tag

        """
        style_to_tag = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        to_remove = self.check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            # ';' is optional: the last declaration of the processed style has none;
            # the look-ahead keeps longer values ('bolder') intact
            declaration_regexp = re.compile(
                re.escape(f'{attr}:{value}') + r'(?![\w-])\s*;?')
            self.style = declaration_regexp.sub('', self.style).strip()
            if not i:
                self.tag_inline_style.name = style_to_tag[(attr, value)]
                new_tags.append(self.tag_inline_style)
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(
                    style_to_tag[(attr, value)])
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_inline_style
        if new_tags:
            tmp_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            top_tag2 = BeautifulSoup(features='lxml').new_tag(tag_initial_name)
            top_tag2.attrs = tmp_attrs
            if self.style:
                top_tag2.attrs['style'] = self.style
            else:
                # everything was converted to tags - do not keep the stale style
                top_tag2.attrs.pop('style', None)
            new_tags[-1].wrap(top_tag2)
        else:
            top_tag.attrs['style'] = self.style
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in p -> span"""
        if not (tag.name == 'p' and tag.attrs.get('style')):
            return
        styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                               if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
        possible_p_attrs_regexp = re.compile(
            r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')

        p_style = ''
        initial_style = tag.attrs['style']
        for item in initial_style.replace('; ', ';').split(';'):
            if possible_p_attrs_regexp.search(item):
                p_style += item + ';'
                initial_style = initial_style.replace(item + ';', '')

        # here check that this style is exactly the same.
        # Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
        styles_to_be_saved_in_span = [
            (attr + ':') in initial_style and ('-' + attr) not in initial_style
            for attr in styles_cant_be_in_p]
        if any(styles_to_be_saved_in_span):
            # if we find styles that cannot be in <p> -> wrap them in span
            tag.name = 'span'
            p_tag = BeautifulSoup(features='lxml').new_tag('p')
            _, span_style = TagStyleConverter._extract_declaration(
                initial_style, 'list-style-type')
            p_tag.attrs['style'] = p_style
            tag.attrs['style'] = span_style
            tag.wrap(p_tag)
        else:
            tag.attrs['style'] = p_style

    @staticmethod
    def wrap_span_in_li_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in li -> span"""
        if not (tag.name == 'li' and tag.attrs.get('style')):
            return
        styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS
                                if attr not in ['text-align', 'list-style-type']]
        if not any(attr in tag.attrs.get('style') for attr in styles_cant_be_in_li):
            return

        tag.name = 'span'
        li_tag = BeautifulSoup(features='lxml').new_tag('li')
        span_style = tag.attrs['style']
        li_style = ''
        # properties <li> may hold are moved from the span to the wrapper
        for li_property in ('text-align', 'list-style-type'):
            declaration, span_style = TagStyleConverter._extract_declaration(
                span_style, li_property)
            li_style += declaration
        li_tag.attrs['style'] = li_style
        tag.attrs['style'] = span_style
        tag.wrap(li_tag)

    @staticmethod
    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in ul/ol -> span"""
        if not (tag.name in ['ul', 'ol'] and tag.attrs.get('style')):
            return
        styles_cant_be_in_ul_ol = [
            attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
        if not any(attr in tag.attrs.get('style') for attr in styles_cant_be_in_ul_ol):
            return

        # the wrapper must be created before the rename, otherwise it becomes a <span>
        oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
        tag.name = 'span'
        # oul_style is '' when the list has no list-style-type
        oul_style, span_style = TagStyleConverter._extract_declaration(
            tag.attrs['style'], 'list-style-type')
        oul_tag.attrs['style'] = oul_style
        tag.attrs['style'] = span_style
        tag.wrap(oul_tag)

    @staticmethod
    def wrap_span_in_h_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in h -> span"""
        if not (TagStyleConverter._HEADER_NAME_REGEXP.search(tag.name)
                and tag.attrs.get('style')):
            return
        h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
        tag.name = 'span'
        tag.wrap(h_tag)
        # list-style-type makes no sense inside a header -> drop it
        _, tag.attrs['style'] = TagStyleConverter._extract_declaration(
            tag.attrs['style'], 'list-style-type')

    def convert_initial_tag(self):
        """
        Function runs the whole conversion for the tag

        Returns
        -------
        tag_inline_style: Tag
            converted tag

        """
        self.tag_inline_style = self.change_attrs_with_corresponding_tags(
            self.tag_inline_style.name)
        self.wrap_span_in_p_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_li_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_h_to_save_style_attrs(self.tag_inline_style)
        return self.tag_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style

    Parameters
    ----------
    html_soup: BeautifulSoup
        html content the css has to be applied to
    css_text: str
        concatenated css content of the document

    Returns
    -------
    inline_soup: BeautifulSoup
        new soup where css rules are inlined and every supported tag
        with a style attribute went through TagStyleConverter

    """
    # the epub namespace specification causes problems, so it is cut out first
    cleaned_css = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')

    # css rules -> inline style attributes
    inlined_html: str = transform(
        str(html_soup),
        css_text=cleaned_css,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    inline_soup = BeautifulSoup(inlined_html, features='lxml')

    # tags that could have style in LiveCarta, restricted to ones with a style attribute
    styleable_name_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    any_style_regexp = re.compile('.*')
    styled_tags = inline_soup.find_all(
        styleable_name_regexp, attrs={'style': any_style_regexp})

    # each tag carries inline style + style parsed from css file
    for styled_tag in styled_tags:
        TagStyleConverter(styled_tag).convert_initial_tag()
    return inline_soup

View File

@@ -1,17 +1,26 @@
class LiveCartaConfig:
"""Class of values that LiveCarta platform using and supports"""
# tag with inline style to be updated with style attribute
SUPPORTED_LEVELS = 5
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
HEADERS_LEVELS = {"h1", "h2", "h3",
"h4", "h5", "h6", "h7", "h8", "h9"}
DEFAULT_ALIGN_STYLE = 'left'
ALIGN_STYLES = ['justify', 'right', 'center', 'left']
# Main constant values
DEFAULT_FONT_NAME = 'Times New Roman'
DEFAULT_ALIGN_STYLE = 'left'
ALIGN_STYLES = ['justify', 'right', 'center', 'left']
WORD_DEFAULT_FONT_SIZE = 11
LIVECARTA_DEFAULT_FONT_SIZE = 18
FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE / WORD_DEFAULT_FONT_SIZE
font_correspondence_table = {
FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\
WORD_DEFAULT_FONT_SIZE
FONT_CORRESPONDANCE_TABLE = {
"Arial": "arial,helvetica,sans-serif",
"Comic Sans MS": "comic sans ms,cursive",
"Courier New": "courier new,courier,monospace",
@@ -61,4 +70,39 @@ class LiveCartaConfig:
'gray': 'darkGray',
'grey': 'darkGray',
}
# NOTE(review): usage is not visible in this chunk - presumably the default
# indent width; confirm against callers
INDENT = '30px'
# sizes_pr and sizes_px are parallel lists (45 items each): sizes_px[i] is the
# px size written for the relative size threshold sizes_pr[i].
# find_closest_size (inside convert_tag_style_values) takes the thresholds that
# are strictly below the given em / percent-as-fraction value and returns the
# px size stored at the index of the last of them, so sizes_pr must stay ascending.
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px',
'17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px',
'26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px',
'44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
# NOTE(review): presumably the list-style-type values that are kept as is;
# the consumer is not visible in this chunk - confirm
list_types = ['circle', 'disc', 'armenian', 'decimal',
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
"""
# keys are (css property, css value) pairs, values are the tag names that replace
# them; read by TagStyleConverter.check_style_to_be_tag (looks for
# 'property:value' in the inline style) and by
# TagStyleConverter.change_attrs_with_corresponding_tags (renames / wraps the tag)
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong',
('font-weight', '600'): 'strong',
('font-weight', '700'): 'strong',
('font-weight', '800'): 'strong',
('font-weight', '900'): 'strong',
('font-style', 'italic'): 'i',
('text-decoration', 'underline'): 'u',
('text-decoration', 'line-through'): 's',
('text-decoration-line', 'underline'): 'u',
('text-decoration-line', 'line-through'): 's',
('vertical-align', 'super'): 'sup'
}