Rewrite wrap_span_in_p function

This commit is contained in:
Kiryl
2021-11-03 12:23:13 +03:00
parent 479695e185
commit f69c638640

View File

@@ -12,7 +12,6 @@ from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL) cssutils.log.setLevel(CRITICAL)
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
@@ -57,26 +56,28 @@ def convert_font_size(value):
except ValueError: except ValueError:
return '' return ''
def convert_indents(value):
    """Convert ``%``, ``em``/``rem`` and ``pt`` lengths found in *value* to pixels.

    Every length is replaced by ``<n>px`` where ``n`` is the absolute integer
    part of the number multiplied by a fixed factor (6 for ``%``, 30 for
    ``em``/``rem``, 1 for ``pt``).  Text that contains no such length (for
    example ``'30px'`` or ``'auto'``) is returned unchanged.

    The whole number, including any fractional part, is consumed by the match
    and the fraction is dropped, so ``'1.25em'`` gives ``'30px'``.  A value
    without integer digits such as ``'.5em'`` gives ``'0px'`` instead of
    raising ``ValueError``, and ``'1.5%'`` gives ``'6px'`` instead of a
    half-replaced string.

    :param value: CSS value text, e.g. the value of ``text-indent``.
    :return: the value with supported lengths rewritten in ``px``.
    """
    # 30px = 3.2% = 1.25em = 23pt
    px_per_unit = {'%': 6, 'em': 30, 'rem': 30, 'pt': 1}
    # leading minus signs are consumed so the result is always non-negative
    length_regexp = re.compile(r'-*(\d+(?:\.\d+)?|\.\d+)\s*(%|r?em|pt)')

    def to_px(match):
        # keep only the digits before the decimal point ('' -> 0 for '.5')
        whole_part = match.group(1).split('.')[0]
        return str(int(whole_part or 0) * px_per_unit[match.group(2)]) + 'px'

    return length_regexp.sub(to_px, value)
""" """
LIVECARTA_STYLE_ATTRS = { css property: value } LIVECARTA_STYLE_ATTRS = { css property: value }
@@ -107,6 +108,7 @@ LIVECARTA_STYLE_ATTRS = {
'border-right-width': [], 'border-right-width': [],
'border-left-width': [], 'border-left-width': [],
'border-bottom-width': [], 'border-bottom-width': [],
'border-top': [],
'border-bottom': [], 'border-bottom': [],
'list-style-type': [], 'list-style-type': [],
'list-style-image': [], 'list-style-image': [],
@@ -132,6 +134,7 @@ def get_text_color(x):
color = color if color not in ['#000000', '#000', 'black'] else '' color = color if color not in ['#000000', '#000', 'black'] else ''
return color return color
LIVECARTA_STYLE_ATTRS_MAPPING = { LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents, 'text-indent': convert_indents,
'font-variant': lambda x: x, 'font-variant': lambda x: x,
@@ -147,6 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
'border-right-width': lambda x: x if x != '0' else '', 'border-right-width': lambda x: x if x != '0' else '',
'border-left-width': lambda x: x if x != '0' else '', 'border-left-width': lambda x: x if x != '0' else '',
'border-bottom-width': lambda x: x if x != '0' else '', 'border-bottom-width': lambda x: x if x != '0' else '',
'border-top': lambda x: x if x != '0' else '',
'border-bottom': lambda x: x if x != '0' else '', 'border-bottom': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-type': lambda x: x if x in list_types else 'disc',
'list-style-image': lambda x: 'disc', 'list-style-image': lambda x: 'disc',
@@ -182,30 +186,35 @@ def check_style_to_be_tag(style) -> List[tuple]:
to_remove.append(k) to_remove.append(k)
return to_remove return to_remove
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
    """Normalise one CSS property of *css_rule* to the LiveCarta convention.

    The property is blanked (set to ``''``) when its name is not a key of
    ``LIVECARTA_STYLE_ATTRS``, or when that key lists allowed values and the
    property's value (with double quotes stripped) is not among them.
    Otherwise, if ``LIVECARTA_STYLE_ATTRS_MAPPING`` has a converter for the
    name, the converted value is written back to the rule.

    :param css_rule: rule whose ``style`` mapping is updated in place.
    :param style_type: property object exposing ``name`` and ``value``.
    """
    prop_name = style_type.name
    if prop_name not in LIVECARTA_STYLE_ATTRS:
        # unsupported property, remove it from the css file
        css_rule.style[prop_name] = ''
        return

    value = style_type.value.replace('\"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS[prop_name]
    if allowed_values and value not in allowed_values:
        # supported property but unsupported value, remove it from the css file
        css_rule.style[prop_name] = ''
        return

    if prop_name in LIVECARTA_STYLE_ATTRS_MAPPING:
        # function that converts the value to the LiveCarta form
        convert = LIVECARTA_STYLE_ATTRS_MAPPING[prop_name]
        css_rule.style[prop_name] = convert(value)
def build_css_content(css_content):
    """Parse *css_content* and rewrite every style rule to the LiveCarta convention.

    Each property of each ``STYLE_RULE`` in the parsed sheet is passed to
    ``update_css_style_types_to_livecarta_convention``; other rule types are
    left untouched.

    :param css_content: CSS source text.
    :return: the serialised CSS text of the updated sheet, decoded to ``str``.
    """
    stylesheet = cssutils.parseString(css_content, validate=False)
    style_rules = (rule for rule in stylesheet
                   if rule.type == rule.STYLE_RULE)
    for rule in style_rules:
        for declaration in rule.style:
            update_css_style_types_to_livecarta_convention(rule, declaration)
    return stylesheet._getCssText().decode()
@@ -213,9 +222,11 @@ def build_css_content(css_content):
class TagStyleConverter: class TagStyleConverter:
def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
    """Store both views of a tag and precompute its processed style.

    :param tag_with_inline_style: tag with inline style, to be updated
        with the style attribute.
    :param tag_with_ultimate_style: tag with inline style + style parsed
        from the css file.
    """
    # remember the tag name as it was when the converter was created
    self.tag_initial_name = tag_with_inline_style.name
    self.tag_with_ultimate_style = tag_with_ultimate_style
    self.tag_with_inline_style = tag_with_inline_style
    # must come last: preprocess_style() is called on the populated instance
    self.style = self.preprocess_style()
@staticmethod @staticmethod
@@ -230,7 +241,8 @@ class TagStyleConverter:
# white bg color not need to be checked as we do not write 'white bg color' # white bg color not need to be checked as we do not write 'white bg color'
tag_with_bg = ['span', 'td', 'tr', 'p'] tag_with_bg = ['span', 'td', 'tr', 'p']
tag_will_be_saved = parent_tag.name in tag_with_bg tag_will_be_saved = parent_tag.name in tag_with_bg
has_bg = parent_tag.attrs.get('style') and ('background' in parent_tag.attrs.get('style')) has_bg = parent_tag.attrs.get('style') and (
'background' in parent_tag.attrs.get('style'))
if has_bg and tag_will_be_saved: if has_bg and tag_will_be_saved:
return style_ return style_
@@ -256,7 +268,7 @@ class TagStyleConverter:
if item[0] in ['text-indent', 'margin-left']: if item[0] in ['text-indent', 'margin-left']:
item[1] = convert_indents(item[1]) item[1] = convert_indents(item[1])
clean_style += item[0] + ': ' + item[1] + '; ' clean_style += item[0] + ': ' + item[1] + '; '
margin_left_regexp = re.compile( margin_left_regexp = re.compile(
r'(margin-left:( *-*\w+);*)') r'(margin-left:( *-*\w+);*)')
text_indent_regexp = re.compile( text_indent_regexp = re.compile(
@@ -267,63 +279,70 @@ class TagStyleConverter:
#formula_of_indent: indent = abs(margin_left - text_indent) #formula_of_indent: indent = abs(margin_left - text_indent)
if has_margin_left: if has_margin_left:
num_ml = abs(int("".join( num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2)))))) filter(str.isdigit, str(has_margin_left.group(2))))))
if has_text_indent: if has_text_indent:
num_ti = abs(int("".join( num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2)))))) filter(str.isdigit, str(has_text_indent.group(2))))))
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ') str(abs(num_ml - num_ti)) + 'px; ')
clean_style = clean_style.replace(has_margin_left.group(1), '') clean_style = clean_style.replace(has_margin_left.group(1), '')
return clean_style return clean_style
clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' +
str(abs(num_ml)) + 'px; ') str(abs(num_ml)) + 'px; ')
return clean_style return clean_style
elif has_text_indent: elif has_text_indent:
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(int("".join( str(abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ') filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ')
return clean_style return clean_style
return clean_style return clean_style
def preprocess_style(self): def preprocess_style(self):
ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';' ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';'
ultimate_style = self.remove_white_if_no_bgcolor(ultimate_style, self.tag_with_ultimate_style) ultimate_style = self.remove_white_if_no_bgcolor(
ultimate_style = ultimate_style.replace('background:', 'background-color:') ultimate_style, self.tag_with_ultimate_style)
ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type') ultimate_style = ultimate_style.replace(
'background:', 'background-color:')
ultimate_style = ultimate_style.replace(
'list-style-image', 'list-style-type')
split_ultimate_style = ultimate_style.replace('; ',';').split(';') split_ultimate_style = ultimate_style.replace('; ', ';').split(';')
# when we split style by ; and we have at the end ; that's why we have '' in list # when we split style by ; and we have at the end ; that's why we have '' in list
while '' in split_ultimate_style: while '' in split_ultimate_style:
split_ultimate_style.remove('') split_ultimate_style.remove('')
# replace all spaces between ': & letter' to ':' # replace all spaces between ': & letter' to ':'
split_ultimate_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style] split_ultimate_style = [el.replace(
re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style]
if self.tag_with_inline_style.attrs.get('style'): if self.tag_with_inline_style.attrs.get('style'):
inline_style = self.tag_with_inline_style.attrs['style'] inline_style = self.tag_with_inline_style.attrs['style']
split_inline_style = inline_style.replace('; ',';').split(';') split_inline_style = inline_style.replace('; ', ';').split(';')
# when we split style by ; and we have at the end ; that's why we have '' in list # when we split style by ; and we have at the end ; that's why we have '' in list
while '' in split_inline_style: while '' in split_inline_style:
split_inline_style.remove('') split_inline_style.remove('')
# replace all spaces between ': & letter' to ':' # replace all spaces between ': & letter' to ':'
split_inline_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style] split_inline_style = [el.replace(
re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style]
# repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css # repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css
repeat_styles = list(set(split_ultimate_style) & set(split_inline_style)) repeat_styles = list(set(split_ultimate_style)
& set(split_inline_style))
for item in repeat_styles: for item in repeat_styles:
split_inline_style.remove(item) split_inline_style.remove(item)
if split_inline_style: if split_inline_style:
# if inline style is not empty - start convert and add to ultimate style # if inline style is not empty - start convert and add to ultimate style
print('we enter repetition check', '\n') print('we enter repetition check', '\n')
inline_style: str = self.process_indents_to_px(split_inline_style) inline_style: str = self.process_indents_to_px(
split_inline_style)
ultimate_style += inline_style ultimate_style += inline_style
ultimate_style: str = self.process_indents_to_px(split_ultimate_style) ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
@@ -338,7 +357,8 @@ class TagStyleConverter:
self.style = self.style.replace(s, '') self.style = self.style.replace(s, '')
self.style = self.style.strip() self.style = self.style.strip()
if i == 0: if i == 0:
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)]
new_tags.append(self.tag_with_inline_style) new_tags.append(self.tag_with_inline_style)
else: else:
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
@@ -351,7 +371,8 @@ class TagStyleConverter:
if new_tags: if new_tags:
tmp_attrs = top_tag.attrs.copy() tmp_attrs = top_tag.attrs.copy()
top_tag.attrs = {} top_tag.attrs = {}
top_tag2 = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name) top_tag2 = BeautifulSoup(features='lxml').new_tag(
self.tag_initial_name)
top_tag2.attrs = tmp_attrs top_tag2.attrs = tmp_attrs
if self.style: if self.style:
top_tag2.attrs['style'] = self.style top_tag2.attrs['style'] = self.style
@@ -363,39 +384,36 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_p_to_save_style_attrs(tag):
    """Keep on ``<p>`` only the styles it may carry; move the rest to a ``<span>``.

    Does nothing unless *tag* is a ``<p>`` with a non-empty ``style``.
    Declarations for ``text-align``, ``text-indent``, ``border-bottom`` and
    ``border-top`` are collected as the paragraph style.  If any other
    ``LIVECARTA_STYLE_ATTRS`` property is still present in the remaining
    style, *tag* is renamed to ``span``, given the remaining style (minus any
    ``list-style-type:<word>;`` declaration) and wrapped in a new ``<p>``
    that carries the paragraph style.  Otherwise the tag's style is replaced
    by the paragraph style alone.

    :param tag: tag exposing ``name``, ``attrs`` and ``wrap``; modified in place.
    """
    if tag.name != 'p' or not tag.attrs.get('style'):
        return

    span_only_attrs = [
        name for name in LIVECARTA_STYLE_ATTRS
        if name not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]

    remaining_style = tag.attrs['style']
    p_attrs_pattern = re.compile(
        r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
    p_declarations = []
    for declaration in remaining_style.replace('; ', ';').split(';'):
        if p_attrs_pattern.search(declaration):
            p_declarations.append(declaration + ';')
            remaining_style = remaining_style.replace(declaration + ';', '')
    p_style = ''.join(p_declarations)

    # the property must match exactly: not 'align' when we have 'text-align',
    # or 'border' when we have 'border-top'
    needs_span = any(
        (name + ':') in remaining_style and ('-' + name) not in remaining_style
        for name in span_only_attrs)
    if not needs_span:
        tag.attrs['style'] = p_style
        return

    # styles that cannot live on <p> were found -> keep them on a span
    tag.name = 'span'
    p_tag = BeautifulSoup(features='lxml').new_tag('p')
    li_match = re.search(r'(list-style-type:(\w+);)', remaining_style)
    if li_match:
        span_style = remaining_style.replace(li_match.group(1), '')
    else:
        span_style = remaining_style
    p_tag.attrs['style'] = p_style
    tag.attrs['style'] = span_style
    tag.wrap(p_tag)
@staticmethod @staticmethod
def wrap_span_in_li_to_save_style_attrs(tag): def wrap_span_in_li_to_save_style_attrs(tag):
@@ -403,7 +421,8 @@ class TagStyleConverter:
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
attr not in ['text-align', 'list-style-type']] attr not in ['text-align', 'list-style-type']]
styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_li] styles_to_be_saved = [attr in tag.attrs.get(
'style') for attr in styles_cant_be_in_li]
if any(styles_to_be_saved): if any(styles_to_be_saved):
tag.name = 'span' tag.name = 'span'
li_tag = BeautifulSoup(features='lxml').new_tag('li') li_tag = BeautifulSoup(features='lxml').new_tag('li')
@@ -412,10 +431,12 @@ class TagStyleConverter:
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
re.compile(r'(list-style-type:(\w+);)')]: re.compile(r'(list-style-type:(\w+);)')]:
has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style) has_li_style_attrs = re.search(
possible_li_attrs_regexp, span_style)
if has_li_style_attrs and has_li_style_attrs.group(1): if has_li_style_attrs and has_li_style_attrs.group(1):
li_style += has_li_style_attrs.group(1) li_style += has_li_style_attrs.group(1)
span_style = span_style.replace(has_li_style_attrs.group(1), '') span_style = span_style.replace(
has_li_style_attrs.group(1), '')
li_tag.attrs['style'] = li_style li_tag.attrs['style'] = li_style
tag.attrs['style'] = span_style tag.attrs['style'] = span_style
@@ -424,16 +445,20 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag): def wrap_span_in_ul_ol_to_save_style_attrs(tag):
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] styles_cant_be_in_ul_ol = [
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
check = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_ul_ol] check = [attr in tag.attrs.get('style')
for attr in styles_cant_be_in_ul_ol]
if any(check): if any(check):
tag.name = 'span' tag.name = 'span'
li_tag = BeautifulSoup(features='lxml').new_tag('ul') li_tag = BeautifulSoup(features='lxml').new_tag('ul')
span_style = tag.attrs['style'] span_style = tag.attrs['style']
possible_li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') possible_li_attrs_regexp = re.compile(
has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style) r'(list-style-type:(\w+);)')
has_li_style_attrs = re.search(
possible_li_attrs_regexp, span_style)
if has_li_style_attrs and has_li_style_attrs.group(1): if has_li_style_attrs and has_li_style_attrs.group(1):
oul_style = has_li_style_attrs.group(1) oul_style = has_li_style_attrs.group(1)
span_style = span_style.replace(oul_style, '') span_style = span_style.replace(oul_style, '')
@@ -452,7 +477,8 @@ class TagStyleConverter:
style = tag.attrs['style'] style = tag.attrs['style']
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, style) has_li_style_attr = re.search(li_attrs_regexp, style)
tag.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') tag.attrs['style'] = style if not has_li_style_attr else style.replace(
has_li_style_attr.group(1), '')
def convert_initial_tag(self): def convert_initial_tag(self):
self.tag_with_inline_style = self.change_attrs_with_corresponding_tags() self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
@@ -464,10 +490,13 @@ class TagStyleConverter:
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []
could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') could_have_style_in_livecarta_regexp = re.compile(
tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp) '(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
tags_with_possible_style_attr = html_soup.find_all(
could_have_style_in_livecarta_regexp)
for i, x in enumerate(tags_with_possible_style_attr): for i, x in enumerate(tags_with_possible_style_attr):
x.attrs['livecarta_id'] = i x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i) livecarta_tmp_ids.append(i)
@@ -488,7 +517,8 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i}) tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})
del tag_with_initial_style.attrs['livecarta_id'] del tag_with_initial_style.attrs['livecarta_id']
if tag_with_ultimate_style.attrs.get('style'): if tag_with_ultimate_style.attrs.get('style'):
style_converter = TagStyleConverter(tag_with_initial_style, tag_with_ultimate_style) style_converter = TagStyleConverter(
tag_with_initial_style, tag_with_ultimate_style)
style_converter.convert_initial_tag() style_converter.convert_initial_tag()
return html_soup return html_soup
@@ -500,7 +530,8 @@ if __name__ == '__main__':
css_ = ebooklib_book.get_item_with_href('css/epub.css') css_ = ebooklib_book.get_item_with_href('css/epub.css')
css_ = css_.get_content().decode() css_ = css_.get_content().decode()
css_cleaned = build_css_content(css_) css_cleaned = build_css_content(css_)
html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode() html_ = ebooklib_book.get_item_with_href(
'pr01s05.xhtml').get_body_content().decode()
html_soup = BeautifulSoup(html_, features='lxml') html_soup = BeautifulSoup(html_, features='lxml')
print(convert_html_soup_with_css_style(html_soup, css_cleaned)) print(convert_html_soup_with_css_style(html_soup, css_cleaned))