epub converter: fix css styles processing

This commit is contained in:
shirshasa
2021-07-05 18:01:27 +03:00
parent 14088ccc3a
commit dbacea7453

View File

@@ -179,21 +179,14 @@ def clean_css(css):
return css_text
def add_inline_style_to_html_soup(soup1, css_text):
# NOTE(review): these statements are repeated verbatim by the later
# add_inline_style_to_html_soup definition that follows TagStyleConverter.
# This copy stops before any loop or return, so it looks like the pre-move
# lines of the diff rather than a live definition -- confirm before relying on it.
# Drop the epub @namespace declaration from the stylesheet text
# (presumably because the inliner below cannot handle it -- TODO confirm).
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = []
# Tag names whose style is processed: p, span, li, ul, ol, td, th and h1..h9.
h_regex = f'(^h[1-9]$)'
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
# Stamp every candidate tag with a sequential temporary id and remember the ids,
# so the same tag can be located again after the markup is re-parsed.
for i, x in enumerate(elements_with_possible_style_attr):
x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i)
# Inline the CSS rules into style attributes; class attributes are kept.
html_with_inline_style = transform(str(soup1), css_text=css_text,
remove_classes=False,
external_styles=False,
disable_validation=True)
# Second soup: same markup, but with the inlined style attributes.
soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
class TagStyleConverter:
def __init__(self, tag, tag_with_style):
    """Remember the tag to rewrite and its styled counterpart.

    Args:
        tag: the tag that will be renamed/wrapped in place.
        tag_with_style: the matching tag whose ``style`` attribute is read.
    """
    self.tag_with_style = tag_with_style
    self.tag = tag
    # Kept because change_attrs_with_corresponding_tags may rename the tag.
    self.tag_initial_name = tag.name
    # preprocess_style reads self.tag_with_style, so it has to run last.
    self.style = self.preprocess_style()
@staticmethod
def remove_white_if_no_bgcolor(style_, tag):
if 'background' in style_:
return style_
@@ -221,57 +214,98 @@ def add_inline_style_to_html_soup(soup1, css_text):
style_ = style_.replace('color:white;', '')
return style_
def preprocess_style(self):
    """Return the styled tag's ``style`` value in normalised form.

    A trailing ``;`` is appended, the string is passed through
    ``remove_white_if_no_bgcolor``, and every ``background:`` is rewritten
    to ``background-color:``.
    """
    styled_tag = self.tag_with_style
    # The appended ';' terminates the last declaration so that later
    # 'prop:value;' substring replacements can match it.
    raw_style = styled_tag.attrs.get('style') + ';'
    cleaned_style = self.remove_white_if_no_bgcolor(raw_style, styled_tag)
    return cleaned_style.replace('background:', 'background-color:')
def change_attrs_with_corresponding_tags(self):
    """Turn style declarations that have a tag equivalent into real tags.

    Each ``(property, value)`` pair reported by ``check_style_to_be_tag`` is
    removed from ``self.style``. The first pair renames ``self.tag`` to the
    mapped tag name; every further pair wraps the previous result in a new
    tag (adds <b>, <u>, <sup>, etc). If at least one pair was handled, the
    whole chain is wrapped in a fresh tag carrying the initial tag name.

    Returns:
        The outermost tag: the new wrapper when any pair was converted,
        otherwise ``self.tag`` with its ``style`` set to ``self.style``.
    """
    innermost = self.tag
    chain = []
    for prop, value in check_style_to_be_tag(self.style):
        self.style = self.style.replace(f'{prop}:{value};', '').strip()
        mapped_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(prop, value)]
        if not chain:
            # First match: reuse the existing tag by renaming it.
            innermost.name = mapped_name
            chain.append(innermost)
            continue
        outer = BeautifulSoup(features='lxml').new_tag(mapped_name)
        chain[-1].wrap(outer)
        chain.append(outer)

    if not chain:
        innermost.attrs['style'] = self.style
        return innermost

    # NOTE(review): the renamed inner tag keeps its own attrs while the
    # wrapper receives a copy, so attributes end up on both elements --
    # confirm this duplication is intended.
    container = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name)
    container.attrs = innermost.attrs.copy()
    if self.style:
        container.attrs['style'] = self.style
    chain[-1].wrap(container)
    return container
@staticmethod
def wrap_p_to_save_style_attrs(t):
    """Split a styled ``<p>`` into ``<p><span>`` so its style is kept.

    When ``t`` is a ``<p>`` whose ``style`` mentions any entry of
    ``LIVECARTA_STYLE_ATTRS`` other than ``text-align``/``text-indent``,
    ``t`` is renamed to ``span`` and wrapped in a new ``<p>``. All
    ``text-align``/``text-indent`` declarations move to the new ``<p>``;
    the remaining declarations stay on the span. Other tags are untouched.

    Args:
        t: the tag to inspect; modified in place.
    """
    if t.name != 'p' or not t.attrs.get('style'):
        return

    style = t.attrs['style']
    styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                           if attr not in ['text-align', 'text-indent']]
    if not any(attr in style for attr in styles_cant_be_in_p):
        return

    # Every paragraph-level declaration is collected, not only the first
    # one found. '[^;]+' also accepts values such as '1.5em', '-2em' or
    # '10%'; the lookbehind keeps longer property names from matching.
    p_level_regexp = re.compile(r'(?<![\w-])(?:text-align|text-indent):[^;]+;')
    p_style = ''.join(p_level_regexp.findall(style))
    span_style = p_level_regexp.sub('', style)

    t.name = 'span'
    t.attrs['style'] = span_style
    p_tag = BeautifulSoup(features='lxml').new_tag('p')
    p_tag.attrs['style'] = p_style
    # Wrap exactly once: a second wrap would nest <p> inside <p>.
    t.wrap(p_tag)
def convert_initial_tag(self):
    """Run the full conversion for ``self.tag`` and return the resulting tag.

    Removes the temporary ``livecarta_id`` attribute (``KeyError`` if it is
    absent), converts style declarations to tags, then applies the ``<p>``
    wrapping step to the result.
    """
    self.tag.attrs.pop('livecarta_id')
    converted = self.change_attrs_with_corresponding_tags()
    self.tag = converted
    self.wrap_p_to_save_style_attrs(converted)
    return converted
def add_inline_style_to_html_soup(soup1, css_text):
    """Apply stylesheet rules to the tags of ``soup1`` as inline styles/tags.

    Candidate tags (p, span, li, ul, ol, td, th, h1..h9) are stamped with a
    temporary ``livecarta_id`` attribute, the CSS is inlined into a second
    copy of the markup via ``transform``, and each candidate in ``soup1`` is
    then converted by ``TagStyleConverter`` using the ``style`` found on its
    counterpart. The temporary attribute is removed from every candidate.

    Args:
        soup1: parsed document; modified in place.
        css_text: stylesheet text to inline.

    Returns:
        ``soup1``.
    """
    css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')

    h_regex = '(^h[1-9]$)'
    could_have_style_in_livecarta_regexp = re.compile(
        '(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
    elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)

    # The temporary ids let each tag be located again in the re-parsed,
    # style-inlined copy of the markup.
    livecarta_tmp_ids = []
    for tmp_id, element in enumerate(elements_with_possible_style_attr):
        element.attrs['livecarta_id'] = tmp_id
        livecarta_tmp_ids.append(tmp_id)

    html_with_inline_style = transform(str(soup1), css_text=css_text,
                                       remove_classes=False,
                                       external_styles=False,
                                       disable_validation=True)
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    for tmp_id in livecarta_tmp_ids:
        tag = soup1.find(attrs={'livecarta_id': tmp_id})
        tag_with_style = soup2.find(attrs={'livecarta_id': tmp_id})
        # The converter owns all style handling and removes livecarta_id
        # itself; doing either here as well would process the tag twice.
        if tag_with_style is not None and tag_with_style.attrs.get('style'):
            TagStyleConverter(tag, tag_with_style).convert_initial_tag()
        else:
            # No inlined style (or no counterpart found): only clean up.
            del tag.attrs['livecarta_id']

    return soup1