Add regex processing of span wraps

This commit is contained in:
Kiryl
2022-06-13 13:00:19 +03:00
parent 180631d33b
commit da70a574e6
2 changed files with 49 additions and 102 deletions

View File

@@ -202,112 +202,42 @@ class TagStyleConverter:
return top_tag
@staticmethod
def wrap_span_in_p_to_save_style_attrs(tag):
"""Function designed to save style attrs that cannot be in p -> span"""
if tag.name == 'p' and tag.attrs.get('style'):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
p_style = ''
initial_style = tag.attrs['style']
split_style = initial_style.replace('; ', ';').split(';')
possible_p_attrs_regexp = re.compile(
r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
for item in split_style:
has_p_style_attrs = re.search(possible_p_attrs_regexp, item)
if has_p_style_attrs:
p_style += item + ';'
initial_style = initial_style.replace(item + ';', '')
# here check that this style i exactly the same.
def wrap_span_in_tag_to_save_style_attrs(initial_tag):
"""Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG))
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'):
styles_can_be_in_tag = [style
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items()
if re.match(tag, initial_tag.name)
for style in styles]
styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag]
span_style = initial_tag.attrs['style']
# here check that this style is exactly the same.
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
styles_to_be_saved_in_span = [((attr + ':') in span_style) & (
'-' + attr not in span_style) for attr in styles_cant_be_in_tag]
if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in <p> -> wrap them in span
tag.name = 'span'
p_tag = BeautifulSoup(features='lxml').new_tag('p')
p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_p_style_attr = re.search(p_attrs_regexp, initial_style)
span_style = initial_style if not has_p_style_attr else initial_style.replace(
has_p_style_attr.group(1), '')
p_tag.attrs['style'] = p_style
tag.attrs['style'] = span_style
tag.wrap(p_tag)
else:
tag.attrs['style'] = p_style
@staticmethod
def wrap_span_in_li_to_save_style_attrs(tag):
"""Function designed to save style attrs that cannot be in li -> span"""
if tag.name == 'li' and tag.attrs.get('style'):
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
attr not in ['text-align', 'list-style-type']]
styles_to_be_saved_in_span = [attr in tag.attrs.get(
'style') for attr in styles_cant_be_in_li]
if any(styles_to_be_saved_in_span):
tag.name = 'span'
li_tag = BeautifulSoup(features='lxml').new_tag('li')
span_style = tag.attrs['style']
li_style = ''
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
re.compile(r'(list-style-type:(\w+);)')]:
has_li_style_attrs = re.search(
possible_li_attrs_regexp, span_style)
if has_li_style_attrs and has_li_style_attrs.group(1):
li_style += has_li_style_attrs.group(1)
# if we find styles that cannot be in <tag.name> -> wrap them in span
tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}')
style = ''
possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag]
for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search(
possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1)
span_style = span_style.replace(
has_li_style_attrs.group(1), '')
li_tag.attrs['style'] = li_style
tag.attrs['style'] = span_style
tag.wrap(li_tag)
@staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
"""Function designed to save style attrs that cannot be in ul/ol -> span"""
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
styles_cant_be_in_ul_ol = [
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
for attr in styles_cant_be_in_ul_ol]
if any(styles_to_be_saved_in_span):
tag.name = 'span'
oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
span_style = tag.attrs['style']
possible_uol_attrs_regexp = re.compile(
r'(list-style-type:(\w+);)')
has_uol_style_attrs = re.search(
possible_uol_attrs_regexp, span_style)
if has_uol_style_attrs and has_uol_style_attrs.group(1):
oul_style = has_uol_style_attrs.group(1)
span_style = span_style.replace(oul_style, '')
oul_tag.attrs['style'] = oul_style
tag.attrs['style'] = span_style
tag.wrap(oul_tag)
@staticmethod
def wrap_span_in_h_to_save_style_attrs(tag):
"""Function designed to save style attrs that cannot be in h -> span"""
h_regexp = re.compile('(^h[1-9]$)')
if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
tag.name = 'span'
tag.wrap(h_tag)
style = tag.attrs['style']
h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_h_style_attr = re.search(h_attrs_regexp, style)
tag.attrs['style'] = style if not has_h_style_attr else style.replace(
has_h_style_attr.group(1), '')
has_style_attrs.group(1), '')
tag.attrs['style'] = style
initial_tag.name = 'span'
initial_tag.attrs['style'] = span_style
initial_tag.wrap(tag)
def convert_initial_tag(self):
self.tag_inline_style = self.change_attrs_with_corresponding_tags(
self.tag_inline_style.name)
self.wrap_span_in_p_to_save_style_attrs(self.tag_inline_style)
self.wrap_span_in_li_to_save_style_attrs(self.tag_inline_style)
self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_inline_style)
self.wrap_span_in_h_to_save_style_attrs(self.tag_inline_style)
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style
@@ -339,9 +269,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) ->
# soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
tags_with_inline_style = inline_soup.find_all(could_have_style_in_livecarta_regexp,
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')})
# go through the tags with inline style + style parsed from css file

View File

@@ -1,3 +1,6 @@
import re
class LiveCartaConfig:
"""Class of values that LiveCarta platform using and supports"""
# tag with inline style to be updated with style attribute
@@ -87,6 +90,14 @@ class LiveCartaConfig:
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
# Names of tags grouped as "structural".
# NOTE(review): no consumer of this list is visible in this diff — confirm
# where it is read before changing its contents.
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside',
'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p'
]
# Compiled pattern matching, as a whole name, exactly one of: div, p, span,
# code, kbd, var, li, ul, ol, td, th, or h1..h9 (every alternative is anchored
# with ^...$). convert_html_soup_with_css_style passes it to
# BeautifulSoup.find_all as the tag-name filter when collecting tags that
# carry a `style` attribute.
# NOTE(review): h[1-9] also accepts h7-h9, which HTML does not define —
# presumably harmless, confirm if the range is intentional.
could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
@@ -105,3 +116,11 @@ class LiveCartaConfig:
('text-decoration-line', 'line-through'): 's',
('vertical-align', 'super'): 'sup'
}
# Mapping { tag-name pattern: [style properties] } consumed by
# TagStyleConverter.wrap_span_in_tag_to_save_style_attrs.
#
# Despite the name, the listed properties are the ones that method treats as
# allowed to STAY on the matching tag (it collects them into
# `styles_can_be_in_tag`); every other LIVECARTA_STYLE_ATTRS entry is what it
# moves onto an inner <span>.
#
# Keys are used as regular expressions, not literal names: the method joins
# them with '|' for re.findall and also applies each one with re.match against
# the tag name.
# NOTE(review): only the heading key is anchored. Unanchored keys match more
# than the intended tag — re.findall with 'p' finds a hit inside 'span', and
# re.match('p', ...) accepts any name that starts with 'p'. Confirm whether
# these keys should be anchored ('^p$', '^li$', ...) before relying on them.
LIVECARTA_STYLES_CANT_BE_IN_TAG = {
'p': ['text-align', 'text-indent', 'border-bottom', 'border-top'],
'li': ['text-align', 'list-style-type'],
'ul': ['list-style-type'],
'ol': ['list-style-type'],
'(^h[1-9]$)': ['list-style-type']
}