forked from LiveCarta/BookConverter
Add regex processing of span wraps
This commit is contained in:
@@ -202,112 +202,42 @@ class TagStyleConverter:
|
||||
return top_tag
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_p_to_save_style_attrs(tag):
|
||||
"""Function designed to save style attrs that cannot be in p -> span"""
|
||||
if tag.name == 'p' and tag.attrs.get('style'):
|
||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
|
||||
p_style = ''
|
||||
initial_style = tag.attrs['style']
|
||||
split_style = initial_style.replace('; ', ';').split(';')
|
||||
possible_p_attrs_regexp = re.compile(
|
||||
r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
|
||||
for item in split_style:
|
||||
has_p_style_attrs = re.search(possible_p_attrs_regexp, item)
|
||||
if has_p_style_attrs:
|
||||
p_style += item + ';'
|
||||
initial_style = initial_style.replace(item + ';', '')
|
||||
# here check that this style i exactly the same.
|
||||
def wrap_span_in_tag_to_save_style_attrs(initial_tag):
|
||||
"""Function designed to save style attrs that cannot be in tag.name -> span"""
|
||||
dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG))
|
||||
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'):
|
||||
styles_can_be_in_tag = [style
|
||||
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items()
|
||||
if re.match(tag, initial_tag.name)
|
||||
for style in styles]
|
||||
styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||
if attr not in styles_can_be_in_tag]
|
||||
span_style = initial_tag.attrs['style']
|
||||
# here check that this style is exactly the same.
|
||||
# i.e. do not treat 'align' as present when the style only has 'text-align', or 'border' when it only has 'border-top'
|
||||
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
|
||||
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
|
||||
styles_to_be_saved_in_span = [((attr + ':') in span_style) & (
|
||||
'-' + attr not in span_style) for attr in styles_cant_be_in_tag]
|
||||
if any(styles_to_be_saved_in_span):
|
||||
# if we find styles that cannot be in <p> -> wrap them in span
|
||||
tag.name = 'span'
|
||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
||||
p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||
has_p_style_attr = re.search(p_attrs_regexp, initial_style)
|
||||
span_style = initial_style if not has_p_style_attr else initial_style.replace(
|
||||
has_p_style_attr.group(1), '')
|
||||
p_tag.attrs['style'] = p_style
|
||||
tag.attrs['style'] = span_style
|
||||
tag.wrap(p_tag)
|
||||
else:
|
||||
tag.attrs['style'] = p_style
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_li_to_save_style_attrs(tag):
|
||||
"""Function designed to save style attrs that cannot be in li -> span"""
|
||||
if tag.name == 'li' and tag.attrs.get('style'):
|
||||
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
||||
attr not in ['text-align', 'list-style-type']]
|
||||
|
||||
styles_to_be_saved_in_span = [attr in tag.attrs.get(
|
||||
'style') for attr in styles_cant_be_in_li]
|
||||
if any(styles_to_be_saved_in_span):
|
||||
tag.name = 'span'
|
||||
li_tag = BeautifulSoup(features='lxml').new_tag('li')
|
||||
span_style = tag.attrs['style']
|
||||
li_style = ''
|
||||
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
|
||||
re.compile(r'(list-style-type:(\w+);)')]:
|
||||
has_li_style_attrs = re.search(
|
||||
possible_li_attrs_regexp, span_style)
|
||||
if has_li_style_attrs and has_li_style_attrs.group(1):
|
||||
li_style += has_li_style_attrs.group(1)
|
||||
# if we find styles that cannot be in <tag.name> -> wrap them in span
|
||||
tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}')
|
||||
style = ''
|
||||
possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag]
|
||||
for possible_attr_regexp in possible_attrs_regexp:
|
||||
has_style_attrs = re.search(
|
||||
possible_attr_regexp, span_style)
|
||||
if has_style_attrs and has_style_attrs.group(1):
|
||||
style += has_style_attrs.group(1)
|
||||
span_style = span_style.replace(
|
||||
has_li_style_attrs.group(1), '')
|
||||
li_tag.attrs['style'] = li_style
|
||||
tag.attrs['style'] = span_style
|
||||
tag.wrap(li_tag)
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
|
||||
"""Function designed to save style attrs that cannot be in ul/ol -> span"""
|
||||
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
|
||||
styles_cant_be_in_ul_ol = [
|
||||
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
||||
|
||||
styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
|
||||
for attr in styles_cant_be_in_ul_ol]
|
||||
if any(styles_to_be_saved_in_span):
|
||||
tag.name = 'span'
|
||||
oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
|
||||
span_style = tag.attrs['style']
|
||||
|
||||
possible_uol_attrs_regexp = re.compile(
|
||||
r'(list-style-type:(\w+);)')
|
||||
has_uol_style_attrs = re.search(
|
||||
possible_uol_attrs_regexp, span_style)
|
||||
if has_uol_style_attrs and has_uol_style_attrs.group(1):
|
||||
oul_style = has_uol_style_attrs.group(1)
|
||||
span_style = span_style.replace(oul_style, '')
|
||||
oul_tag.attrs['style'] = oul_style
|
||||
tag.attrs['style'] = span_style
|
||||
tag.wrap(oul_tag)
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_h_to_save_style_attrs(tag):
|
||||
"""Function designed to save style attrs that cannot be in h -> span"""
|
||||
h_regexp = re.compile('(^h[1-9]$)')
|
||||
|
||||
if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
|
||||
h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
|
||||
tag.name = 'span'
|
||||
tag.wrap(h_tag)
|
||||
style = tag.attrs['style']
|
||||
h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||
has_h_style_attr = re.search(h_attrs_regexp, style)
|
||||
tag.attrs['style'] = style if not has_h_style_attr else style.replace(
|
||||
has_h_style_attr.group(1), '')
|
||||
has_style_attrs.group(1), '')
|
||||
tag.attrs['style'] = style
|
||||
initial_tag.name = 'span'
|
||||
initial_tag.attrs['style'] = span_style
|
||||
initial_tag.wrap(tag)
|
||||
|
||||
def convert_initial_tag(self):
|
||||
self.tag_inline_style = self.change_attrs_with_corresponding_tags(
|
||||
self.tag_inline_style.name)
|
||||
self.wrap_span_in_p_to_save_style_attrs(self.tag_inline_style)
|
||||
self.wrap_span_in_li_to_save_style_attrs(self.tag_inline_style)
|
||||
self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_inline_style)
|
||||
self.wrap_span_in_h_to_save_style_attrs(self.tag_inline_style)
|
||||
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
||||
return self.tag_inline_style
|
||||
|
||||
|
||||
@@ -339,9 +269,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) ->
|
||||
# soup with converted styles from css
|
||||
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
|
||||
|
||||
could_have_style_in_livecarta_regexp = re.compile(
|
||||
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
||||
tags_with_inline_style = inline_soup.find_all(could_have_style_in_livecarta_regexp,
|
||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={'style': re.compile('.*')})
|
||||
|
||||
# go through the tags with inline style + style parsed from css file
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
import re
|
||||
|
||||
|
||||
class LiveCartaConfig:
|
||||
"""Class of values that LiveCarta platform using and supports"""
|
||||
# tag with inline style to be updated with style attribute
|
||||
@@ -87,6 +90,14 @@ class LiveCartaConfig:
|
||||
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
|
||||
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||
|
||||
structural_tags_names = [
|
||||
'div', 'section', 'article', 'main', 'body', 'html', 'aside',
|
||||
'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p'
|
||||
]
|
||||
|
||||
could_have_style_in_livecarta_regexp = re.compile(
|
||||
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
||||
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
||||
|
||||
@@ -105,3 +116,11 @@ class LiveCartaConfig:
|
||||
('text-decoration-line', 'line-through'): 's',
|
||||
('vertical-align', 'super'): 'sup'
|
||||
}
|
||||
|
||||
LIVECARTA_STYLES_CANT_BE_IN_TAG = {
|
||||
'p': ['text-align', 'text-indent', 'border-bottom', 'border-top'],
|
||||
'li': ['text-align', 'list-style-type'],
|
||||
'ul': ['list-style-type'],
|
||||
'ol': ['list-style-type'],
|
||||
'(^h[1-9]$)': ['list-style-type']
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user