diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_css_style_converter.py index bb2a7fc..7d1ff1b 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_css_style_converter.py @@ -202,112 +202,42 @@ class TagStyleConverter: return top_tag @staticmethod - def wrap_span_in_p_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in p -> span""" - if tag.name == 'p' and tag.attrs.get('style'): - styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS - if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] - p_style = '' - initial_style = tag.attrs['style'] - split_style = initial_style.replace('; ', ';').split(';') - possible_p_attrs_regexp = re.compile( - r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)') - for item in split_style: - has_p_style_attrs = re.search(possible_p_attrs_regexp, item) - if has_p_style_attrs: - p_style += item + ';' - initial_style = initial_style.replace(item + ';', '') - # here check that this style i exactly the same. + def wrap_span_in_tag_to_save_style_attrs(initial_tag): + """Function designed to save style attrs that cannot be in tag.name -> span""" + dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) + if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'): + styles_can_be_in_tag = [style + for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() + if re.match(tag, initial_tag.name) + for style in styles] + styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS + if attr not in styles_can_be_in_tag] + span_style = initial_tag.attrs['style'] + # here check that this style is exactly the same. # Not 'align' when we have 'text-align', or 'border' when we have 'border-top' - styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( - '-' + attr not in initial_style) for attr in styles_cant_be_in_p] + styles_to_be_saved_in_span = [((attr + ':') in span_style) & ( + '-' + attr not in span_style) for attr in styles_cant_be_in_tag] if any(styles_to_be_saved_in_span): - # if we find styles that cannot be in

-> wrap them in span - tag.name = 'span' - p_tag = BeautifulSoup(features='lxml').new_tag('p') - p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_p_style_attr = re.search(p_attrs_regexp, initial_style) - span_style = initial_style if not has_p_style_attr else initial_style.replace( - has_p_style_attr.group(1), '') - p_tag.attrs['style'] = p_style - tag.attrs['style'] = span_style - tag.wrap(p_tag) - else: - tag.attrs['style'] = p_style - - @staticmethod - def wrap_span_in_li_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in li -> span""" - if tag.name == 'li' and tag.attrs.get('style'): - styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if - attr not in ['text-align', 'list-style-type']] - - styles_to_be_saved_in_span = [attr in tag.attrs.get( - 'style') for attr in styles_cant_be_in_li] - if any(styles_to_be_saved_in_span): - tag.name = 'span' - li_tag = BeautifulSoup(features='lxml').new_tag('li') - span_style = tag.attrs['style'] - li_style = '' - for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), - re.compile(r'(list-style-type:(\w+);)')]: - has_li_style_attrs = re.search( - possible_li_attrs_regexp, span_style) - if has_li_style_attrs and has_li_style_attrs.group(1): - li_style += has_li_style_attrs.group(1) + # if we find styles that cannot be in -> wrap them in span + tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}') + style = '' + possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag] + for possible_attr_regexp in possible_attrs_regexp: + has_style_attrs = re.search( + possible_attr_regexp, span_style) + if has_style_attrs and has_style_attrs.group(1): + style += has_style_attrs.group(1) span_style = span_style.replace( - has_li_style_attrs.group(1), '') - li_tag.attrs['style'] = li_style - tag.attrs['style'] = span_style - tag.wrap(li_tag) - - @staticmethod - def wrap_span_in_ul_ol_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in ul/ol -> span""" - if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): - styles_cant_be_in_ul_ol = [ - attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] - - styles_to_be_saved_in_span = [attr in tag.attrs.get('style') - for attr in styles_cant_be_in_ul_ol] - if any(styles_to_be_saved_in_span): - tag.name = 'span' - oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name) - span_style = tag.attrs['style'] - - possible_uol_attrs_regexp = re.compile( - r'(list-style-type:(\w+);)') - has_uol_style_attrs = re.search( - possible_uol_attrs_regexp, span_style) - if has_uol_style_attrs and has_uol_style_attrs.group(1): - oul_style = has_uol_style_attrs.group(1) - span_style = span_style.replace(oul_style, '') - oul_tag.attrs['style'] = oul_style - tag.attrs['style'] = span_style - tag.wrap(oul_tag) - - @staticmethod - def wrap_span_in_h_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in h -> span""" - h_regexp = re.compile('(^h[1-9]$)') - - if re.search(h_regexp, tag.name) and tag.attrs.get('style'): - h_tag = BeautifulSoup(features='lxml').new_tag(tag.name) - tag.name = 'span' - tag.wrap(h_tag) - style = tag.attrs['style'] - h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_h_style_attr = re.search(h_attrs_regexp, style) - tag.attrs['style'] = style if not has_h_style_attr else style.replace( - has_h_style_attr.group(1), '') + has_style_attrs.group(1), '') + tag.attrs['style'] = style + initial_tag.name = 'span' + initial_tag.attrs['style'] = span_style + initial_tag.wrap(tag) def convert_initial_tag(self): self.tag_inline_style = self.change_attrs_with_corresponding_tags( self.tag_inline_style.name) - self.wrap_span_in_p_to_save_style_attrs(self.tag_inline_style) - self.wrap_span_in_li_to_save_style_attrs(self.tag_inline_style) - self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_inline_style) - self.wrap_span_in_h_to_save_style_attrs(self.tag_inline_style) + self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) return self.tag_inline_style @@ -339,9 +269,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> # soup with converted styles from css inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') - could_have_style_in_livecarta_regexp = re.compile( - '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') - tags_with_inline_style = inline_soup.find_all(could_have_style_in_livecarta_regexp, + tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, attrs={'style': re.compile('.*')}) # go through the tags with inline style + style parsed from css file diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 3d5f667..e3e63d4 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -1,3 +1,6 @@ +import re + + class LiveCartaConfig: """Class of values that LiveCarta platform using and supports""" # tag with inline style to be updated with style attribute @@ -87,6 +90,14 @@ class LiveCartaConfig: 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] + structural_tags_names = [ + 'div', 'section', 'article', 'main', 'body', 'html', 'aside', + 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' + ] + + could_have_style_in_livecarta_regexp = re.compile( + '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') + """ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } @@ -105,3 +116,11 @@ class LiveCartaConfig: ('text-decoration-line', 'line-through'): 's', ('vertical-align', 'super'): 'sup' } + + LIVECARTA_STYLES_CANT_BE_IN_TAG = { + 'p': ['text-align', 'text-indent', 'border-bottom', 'border-top'], + 'li': ['text-align', 'list-style-type'], + 'ul': ['list-style-type'], + 'ol': ['list-style-type'], + '(^h[1-9]$)': ['list-style-type'] + }