import re
from itertools import takewhile
from logging import CRITICAL
from typing import List

import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform

from livecarta_config import LawCartaConfig
from util.color_reader import str2hex

cssutils.log.setLevel(CRITICAL)

# Relative font sizes (em, or percent / 100) and the pixel size each one maps to.
# The two lists are parallel: sizes_px[i] is used for the largest sizes_pr[i]
# that is strictly smaller than the requested value.
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5,
            1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63,
            2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]

sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
            '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
            '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
            '48px', '49px', '50px', '64px', '72px']

# list-style-type values that are passed through unchanged.
list_types = ['circle', 'disc', 'armenian', 'decimal',
              'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
              'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']

# Patterns used by TagStyleConverter; compiled once instead of on every call.
_MARGIN_LEFT_REGEXP = re.compile(r'(margin-left:( *-*\w+%*);*)')
_TEXT_INDENT_REGEXP = re.compile(r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
_LIST_STYLE_TYPE_REGEXP = re.compile(r'(list-style-type:(\w+);)')
_TEXT_ALIGN_REGEXP = re.compile(r'(text-align:(\w+);)')


def _digits_to_int(text) -> int:
    """Return the integer formed by the digit characters of *text*, or 0 if it has none.

    Keyword values such as ``auto`` contain no digits; treating them as 0
    avoids the ``int('')`` ValueError the inline digit extraction used to raise.
    """
    digits = ''.join(filter(str.isdigit, str(text)))
    return int(digits) if digits else 0


def convert_font_size(value):
    """Convert a css font-size value to the value written to the cleaned css.

    Returns:
        ''            - for the default size (``100%`` or the configured default pt size),
                        for unsupported units, for unparsable numbers and for relative
                        sizes greater than 5;
        '<n>px'       - for ``pt`` values (unit swapped, number untouched) and for
                        ``em`` / ``%`` values (looked up in sizes_pr / sizes_px).
    """
    if 'pt' in value:
        try:
            # float() instead of int(): fractional sizes such as '10.5pt' are valid css.
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            ratio = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            ratio = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if ratio > 5:
        return ''

    # sizes_pr is sorted, so the number of entries strictly below `ratio`
    # identifies the last applicable size.
    smaller_sizes = list(takewhile(lambda size: ratio > size, sizes_pr))
    if not smaller_sizes:
        # ratio <= -1: there is no size to map to (previously an IndexError).
        return ''
    return sizes_px[len(smaller_sizes) - 1]


def convert_indents(value):
    """Normalise a text-indent value; negative indents are dropped (returns '').

    Currently not referenced by LIVECARTA_STYLE_ATTRS_MAPPING (its entry is
    commented out there).
    """
    # startswith() also copes with an empty string, unlike value[0].
    if value.startswith('-'):
        return ''

    # 30px = 3.2% = 1.25em = 23pt
    positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)')
    has_style_attrs = re.search(positive_text_indent_regexp, value)
    if has_style_attrs and has_style_attrs.group(1):
        percent = has_style_attrs.group(1)
        value = value.replace(percent, str(int(''.join(filter(str.isdigit, str(percent))))) + '%')
    # em values (group 2) are intentionally left untouched.
    return value


# LIVECARTA_STYLE_ATTRS = { css property: [allowed values] }
#
# Style properties that can be used to fit the livecarta css style convention.
# If a property has an empty list, any value can be converted.
# If a property has a non-empty list, only the listed property-value
# combinations can be transformed; everything else is removed from the css.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],  # ???
    'font': [],  # ???
    'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    # The next five properties are turned into tags,
    # see LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG.
    'font-weight': ['bold', '600', '700', '800', '900'],
    'font-style': ['italic'],
    'text-decoration': ['underline', 'line-through'],
    'text-decoration-line': ['underline', 'line-through'],
    'vertical-align': ['super'],
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': []
}


def get_bg_color(x):
    """Return the hex form of background colour *x*, or '' when it is white."""
    color = str2hex(x)
    return '' if color in ['#ffffff', '#fff', 'white'] else color


def get_text_color(x):
    """Return the hex form of text colour *x*, or '' when it is black."""
    color = str2hex(x)
    return '' if color in ['#000000', '#000', 'black'] else color


def _drop_zero(x):
    """Return *x* unchanged unless it is the literal '0', which becomes ''."""
    return x if x != '0' else ''


# LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
#
# Warning: if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING
# should be updated to suit the livecarta style convention.
LIVECARTA_STYLE_ATTRS_MAPPING = {
    # 'text-indent': convert_indents,  # disabled: handled by TagStyleConverter.convert_indentions_to_px
    'font-variant': lambda x: x,
    'text-align': lambda x: x,
    'font': lambda x: '',
    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x) or
                             LawCartaConfig.font_correspondence_table.get(x.capitalize()),
    'font-size': convert_font_size,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _drop_zero,
    'border-top-width': _drop_zero,
    'border-right-width': _drop_zero,
    'border-left-width': _drop_zero,
    'border-bottom-width': _drop_zero,
    'list-style-type': lambda x: x if x in list_types else 'disc',
    'list-style-image': lambda x: 'disc',
    'margin-left': lambda x: x
}

# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
#
# NOTE(review): the original definition of this table was lost from the file
# (everything between a '<' and the '->' of the following function header was
# stripped). The keys below are the restricted property/value pairs of
# LIVECARTA_STYLE_ATTRS that have no mapping function; the tag names are a
# reconstruction - verify against version control before relying on them.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    ('font-weight', 'bold'): 'strong',
    ('font-weight', '600'): 'strong',
    ('font-weight', '700'): 'strong',
    ('font-weight', '800'): 'strong',
    ('font-weight', '900'): 'strong',
    ('font-style', 'italic'): 'i',
    ('text-decoration', 'underline'): 'u',
    ('text-decoration', 'line-through'): 's',
    ('text-decoration-line', 'underline'): 'u',
    ('text-decoration-line', 'line-through'): 's',
    ('vertical-align', 'super'): 'sup',
}


def check_style_to_be_tag(style) -> List[tuple]:
    """
    Some css style properties are converted to tags.
    Search for them and prepare the list of (property, value) pairs
    to be removed from the style string.
    """
    return [key for key in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG if f'{key[0]}:{key[1]}' in style]


def update_property_to_livecarta_convention(rule, property_):
    """Rewrite or remove one css property of *rule* in place.

    A property is removed (set to '') when it is not listed in
    LIVECARTA_STYLE_ATTRS or when its value is not one of the allowed values.
    Otherwise, if a mapping function exists, the value is replaced by the
    mapped value.
    """
    name = property_.name
    if name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        rule.style[name] = ''
        return

    cleaned_value = property_.value.replace('\"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS[name]
    if allowed_values and cleaned_value not in allowed_values:
        # property + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        rule.style[name] = ''
    elif name in LIVECARTA_STYLE_ATTRS_MAPPING:
        rule.style[name] = LIVECARTA_STYLE_ATTRS_MAPPING[name](cleaned_value)


def clean_css(css):
    """Parse *css*, convert every style rule to the livecarta convention and return the css text."""
    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type == rule.STYLE_RULE:
            for property_ in rule.style:
                update_property_to_livecarta_convention(rule, property_)

    # Public accessor for the serialized sheet (bytes), instead of the private _getCssText().
    return sheet.cssText.decode()


class TagStyleConverter:
    """Applies the inline style computed for a tag to the tag of the original soup.

    `tag` is the tag that gets updated; `tag_with_style` is the matching tag
    from the premailer output whose `style` attribute holds the css-derived style.
    """

    def __init__(self, tag, tag_with_style):
        self.tag = tag  # tag to be updated with style attribute
        self.tag_initial_name = tag.name
        self.tag_with_style = tag_with_style  # tag with inline style parsed from css file
        self.style = self.preprocess_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """Drop a white text colour from *style_* unless a background makes it visible.

        The style is returned untouched when it declares a background itself or
        when a span/td/tr/p ancestor of *tag* has one. Otherwise the white colour
        is pushed down to descendants that have a background and removed from
        *style_*.
        """
        if 'background' in style_:
            return style_

        # if text color is white, check that we have bg-color
        has_white_text = ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_)
        if not has_white_text:
            return style_

        # if bg color is inherited, just return style as is
        # (a white bg color need not be checked as we do not write 'white bg color')
        tag_with_bg = ['span', 'td', 'tr', 'p']
        for parent_tag in tag.parents:
            tag_will_be_saved = parent_tag.name in tag_with_bg
            has_bg = parent_tag.attrs.get('style') and ('background' in parent_tag.attrs.get('style'))
            if has_bg and tag_will_be_saved:
                return style_

        for child in tag.find_all():
            if child.attrs.get('style') and ('background' in child.attrs.get('style')):
                child.attrs['style'] = child.attrs['style'] + '; color:#fff; '

        # children with a bg color received the white text color above,
        # so this tag does not need the white color itself
        for white in ('color:#fff;', 'color:#ffffff;', 'color:white;'):
            style_ = style_.replace(white, '')
        return style_

    @staticmethod
    def convert_indentions_to_px(style):
        """Fold margin-left / text-indent declarations of *style* into one px text-indent.

        The conversion keeps only the digit characters of each value and
        multiplies by 6 ("consider that 5% = 30px"); a text-indent that is not a
        percentage and has no accompanying margin-left keeps its digits as is.
        Values without digits (e.g. ``auto``) count as 0.
        """
        margin_left = _MARGIN_LEFT_REGEXP.search(style)
        text_indent = _TEXT_INDENT_REGEXP.search(style)

        if margin_left and text_indent:
            num_ml = _digits_to_int(margin_left.group(2)) * 6
            # groups 1/2 hold the percentage form, groups 3/4 any other form;
            # exactly one alternative matched.
            if text_indent.group(1):
                declaration, amount = text_indent.group(1), text_indent.group(2)
            else:
                declaration, amount = text_indent.group(3), text_indent.group(4)
            num_ti = _digits_to_int(amount) * 6
            style = style.replace(declaration, 'text-indent: ' + str(abs(num_ml - num_ti)) + 'px; ')
            return style.replace(margin_left.group(1), '')

        if text_indent:
            if text_indent.group(1):
                pixels = _digits_to_int(text_indent.group(2)) * 6
                return style.replace(text_indent.group(1), 'text-indent: ' + str(pixels) + 'px; ')
            digits = ''.join(filter(str.isdigit, str(text_indent.group(4))))
            return style.replace(text_indent.group(3), 'text-indent: ' + digits + 'px; ')

        if margin_left:
            pixels = _digits_to_int(margin_left.group(2)) * 6
            return style.replace(margin_left.group(1), 'text-indent: ' + str(pixels) + 'px; ')

        return style

    def preprocess_style(self):
        """Build the style string for `self.tag` from the css-derived and the pre-existing inline style."""
        style = self.tag_with_style.attrs.get('style') + ';'
        style = self.remove_white_if_no_bgcolor(style, self.tag_with_style)
        style = style.replace('background:', 'background-color:')
        style = style.replace('list-style-image', 'list-style-type')

        # todo: make html_reader + do a repetition check with inline_style
        style = self.convert_indentions_to_px(style)
        # if tag had already had inline style, add this to style parsed from css
        if self.tag.attrs.get('style'):
            style += self.convert_indentions_to_px(self.tag.attrs['style'])

        return style

    def change_attrs_with_corresponding_tags(self):
        """Replace style declarations listed in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG with tags.

        The first matching declaration renames `self.tag`; every further one
        wraps the previous tag. When at least one declaration matched, the
        result is wrapped in a new tag with the initial name that carries the
        original attributes and the remaining style. Returns `self.tag`.
        """
        to_remove = check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            self.style = self.style.replace(f'{attr}:{value};', '').strip()
            tag_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]

            if i == 0:
                self.tag.name = tag_name
                new_tags.append(self.tag)
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(tag_name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag

        if new_tags:
            tmp_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            top_tag2 = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name)
            top_tag2.attrs = tmp_attrs
            if self.style:
                top_tag2.attrs['style'] = self.style
            new_tags[-1].wrap(top_tag2)
        else:
            top_tag.attrs['style'] = self.style

        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Turn a styled <p> into <p><span>...</span></p> when its style cannot live on a <p>.

        text-align / text-indent stay on the new <p>; list-style-type is dropped;
        everything else stays on the span.
        """
        if tag.name != 'p' or not tag.attrs.get('style'):
            return

        styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                               if attr not in ['text-align', 'text-indent']]
        if not any(attr in tag.attrs.get('style') for attr in styles_cant_be_in_p):
            return

        tag.name = 'span'
        p_tag = BeautifulSoup(features='lxml').new_tag('p')
        span_style = tag.attrs['style']
        p_style = ''

        # NOTE(review): re.search returns only the first alternative that
        # matches, and \w+ does not accept the space written by
        # convert_indentions_to_px ('text-indent: 30px; ') - confirm this is intended.
        possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
        has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
        if has_p_style_attrs:
            for group_index in (1, 3):
                declaration = has_p_style_attrs.group(group_index)
                if declaration:
                    p_style += declaration
                    span_style = span_style.replace(declaration, '')

        p_tag.attrs['style'] = p_style

        has_li_style_attr = _LIST_STYLE_TYPE_REGEXP.search(span_style)
        if has_li_style_attr:
            span_style = span_style.replace(has_li_style_attr.group(1), '')
        tag.attrs['style'] = span_style
        tag.wrap(p_tag)

    @staticmethod
    def add_span_to_save_style_attrs_in_li(t):
        """Turn a styled <li> into <li><span>...</span></li> when its style cannot live on a <li>.

        text-align / list-style-type move to the new <li>; the rest stays on the span.
        """
        if t.name != 'li' or not t.attrs.get('style'):
            return

        styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS
                                if attr not in ['text-align', 'list-style-type']]
        if not any(attr in t.attrs.get('style') for attr in styles_cant_be_in_li):
            return

        t.name = 'span'
        li_tag = BeautifulSoup(features='lxml').new_tag('li')
        old_style = t.attrs['style']
        new_style = ''

        for possible_li_attrs_regexp in (_TEXT_ALIGN_REGEXP, _LIST_STYLE_TYPE_REGEXP):
            has_li_style_attrs = possible_li_attrs_regexp.search(old_style)
            if has_li_style_attrs and has_li_style_attrs.group(1):
                new_style += has_li_style_attrs.group(1)
                old_style = old_style.replace(has_li_style_attrs.group(1), '')

        li_tag.attrs['style'] = new_style
        t.attrs['style'] = old_style
        t.wrap(li_tag)

    @staticmethod
    def add_span_to_save_style_attrs_in_ul_ol(t):
        """Turn a styled <ul>/<ol> into a span wrapped by a new list tag of the same kind.

        list-style-type moves to the new list tag; the rest stays on the span.
        """
        if t.name not in ['ul', 'ol'] or not t.attrs.get('style'):
            return

        styles_cant_be_in_list = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
        if not any(attr in t.attrs.get('style') for attr in styles_cant_be_in_list):
            return

        # Keep the list kind: the wrapper used to be hard-coded to 'ul',
        # which silently turned ordered lists into unordered ones.
        list_tag = BeautifulSoup(features='lxml').new_tag(t.name)
        t.name = 'span'
        old_style = t.attrs['style']

        has_list_style_attrs = _LIST_STYLE_TYPE_REGEXP.search(old_style)
        if has_list_style_attrs and has_list_style_attrs.group(1):
            new_style = has_list_style_attrs.group(1)
            old_style = old_style.replace(new_style, '')
            # Assigned only on a match, so no unbound name when list-style-type is absent.
            list_tag.attrs['style'] = new_style

        t.attrs['style'] = old_style
        t.wrap(list_tag)

    @staticmethod
    def add_span_to_save_style_attrs(t):
        """Turn a styled heading (h1..h9) into a span wrapped by a new, unstyled heading tag."""
        no_style_in_livecarta_regexp = re.compile('(^h[1-9]$)')

        if re.search(no_style_in_livecarta_regexp, t.name) and t.attrs.get('style'):
            new_tag = BeautifulSoup(features='lxml').new_tag(t.name)
            t.name = 'span'
            t.wrap(new_tag)
            style = t.attrs['style']
            has_li_style_attr = _LIST_STYLE_TYPE_REGEXP.search(style)
            if has_li_style_attr:
                style = style.replace(has_li_style_attr.group(1), '')
            t.attrs['style'] = style

    def convert_initial_tag(self):
        """Run all conversions on `self.tag` and return it."""
        self.tag = self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_p_to_save_style_attrs(self.tag)
        self.add_span_to_save_style_attrs_in_li(self.tag)
        self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
        self.add_span_to_save_style_attrs(self.tag)
        return self.tag


def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str):
    """Inline *css_text* into the p/span/li/ul/ol/td/th/h* tags of *soup1* and return *soup1*.

    Each candidate tag is marked with a temporary `livecarta_id` attribute,
    premailer's `transform` computes the inline styles on a copy of the markup,
    and the styles are then applied to the original tags through TagStyleConverter.
    """
    css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
    h_regex = '(^h[1-9]$)'
    could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
    tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
    for i, tag in enumerate(tags_with_possible_style_attr):
        tag.attrs['livecarta_id'] = i

    html_with_inline_style: str = transform(str(soup1), css_text=css_text,
                                            remove_classes=False,
                                            external_styles=False,
                                            allow_network=False,
                                            disable_validation=True,
                                            )
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    # Index the styled tags once instead of searching the whole tree per id.
    # After re-parsing, the attribute values are strings.
    styled_by_id = {}
    for styled_tag in soup2.find_all(attrs={'livecarta_id': True}):
        styled_by_id.setdefault(styled_tag.attrs['livecarta_id'], styled_tag)

    for i, tag in enumerate(tags_with_possible_style_attr):
        tag_with_style = styled_by_id.get(str(i))
        del tag.attrs['livecarta_id']
        # A tag missing from the transformed markup is skipped rather than crashing.
        if tag_with_style is not None and tag_with_style.attrs.get('style'):
            style_converter = TagStyleConverter(tag, tag_with_style)
            style_converter.convert_initial_tag()

    return soup1


if __name__ == '__main__':
    file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
    ebooklib_book = epub.read_epub(file)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    css_cleaned = clean_css(css_)
    html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

    print(add_inline_style_to_html_soup(html_soup, css_cleaned))