import re
from itertools import takewhile
from logging import CRITICAL
from typing import List

import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform

from src.livecarta_config import LiveCartaConfig
from src.util.color_reader import str2hex

# Only CRITICAL records of the cssutils logger are emitted.
cssutils.log.setLevel(CRITICAL)

# Parallel lookup tables used by convert_tag_values(): sizes_pr holds ascending
# relative sizes (em, or % divided by 100); sizes_px[i] is the px value returned
# when sizes_pr[i] is the largest entry strictly below the converted value.
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
            1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
            1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
            2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]

sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px',
            '16px', '17px', '18px', '19px', '20px', '21px', '22px', '23px',
            '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px',
            '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px',
            '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
            '48px', '49px', '50px', '64px', '72px']

# Values accepted for 'list-style-type'; any other value is mapped to 'disc'.
list_types = ['circle', 'disc', 'armenian', 'decimal',
              'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
              'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']

# Matches a whole value expressed in %, em or pt (groups 1, 3 and 5).
_FONT_SIZE_REGEXP = re.compile(
    r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')


def convert_tag_values(value: str) -> str:
    """
    Function
    - converts values of tags from em/%/pt to px
    - finds the closest font-size in px

    Values in % (divided by 100) and em are looked up in sizes_pr/sizes_px,
    pt values only get their unit renamed to px, and any other value is
    returned unchanged.

    Parameters
    ----------
    value: str

    Returns
    -------
    value: str
    """

    def find_closest_size(size: float) -> str:
        # sizes_pr is ascending, so the entries below `size` form a prefix.
        smaller_sizes = list(takewhile(lambda x: size > x, sizes_pr))
        if not smaller_sizes:
            # Nothing in the table is below `size` (size <= sizes_pr[0]);
            # clamp to the first entry instead of failing on an empty list.
            return sizes_px[0]
        return sizes_px[len(smaller_sizes) - 1]

    match = _FONT_SIZE_REGEXP.search(value)
    if not match:
        return value
    if match.group(1):
        return find_closest_size(float(value.replace('%', '')) / 100.0)
    if match.group(3):
        return find_closest_size(float(value.replace('em', '')))
    # The only remaining alternative of the pattern is the pt one.
    return value.replace('pt', 'px')


"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit the livecarta css style convention.
If a property has an empty list, any value can be converted.
If a property has a non-empty list, only the listed property-value
combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES
                   if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],
    'font': [],
    'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
                    if x != LiveCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    # NOTE(review): the next five entries carried trailing comments whose text
    # is missing from the source; presumably they named the tag each value is
    # converted to - confirm and restore.
    'font-weight': ['bold', '600', '700', '800', '900'],
    'font-style': ['italic'],
    'text-decoration': ['underline', 'line-through'],
    'text-decoration-line': ['underline', 'line-through'],
    'vertical-align': ['super'],
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border-top': [],
    'border-bottom': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
    'margin-top': [],
    'margin': [],
}


def get_bg_color(x):
    """Return str2hex(x), or '' when the result is a white color."""
    color = str2hex(x)
    return color if color not in ['#ffffff', '#fff', 'white'] else ''


def get_text_color(x):
    """Return str2hex(x), or '' when the result is a black color."""
    color = str2hex(x)
    return color if color not in ['#000000', '#000', 'black'] else ''


def _keep_value(value):
    """Return the value unchanged."""
    return value


def _drop_zero(value):
    """Return the value unchanged, or '' when it is exactly '0'."""
    return value if value != '0' else ''


def _map_font_family(value):
    """Look the font up in the correspondence table, retrying capitalized."""
    table = LiveCartaConfig.font_correspondence_table
    return table.get(value) or table.get(value.capitalize())


def _map_list_style_type(value):
    """Return the value when it is a known list type, else 'disc'."""
    return value if value in list_types else 'disc'


"""
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning: if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING
should be updated to suit the livecarta style convention.
"""
LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_tag_values,
    'font-variant': _keep_value,
    'text-align': _keep_value,
    'font': lambda x: '',
    'font-family': _map_font_family,
    'font-size': convert_tag_values,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _drop_zero,
    'border-top-width': _drop_zero,
    'border-right-width': _drop_zero,
    'border-left-width': _drop_zero,
    'border-bottom-width': _drop_zero,
    'border-top': _drop_zero,
    'border-bottom': _drop_zero,
    'list-style-type': _map_list_style_type,
    'list-style-image': lambda x: 'disc',
    'margin-left': convert_tag_values,
    'margin-top': convert_tag_values,
    'margin': convert_tag_values,
}

# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
# NOTE(review): the rest of this description and the definition of
# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG are missing from the file at this point
# and have to be restored.

# NOTE(review): LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG is used below as a mapping
# {(property, value): tag name}, but its definition is not present in this part
# of the file (the text right before this function appears to be truncated).
# It has to be defined before check_style_to_be_tag() is called.


def _find_declaration(style: str, prop: str, value_pattern: str = r'[\w-]+') -> str:
    """
    Return the text of the first 'prop: value;' declaration found in style.

    Whitespace after the colon and before the semicolon is optional, so both
    'prop:value;' and 'prop: value;' are found. Returns '' when nothing matches.
    """
    match = re.search(
        rf'{re.escape(prop)}:\s*(?:{value_pattern})\s*;', style)
    return match.group(0) if match else ''


def check_style_to_be_tag(style: str) -> List[tuple]:
    """
    Function searches style properties that can be converted to tags.
    It searches for them and prepares the list of properties to be removed
    from the style string.

    The style string handled by TagStyleConverter is written as
    'prop: value; ', so whitespace after the colon is accepted; the value has
    to end where it ends in the mapping key ('bold' does not match 'bolder').

    Parameters
    ----------
    style: str

    Returns
    -------
    to_remove: list
        (property, value) keys of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        that are present in style
    """
    return [
        key for key in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if re.search(
            rf'{re.escape(key[0])}:\s*{re.escape(key[1])}(?![\w-])', style)
    ]


def update_css_style_types_to_livecarta_convention(css_rule, style_type):
    """
    Rewrite one property of a css rule in place.

    The property is blanked when it is not listed in LIVECARTA_STYLE_ATTRS or
    when its value is not among the values allowed for it; otherwise the value
    is passed through the function registered in LIVECARTA_STYLE_ATTRS_MAPPING
    (properties without a mapping function are left untouched).
    """
    name = style_type.name
    if name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[name] = ''
        return

    cleaned_value = style_type.value.replace('\"', '')  # value of style
    allowed_values = LIVECARTA_STYLE_ATTRS[name]
    # an empty list means that there are no constraints on the value
    if allowed_values and cleaned_value not in allowed_values:
        # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[name] = ''
    elif name in LIVECARTA_STYLE_ATTRS_MAPPING:
        # function that converts our data
        css_rule.style[name] = LIVECARTA_STYLE_ATTRS_MAPPING[name](
            cleaned_value)


def build_css_content(css_content):
    """Build css content with livecarta convention"""
    sheet = cssutils.parseString(css_content, validate=False)

    for css_rule in sheet:
        if css_rule.type == css_rule.STYLE_RULE:
            # iterate over a snapshot: the helper rewrites the declaration
            for style_type in list(css_rule.style):
                update_css_style_types_to_livecarta_convention(
                    css_rule, style_type)

    return sheet.cssText.decode()


class TagStyleConverter:
    """
    Converts the style of one tag to the livecarta convention.

    The style is read from tag_with_ultimate_style (inline style + style parsed
    from the css file) and the result is written to tag_with_inline_style,
    which is renamed and wrapped into new tags where needed.
    """

    def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
        # tag with inline style to be updated with style attribute
        self.tag_with_inline_style = tag_with_inline_style
        self.tag_initial_name = tag_with_inline_style.name
        # tag with inline style + style parsed from css file
        self.tag_with_ultimate_style = tag_with_ultimate_style
        self.style = self.preprocess_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """Function removes white text color if there is no bg color"""
        if 'background' in style_:
            return style_

        white_markers = ('color:#ffffff', 'color:#fff', 'color:white')
        if not any(marker in style_ for marker in white_markers):
            return style_

        # if bg color is inherited, just return style as is
        # white bg color not need to be checked as we do not write 'white bg color'
        tags_with_bg = ('span', 'td', 'tr', 'p')
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get('style')
            has_bg = parent_style and 'background' in parent_style
            if has_bg and parent_tag.name in tags_with_bg:
                return style_

        # children that have their own bg color get the white text color ...
        for child in tag.find_all():
            child_style = child.attrs.get('style')
            if child_style and 'background' in child_style:
                child.attrs['style'] = child_style + '; color:#fff; '

        # ... so this tag itself does not need the white color anymore
        for white in ('color:#fff;', 'color:#ffffff;', 'color:white;'):
            style_ = style_.replace(white, '')
        return style_

    @staticmethod
    def process_indents_to_px(split_style: dict) -> str:
        """
        Function cleans style using convert_tag_values() and returns new clean_style.

        Parameters
        ----------
        split_style: dict
            {property: value}

        Returns
        -------
        clean_style: str
            'property: value; ' for every item; margin/margin-left and
            text-indent are merged into a single text-indent in px
        """
        def digits_to_int(text) -> int:
            # keeps only the digits, so the sign and the unit are dropped
            return int("0" + "".join(filter(str.isdigit, str(text))))

        declarations = []
        for prop, value in split_style.items():
            if prop in ('text-indent', 'margin-left', 'margin'):
                parts = value.split(' ')
                # three values -> the middle one, otherwise the last one
                raw_value = parts[-2] if len(parts) == 3 else parts[-1]
                value = convert_tag_values(raw_value)
            declarations.append(prop + ': ' + value + '; ')
        clean_style = ''.join(declarations)

        margin_left_regexp = re.compile(
            r'((margin-left|margin): *(-*\w+);*)')
        text_indent_regexp = re.compile(
            r'(text-indent: *(-*\w+);*)')

        has_margin = re.search(margin_left_regexp, clean_style)
        has_text_indent = re.search(text_indent_regexp, clean_style)
        # formula_of_indent: indent = abs(margin - text_indent)
        if has_margin:
            num_m = digits_to_int(has_margin.group(3))
            if has_text_indent:
                num_ti = digits_to_int(has_text_indent.group(2))
                clean_style = clean_style.replace(
                    has_text_indent.group(1),
                    'text-indent: ' + str(abs(num_m - num_ti)) + 'px; ')
                return clean_style.replace(has_margin.group(1), '')
            return clean_style.replace(
                has_margin.group(1), 'text-indent: ' + str(num_m) + 'px; ')

        if has_text_indent:
            num_ti = digits_to_int(has_text_indent.group(2))
            return clean_style.replace(
                has_text_indent.group(1), 'text-indent: ' + str(num_ti) + 'px; ')
        return clean_style

    @staticmethod
    def _split_style(style: str) -> dict:
        """
        Split a style string into {property: value}.

        Spaces after ';' and after the first ':' of a declaration are removed.
        Empty pieces and pieces without ':' are skipped; a value may itself
        contain ':' because only the first colon separates it from the property.
        """
        split_style = {}
        for declaration in re.sub(r"; *", ";", style).split(';'):
            prop, colon, value = declaration.partition(':')
            if not colon:
                continue
            split_style[prop] = value.lstrip()
        return split_style

    def preprocess_style(self):
        """
        Build the style string this converter works with.

        The ultimate style (css + inline) is cleaned and its indents are
        converted to px; inline properties of the initial tag that are missing
        in the ultimate style are converted the same way and appended.

        Returns
        -------
        ultimate_style: str
        """
        ultimate_style = (
            self.tag_with_ultimate_style.attrs.get('style') or '') + ';'
        ultimate_style = self.remove_white_if_no_bgcolor(
            ultimate_style, self.tag_with_ultimate_style)
        ultimate_style = ultimate_style.replace(
            'background:', 'background-color:')
        ultimate_style = ultimate_style.replace(
            'list-style-image', 'list-style-type')

        split_ultimate_style = self._split_style(ultimate_style)
        ultimate_style = self.process_indents_to_px(split_ultimate_style)

        inline_style = self.tag_with_inline_style.attrs.get('style')
        if inline_style:
            split_inline_style = self._split_style(inline_style)
            # repetition check - only inline properties that are not already
            # present in the style parsed from css are added
            for prop in set(split_ultimate_style) & set(split_inline_style):
                split_inline_style.pop(prop)

            if split_inline_style:
                ultimate_style += self.process_indents_to_px(
                    split_inline_style)
        return ultimate_style

    def change_attrs_with_corresponding_tags(self):
        """
        Replace style declarations listed in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        with tags.

        The first match renames the initial tag, every further match wraps the
        previous one, and a new outer tag with the initial name receives the
        initial attributes and the remaining style. Without matches the style
        is simply written to the initial tag.

        Returns
        -------
        top_tag
            the initial tag (renamed if any declaration became a tag)
        """
        to_remove = check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            declaration = _find_declaration(
                self.style, attr, re.escape(value))
            if declaration:
                self.style = self.style.replace(declaration, '')
            self.style = self.style.strip()
            tag_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
            if not i:
                self.tag_with_inline_style.name = tag_name
                new_tags.append(self.tag_with_inline_style)
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(tag_name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_with_inline_style

        if new_tags:
            tmp_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            top_tag2 = BeautifulSoup(features='lxml').new_tag(
                self.tag_initial_name)
            top_tag2.attrs = tmp_attrs
            if self.style:
                top_tag2.attrs['style'] = self.style
            new_tags[-1].wrap(top_tag2)
        else:
            top_tag.attrs['style'] = self.style
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in p -> span"""
        if tag.name == 'p' and tag.attrs.get('style'):
            styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                                   if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
            p_style = ''
            initial_style = tag.attrs['style']
            split_style = initial_style.replace('; ', ';').split(';')
            possible_p_attrs_regexp = re.compile(
                r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
            for item in split_style:
                if re.search(possible_p_attrs_regexp, item):
                    p_style += item + ';'
                    initial_style = initial_style.replace(item + ';', '')

            # here check that this style is exactly the same.
            # Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
            styles_to_be_saved_in_span = [
                ((attr + ':') in initial_style) and (
                    '-' + attr not in initial_style)
                for attr in styles_cant_be_in_p]
            if any(styles_to_be_saved_in_span):
                # styles that cannot be in <p> were found -> move them to a
                # span that is wrapped in a new <p>
                tag.name = 'span'
                p_tag = BeautifulSoup(features='lxml').new_tag('p')
                list_style = _find_declaration(
                    initial_style, 'list-style-type')
                span_style = initial_style.replace(
                    list_style, '') if list_style else initial_style
                p_tag.attrs['style'] = p_style
                tag.attrs['style'] = span_style
                tag.wrap(p_tag)
            else:
                tag.attrs['style'] = p_style

    @staticmethod
    def wrap_span_in_li_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in li -> span"""
        if tag.name == 'li' and tag.attrs.get('style'):
            styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS
                                    if attr not in ['text-align', 'list-style-type']]
            # NOTE(review): this is a substring test, so e.g. 'align' is found
            # inside 'text-align' - confirm that this is intended
            if any(attr in tag.attrs.get('style') for attr in styles_cant_be_in_li):
                tag.name = 'span'
                li_tag = BeautifulSoup(features='lxml').new_tag('li')
                span_style = tag.attrs['style']
                li_style = ''
                for prop in ('text-align', 'list-style-type'):
                    declaration = _find_declaration(span_style, prop)
                    if declaration:
                        li_style += declaration
                        span_style = span_style.replace(declaration, '')
                li_tag.attrs['style'] = li_style
                tag.attrs['style'] = span_style
                tag.wrap(li_tag)

    @staticmethod
    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in ul/ol -> span"""
        if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
            styles_cant_be_in_ul_ol = [
                attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
            if any(attr in tag.attrs.get('style') for attr in styles_cant_be_in_ul_ol):
                # the wrapper keeps the list tag name, so it has to be read
                # before the tag itself is renamed to span
                oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
                tag.name = 'span'
                span_style = tag.attrs['style']
                oul_style = _find_declaration(span_style, 'list-style-type')
                if oul_style:
                    span_style = span_style.replace(oul_style, '')
                    oul_tag.attrs['style'] = oul_style
                tag.attrs['style'] = span_style
                tag.wrap(oul_tag)

    @staticmethod
    def wrap_span_in_h_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in h -> span"""
        h_regexp = re.compile('(^h[1-9]$)')

        if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
            h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
            tag.name = 'span'
            tag.wrap(h_tag)
            style = tag.attrs['style']
            list_style = _find_declaration(style, 'list-style-type')
            tag.attrs['style'] = style.replace(
                list_style, '') if list_style else style

    def convert_initial_tag(self):
        """Run all conversions on the initial tag and return it"""
        self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_p_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_li_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_h_to_save_style_attrs(self.tag_with_inline_style)
        return self.tag_with_inline_style


def convert_html_soup_with_css_style(html_soup: 'BeautifulSoup', css_text: str) -> 'BeautifulSoup':
    """
    Function adds styles from .css to inline style

    Every tag that may carry a style gets a temporary 'livecarta_id' attribute,
    so that the tag of html_soup can be paired with the same tag of the soup
    built from the html with inlined css. The attribute is removed again and
    html_soup is converted in place.

    Parameters
    ----------
    html_soup: BeautifulSoup
    css_text: str

    Returns
    -------
    html_soup: BeautifulSoup
        the same object that was passed in
    """
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    tags_with_possible_style_attr = html_soup.find_all(
        could_have_style_in_livecarta_regexp)
    for i, tag in enumerate(tags_with_possible_style_attr):
        tag.attrs['livecarta_id'] = i

    # here we add css styles to inline style
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')

    # one pass over the inlined soup instead of one search per id;
    # parsed attribute values are strings, the first tag with an id wins
    ultimate_tags = {}
    for inline_tag in inline_soup.find_all(attrs={'livecarta_id': True}):
        ultimate_tags.setdefault(inline_tag.attrs['livecarta_id'], inline_tag)

    # go through tags with possible style attrs
    for i, tag_with_initial_style in enumerate(tags_with_possible_style_attr):
        tag_with_ultimate_style = ultimate_tags.get(str(i))
        del tag_with_initial_style.attrs['livecarta_id']
        # a tag that is missing in the inlined html is left as it is
        if tag_with_ultimate_style is not None and tag_with_ultimate_style.attrs.get('style'):
            style_converter = TagStyleConverter(
                tag_with_initial_style, tag_with_ultimate_style)
            style_converter.convert_initial_tag()
    return html_soup


if __name__ == '__main__':
    file = '../../epub/9781627222174.epub'
    ebooklib_book = epub.read_epub(file)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    css_cleaned = build_css_content(css_)
    html_ = ebooklib_book.get_item_with_href(
        'pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

    print(convert_html_soup_with_css_style(html_soup, css_cleaned))