import re
from bisect import bisect_left
from itertools import takewhile  # noqa: F401  (kept so existing importers of this name keep working)
from logging import CRITICAL
from typing import List

import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform

from livecarta_config import LawCartaConfig
from util.color_reader import str2color_name

# Silence cssutils' parser warnings; only critical problems are reported.
cssutils.log.setLevel(CRITICAL)

# Relative font sizes (em / percent-as-ratio), ascending.  The leading -1 is a
# sentinel so that every ratio in (-1, 0.5] still lands in the first bucket.
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5,
            1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63,
            2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]

# Pixel size for the bucket at the same index in ``sizes_pr``.
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
            '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
            '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
            '48px', '49px', '50px', '64px', '72px']


def convert_font_size(value):
    """Convert a CSS ``font-size`` value to a pixel size string.

    :param value: raw CSS value, e.g. ``'12pt'``, ``'150%'`` or ``'1.5em'``.
    :return: a ``'<n>px'`` string, or ``''`` when the property should be
        dropped: the value equals ``LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE``
        in points, is exactly ``'100%'``, is larger than 5em / 500%, uses a unit
        other than pt / % / em, or cannot be parsed as a number.
    """
    if 'pt' in value:
        # Parse as float so fractional sizes such as '10.5pt' no longer raise
        # an uncaught ValueError (the previous int() call sat outside the try).
        try:
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            ratio = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            ratio = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if ratio > 5:
        return ''

    # Index of the last table entry strictly smaller than ``ratio``; this is
    # what the former takewhile()/list.index() pair computed.
    # NOTE(review): because the comparison is strict, an exact table value maps
    # to the *previous* bucket (1em -> '17px') and '72px' is unreachable.  That
    # looks like an off-by-one, but it is preserved here so rendered sizes do
    # not change - confirm the intended mapping before switching to "<=".
    bucket = bisect_left(sizes_pr, ratio) - 1
    if bucket < 0:
        # ratio is at/below the -1 sentinel (or NaN); previously an IndexError.
        return ''
    return sizes_px[bucket]


# LIVECARTA_STYLE_ATTRS = { css property: allowed values }
#
# Style properties that can be used to fit the livecarta css style convention.
# If a property has an empty list, any value can be converted.
# If a property has a non-empty list, only the listed property-value
# combinations can be transformed.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],  # ???
    'font': [],  # ???
    'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # becomes <strong>
    'font-style': ['italic'],  # becomes <i>
    'text-decoration': ['underline', 'line-through'],  # becomes <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # becomes <u> , <s>
    'vertical-align': ['super'],  # becomes <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border': []
}

# LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
#
# Warning: if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING
# should be updated to suit the livecarta style convention.
LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': lambda x: LawCartaConfig.INDENT if x != '0' else '',
    'font-variant': lambda x: x,
    'text-align': lambda x: x,
    'font': lambda x: '',
    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()),
    'font-size': convert_font_size,
    'color': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''),
    'background-color': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''),
    'background': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''),
    'border-top-width': lambda x: x if x != '0' else '',
    'border-right-width': lambda x: x if x != '0' else '',
    'border-left-width': lambda x: x if x != '0' else '',
    'border-bottom-width': lambda x: x if x != '0' else '',
}

# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    ('font-weight', 'bold'): 'strong',
    ('font-weight', '600'): 'strong',
    ('font-weight', '700'): 'strong',
    ('font-weight', '800'): 'strong',
    ('font-weight', '900'): 'strong',
    ('font-style', 'italic'): 'i',
    ('text-decoration', 'underline'): 'u',
    ('text-decoration', 'line-through'): 's',
    ('text-decoration-line', 'underline'): 'u',
    ('text-decoration-line', 'line-through'): 's',
    ('vertical-align', 'super'): 'sup',
}


def check_style_to_be_tag(style) -> List[tuple]:
    """Find the style declarations that must be expressed as tags.

    :param style: inline style string with declarations written as
        ``property:value`` (no space after the colon).
    :return: the ``(property, value)`` keys of
        ``LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG`` whose ``property:value`` text
        occurs in ``style``, in the table's order.  The caller removes these
        declarations from the style string.
    """
    return [key for key in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG if f'{key[0]}:{key[1]}' in style]


def update_property_to_livecarta_convention(rule, property_):
    """Rewrite or drop one property of ``rule`` to fit the livecarta convention.

    :param rule: cssutils style rule whose ``style`` declaration is edited in
        place.
    :param property_: the property of ``rule.style`` being examined.

    Assigning ``''`` to ``rule.style[name]`` is how a property is removed.
    """
    name = property_.name

    if name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        rule.style[name] = ''
        return

    cleaned_value = property_.value.replace('"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS[name]

    # A non-empty list restricts which values survive; an empty list means
    # the property can have any value.
    if allowed_values and cleaned_value not in allowed_values:
        # property + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        rule.style[name] = ''
        return

    mapper = LIVECARTA_STYLE_ATTRS_MAPPING.get(name)
    if mapper is not None:
        # apply transformation; properties without a mapper are kept untouched
        rule.style[name] = mapper(cleaned_value)


def clean_css(css):
    """Return ``css`` with every style rule adapted to the livecarta convention.

    :param css: stylesheet source text.
    :return: the serialized stylesheet, decoded to ``str``.

    Only rules of type ``STYLE_RULE`` are processed; other rules are left as
    parsed.
    """
    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        # Snapshot the properties first: the update below removes entries from
        # the very declaration being walked.
        for property_ in list(rule.style):
            update_property_to_livecarta_convention(rule, property_)
    # Public accessor for what the private ``_getCssText()`` returns.
    return sheet.cssText.decode()


def add_inline_style_to_html_soup(soup1, css_text):
    """Inline ``css_text`` into ``soup1`` and turn some styles into tags.

    :param soup1: BeautifulSoup document; it is modified in place.
    :param css_text: stylesheet text (normally the output of ``clean_css``).
    :return: ``soup1``.

    Every ``p``, ``span``, ``li``, ``ul``, ``ol``, ``td`` and every heading
    from ``h<SUPPORTED_LEVELS + 1>`` to ``h9`` gets a temporary
    ``livecarta_id`` attribute.  premailer then inlines the css into a copy of
    the document, and the computed ``style`` of each marked tag is copied back.
    Declarations listed in ``LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG`` are removed
    from the style and expressed as nested tags instead.  The temporary
    attribute is removed before returning.
    """
    h_regex = f'(^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$)'
    tag_name_pattern = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|' + h_regex)

    # Keep direct references to the marked tags instead of re-searching the
    # whole tree for every id afterwards.
    marked_tags = soup1.find_all(tag_name_pattern)
    for tmp_id, marked in enumerate(marked_tags):
        marked.attrs['livecarta_id'] = tmp_id

    html_with_inline_style = transform(str(soup1), css_text=css_text,
                                       remove_classes=False,
                                       external_styles=False,
                                       disable_validation=True)
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    # One pass over the inlined document; the first tag carrying an id wins,
    # as it did with ``soup2.find``.  Ids come back from the parser as text.
    styled_by_id = {}
    for styled in soup2.find_all(attrs={'livecarta_id': True}):
        styled_by_id.setdefault(str(styled.attrs['livecarta_id']), styled)

    # A single empty document is enough to create all the new tags.
    tag_factory = BeautifulSoup(features='lxml')

    for tmp_id, tag in enumerate(marked_tags):
        tag_initial_name = tag.name
        tag_with_style = styled_by_id.get(str(tmp_id))
        inline_style = tag_with_style.attrs.get('style') if tag_with_style is not None else None

        if not inline_style:
            # Nothing was inlined for this tag (or it is missing from the
            # inlined document): only drop the temporary marker.
            del tag.attrs['livecarta_id']
            continue

        # The trailing ';' lets every declaration be removed as 'prop:value;'.
        style = inline_style + ';'
        style = style.replace('background:', 'background-color:')
        to_remove = check_style_to_be_tag(style)

        if not to_remove:
            tag.attrs['style'] = style
            del tag.attrs['livecarta_id']
            if tag.name == 'p' and 'background-color' in style:
                tag.name = 'span'
            continue

        # The original tag is renamed to the first semantic tag; each further
        # one wraps the previous, so the chain grows outwards.
        chain = []
        for position, (prop, prop_value) in enumerate(to_remove):
            style = style.replace(f'{prop}:{prop_value};', '')
            semantic_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(prop, prop_value)]
            if position == 0:
                tag.name = semantic_name
                chain.append(tag)
            else:
                wrapper = tag_factory.new_tag(semantic_name)
                chain[-1].wrap(wrapper)
                chain.append(wrapper)

        # Recreate the element under its initial name around the chain and move
        # the attributes (minus the temporary id) onto it.
        style = style.strip()
        carried_attrs = tag.attrs.copy()
        tag.attrs = {}
        outer_tag = tag_factory.new_tag(tag_initial_name)
        outer_tag.attrs = carried_attrs
        if style:
            outer_tag.attrs['style'] = style
        del outer_tag.attrs['livecarta_id']
        chain[-1].wrap(outer_tag)

    return soup1


if __name__ == '__main__':
    # Developer smoke test against a local sample book.
    file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
    ebooklib_book = epub.read_epub(file)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    css_cleaned = clean_css(css_)
    html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')
    print(add_inline_style_to_html_soup(html_soup, css_cleaned))