diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py new file mode 100644 index 0000000..f2dc536 --- /dev/null +++ b/src/epub_converter/css_preprocessing.py @@ -0,0 +1,238 @@ +import re +import cssutils + +from ebooklib import epub +from bs4 import BeautifulSoup +from itertools import takewhile + +from src.util.color_reader import str2hex +from src.livecarta_config import LiveCartaConfig + + +def get_text_color(x): + color = str2hex(x) + color = color if color not in ['#000000', '#000', 'black'] else '' + return color + + +def get_bg_color(x): + color = str2hex(x) + color = color if color not in ['#ffffff', '#fff', 'white'] else '' + return color + + +def convert_tag_style_values(size_value: str) -> str: + """ + Function + - converts values of tags from em/%/pt to px + - find closest font-size px + Parameters + ---------- + size_value: str + + Returns + ------- + size_value: str + + """ + def find_closest_size(style_value): + possible_sizes = list( + takewhile(lambda x: style_value > x, LiveCartaConfig.sizes_pr)) + last_possible_size_index = LiveCartaConfig.sizes_pr.index( + possible_sizes[-1]) + return LiveCartaConfig.sizes_px[last_possible_size_index] + + font_size_regexp = re.compile( + r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') + has_style_attrs = re.search(font_size_regexp, size_value) + if has_style_attrs: + if has_style_attrs.group(1): + size_value = float(size_value.replace('%', '')) / 100.0 + return find_closest_size(size_value) + elif has_style_attrs.group(3): + size_value = float(size_value.replace('em', '')) + return find_closest_size(size_value) + elif has_style_attrs.group(5): + return size_value.replace('pt', 'px') + else: + return '' + return size_value + + +def convert_indents_tag_values(size_value: str) -> str: + """ + Function converts values of ['text-indent', 'margin-left', 'margin'] + Parameters + ---------- + size_value: str + + Returns + ------- + size_value: str + + """ + if 
len(size_value.split(' ')) == 3: + size_value = convert_tag_style_values(size_value.split( + ' ')[-2]) # returns middle value + else: + size_value = convert_tag_style_values(size_value.split( + ' ')[-1]) # returns last value + return size_value + + +""" +Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } +Style properties that can be used to fit livecarta css style convention. +If property has empty list, it means that any value can be converted. +If property has not empty list, it means that only certain property-value combinations can be transformed. +""" +LIVECARTA_STYLE_ATTRS = { + 'text-indent': [], + 'font-variant': ['small-caps'], + 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], + 'align': [], + 'font': [], + 'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() + if x != LiveCartaConfig.DEFAULT_FONT_NAME], + 'font-size': [], + 'font-weight': ['bold', '600', '700', '800', '900'], # + 'font-style': ['italic'], # + 'text-decoration': ['underline', 'line-through'], # , + 'text-decoration-line': ['underline', 'line-through'], # , + 'vertical-align': ['super'], # + 'color': [], + 'background-color': [], + 'background': [], + 'width': [], + 'border': [], + 'border-top-width': [], + 'border-right-width': [], + 'border-left-width': [], + 'border-bottom-width': [], + 'border-top': [], + 'border-bottom': [], + 'list-style-type': [], + 'list-style-image': [], + 'margin-left': [], + 'margin-top': [], + 'margin': [], +} + +""" +Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } + +Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated +to suit livecarta style convention. 
+""" +LIVECARTA_STYLE_ATTRS_MAPPING = { + 'text-indent': convert_indents_tag_values, + 'font-variant': lambda x: x, + 'text-align': lambda x: x, + 'font': lambda x: '', + 'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(x) or + LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(x.capitalize()), + 'font-size': convert_tag_style_values, + 'color': get_text_color, + 'background-color': get_bg_color, + 'background': get_bg_color, + 'border': lambda x: x if x != '0' else '', + 'border-top-width': lambda x: x if x != '0' else '', + 'border-right-width': lambda x: x if x != '0' else '', + 'border-left-width': lambda x: x if x != '0' else '', + 'border-bottom-width': lambda x: x if x != '0' else '', + 'border-top': lambda x: x if x != '0' else '', + 'border-bottom': lambda x: x if x != '0' else '', + 'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc', + 'list-style-image': lambda x: 'disc', + 'margin-left': convert_indents_tag_values, + 'margin-top': convert_tag_style_values, + 'margin': convert_indents_tag_values +} + + +def update_inline_styles_to_livecarta_convention(split_style: list): + for i, style in enumerate(split_style): + style_name, style_value = style.split(":") + if style_name not in LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = '' + return split_style + + cleaned_value = style_value.replace('\"', '').split()[-1] + constraints_on_value = LIVECARTA_STYLE_ATTRS.get( + style_name) + value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ + style_name] + if constraints_on_value and value_not_in_possible_values_list: + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = '' + else: + if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name] + style_value = func(cleaned_value) + split_style[i] = style_name + ":" + 
style_value + return split_style + + +def build_inline_style_content(style: str) -> str: + """Build inline style with livecarta convention""" + # replace all spaces between '; & letter' to ';' + style = re.sub(r"; *", ";", style) + # when we split style by ';', last element of the list is ''-None + # remove it + split_style: list = list(filter(None, style.split(';'))) + # replace all spaces between ': & letter' to ':' + split_style = [el.replace( + re.search(r'(:\s*)', el).group(1), ':') for el in split_style] + + split_style = update_inline_styles_to_livecarta_convention(split_style) + style = "; ".join(split_style) + return style + + +def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule, + style_type: cssutils.css.property.Property): + if style_type.name not in LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = '' + return + + cleaned_value = style_type.value.replace('\"', '').split(', ')[-1] + constraints_on_value = LIVECARTA_STYLE_ATTRS.get( + style_type.name) + value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ + style_type.name] + if constraints_on_value and value_not_in_possible_values_list: + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = '' + else: + if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] + css_rule.style[style_type.name] = func(cleaned_value) + + +def build_css_file_content(css_content: str) -> str: + """Build css content with livecarta convention""" + sheet = cssutils.parseString(css_content, validate=False) + + for css_rule in sheet: + if css_rule.type == css_rule.STYLE_RULE: + for style_type in css_rule.style: + update_css_styles_to_livecarta_convention( + css_rule, style_type) + + css_text: str = sheet._getCssText().decode() + return css_text + + 
+if __name__ == '__main__': + file = '../../epub/9781627222174.epub' + ebooklib_book = epub.read_epub(file) + css_ = ebooklib_book.get_item_with_href('css/epub.css') + css_ = css_.get_content().decode() + css_cleaned = build_css_file_content(css_) + html_ = ebooklib_book.get_item_with_href( + 'pr01s05.xhtml').get_body_content().decode() + html_soup = BeautifulSoup(html_, features='lxml') diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py deleted file mode 100644 index 7e768b8..0000000 --- a/src/epub_converter/css_reader.py +++ /dev/null @@ -1,557 +0,0 @@ -import re -import cssutils -from typing import List - -from ebooklib import epub -from logging import CRITICAL -from bs4 import BeautifulSoup -from premailer import transform -from itertools import takewhile - -from src.util.color_reader import str2hex -from src.livecarta_config import LiveCartaConfig - -cssutils.log.setLevel(CRITICAL) - - -sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, - 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, - 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, - 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] - -sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', - '17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px', - '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', - '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', - '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] - -list_types = ['circle', 'disc', 'armenian', 'decimal', - 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', - 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] - - -def convert_tag_style_values(value: str) -> str: - """ - Function - - converts values of tags from em/%/pt to px - - find closest font-size px - Parameters - ---------- - value: str - - Returns - ------- 
- value: str - - """ - def find_closest_size(size_value): - possible_sizes = list(takewhile(lambda x: size_value > x, sizes_pr)) - last_possible_size_index = sizes_pr.index(possible_sizes[-1]) - return sizes_px[last_possible_size_index] - - font_size_regexp = re.compile( - r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') - has_style_attrs = re.search(font_size_regexp, value) - if has_style_attrs: - if has_style_attrs.group(1): - value = float(value.replace('%', '')) / 100.0 - return find_closest_size(value) - elif has_style_attrs.group(3): - value = float(value.replace('em', '')) - return find_closest_size(value) - elif has_style_attrs.group(5): - return value.replace('pt', 'px') - else: - return '' - return value - - -""" -Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } -Style properties that can be used to fit livecarta css style convention. -If property has empty list, it means that any value can be converted. -If property has not empty list, it means that only certain property-value combinations can be transformed. 
-""" -LIVECARTA_STYLE_ATTRS = { - 'text-indent': [], - 'font-variant': ['small-caps'], - 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - 'align': [], - 'font': [], - 'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys() - if x != LiveCartaConfig.DEFAULT_FONT_NAME], - 'font-size': [], - 'font-weight': ['bold', '600', '700', '800', '900'], # - 'font-style': ['italic'], # - 'text-decoration': ['underline', 'line-through'], # , - 'text-decoration-line': ['underline', 'line-through'], # , - 'vertical-align': ['super'], # - 'color': [], - 'background-color': [], - 'background': [], - 'width': [], - 'border': [], - 'border-top-width': [], - 'border-right-width': [], - 'border-left-width': [], - 'border-bottom-width': [], - 'border-top': [], - 'border-bottom': [], - 'list-style-type': [], - 'list-style-image': [], - 'margin-left': [], - 'margin-top': [], - 'margin': [], -} - - -def get_bg_color(x): - color = str2hex(x) - color = color if color not in ['#ffffff', '#fff', 'white'] else '' - return color - - -def get_text_color(x): - color = str2hex(x) - color = color if color not in ['#000000', '#000', 'black'] else '' - return color - - -""" -Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } - -Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated -to suit livecarta style convention. 
-""" -LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': convert_tag_style_values, - 'font-variant': lambda x: x, - 'text-align': lambda x: x, - 'font': lambda x: '', - 'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or - LiveCartaConfig.font_correspondence_table.get(x.capitalize()), - 'font-size': convert_tag_style_values, - 'color': get_text_color, - 'background-color': get_bg_color, - 'background': get_bg_color, - 'border': lambda x: x if x != '0' else '', - 'border-top-width': lambda x: x if x != '0' else '', - 'border-right-width': lambda x: x if x != '0' else '', - 'border-left-width': lambda x: x if x != '0' else '', - 'border-bottom-width': lambda x: x if x != '0' else '', - 'border-top': lambda x: x if x != '0' else '', - 'border-bottom': lambda x: x if x != '0' else '', - 'list-style-type': lambda x: x if x in list_types else 'disc', - 'list-style-image': lambda x: 'disc', - 'margin-left': convert_tag_style_values, - 'margin-top': convert_tag_style_values, - 'margin': convert_tag_style_values, -} - -""" -LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } - -

List[tuple]: - """ - Function searches style properties that can be converted to tags. - It searches for them and prepare list of properties to be removed from style string - Parameters - ---------- - style: str - - - Returns - ------- - to_remove: list - properties to remove - - """ - to_remove = [] - for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: - if f'{k[0]}:{k[1]}' in style: - to_remove.append(k) - return to_remove - - -def update_css_style_types_to_livecarta_convention(css_rule, style_type): - if style_type.name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' - return - - cleaned_value = style_type.value.replace('\"', '') # value of style - there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(style_type.name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_type.name] - if there_are_constraints_on_value and value_not_in_possible_values_list: - # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' - else: - if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] - css_rule.style[style_type.name] = func(cleaned_value) - - -def build_css_content(css_content): - """Build css content with livecarta convention""" - sheet = cssutils.parseString(css_content, validate=False) - - for css_rule in sheet: - if css_rule.type == css_rule.STYLE_RULE: - for style_type in css_rule.style: - update_css_style_types_to_livecarta_convention( - css_rule, style_type) - - css_text = sheet._getCssText().decode() - return css_text - - -class TagStyleConverter: - def __init__(self, tag_with_inline_style, tag_with_ultimate_style): - # tag with inline style to be updated with style attribute - self.tag_with_inline_style = tag_with_inline_style - self.tag_initial_name = tag_with_inline_style.name - # tag with inline 
style + style parsed from css file - self.tag_with_ultimate_style = tag_with_ultimate_style - self.style = self.preprocess_style() - - @staticmethod - def remove_white_if_no_bgcolor(style_, tag): - """Function remove text white color if there is no bg color""" - if 'background' in style_: - return style_ - - # if text color is white, check that we have bg-color - if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): - # if bg color is inherited, just return style as is - for parent_tag in tag.parents: - # white bg color not need to be checked as we do not write 'white bg color' - tag_with_bg = ['span', 'td', 'tr', 'p'] - tag_will_be_saved = parent_tag.name in tag_with_bg - has_bg = parent_tag.attrs.get('style') and ( - 'background' in parent_tag.attrs.get('style')) - if has_bg and tag_will_be_saved: - return style_ - - children = tag.find_all() - for child in children: - if child.attrs.get('style') and ('background' in child.attrs.get('style')): - tmp_style = child.attrs['style'] + '; color:#fff; ' - child.attrs['style'] = tmp_style - - # for child with bg color we added white text color, so this tag don't need white color - style_ = style_.replace('color:#fff;', '') - style_ = style_.replace('color:#ffffff;', '') - style_ = style_.replace('color:white;', '') - return style_ - - @staticmethod - def process_indents_to_px(split_style: dict) -> str: - """Function cleans style string using convert_tag_values() and returns new clean_style""" - split_style = [k + ":" + v for k, v in split_style.items()] - clean_style = '' - for item in split_style: - item = item.split(':') - if item[0] in ['text-indent', 'margin-left', 'margin']: - if len(item[1].split(' ')) == 3: - item[1] = convert_tag_style_values(item[1].split( - ' ')[-2]) # split returns middle value - else: - item[1] = convert_tag_style_values(item[1].split( - ' ')[-1]) # split returns last value - clean_style += item[0] + ': ' + item[1] + '; ' - - margin_left_regexp = re.compile( 
- r'((margin-left|margin): *(-*\w+);*)') - text_indent_regexp = re.compile( - r'(text-indent: *(-*\w+);*)') - - has_margin = re.search(margin_left_regexp, clean_style) - has_text_indent = re.search(text_indent_regexp, clean_style) - # formula_of_indent: indent = abs(margin - text_indent) - if has_margin: - num_m = abs(int("0" + "".join( - filter(str.isdigit, str(has_margin.group(3)))))) - - if has_text_indent: - num_ti = abs(int("0" + "".join( - filter(str.isdigit, str(has_text_indent.group(2)))))) - clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(num_m - num_ti)) + 'px; ') - clean_style = clean_style.replace(has_margin.group(1), '') - return clean_style - - clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' + - str(abs(num_m)) + 'px; ') - return clean_style - - elif has_text_indent: - clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(int("0" + "".join( - filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ') - return clean_style - return clean_style - - def preprocess_style(self): - def remove_extra_spaces(style: str) -> dict: - """Function to remove extra spaces in style to process clean_style""" - # replace all spaces between '; & letter' to ';' - style = re.sub(r"; *", ";", style) - split_style: List = style.split(';') - - # when we split style by ; and we have at the end ; that's why we have '' in list - while '' in split_style: - split_style.remove('') - - # replace all spaces between ': & letter' to ':' - split_style = [el.replace( - re.search(r'(:\s*)', el).group(1), ':') for el in split_style] - dict = {} - for list_item in split_style: - key, val = list_item.split(":") - dict[key] = val - return dict - - ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';' - ultimate_style = self.remove_white_if_no_bgcolor( - ultimate_style, self.tag_with_ultimate_style) - ultimate_style = ultimate_style.replace( - 'background:', 
'background-color:') - ultimate_style = ultimate_style.replace( - 'list-style-image', 'list-style-type') - - split_ultimate_style: dict = remove_extra_spaces(ultimate_style) - ultimate_style: str = self.process_indents_to_px(split_ultimate_style) - - if self.tag_with_inline_style.attrs.get('style'): - inline_style = self.tag_with_inline_style.attrs['style'] - - split_inline_style: dict = remove_extra_spaces(inline_style) - - # repetition check - if the tag had already had inline style - # that isn't in the css styles, add this to style parsed from css - repeat_styles = list(set(split_ultimate_style.keys()) - & set(split_inline_style.keys())) - - # remove styles(css) that are in css and inline - [split_inline_style.pop(item) for item in repeat_styles] - - if split_inline_style: - # if split_inline_style is not empty - start convert and add to ultimate style - print('we enter repetition check', '\n') - inline_style: str = self.process_indents_to_px( - split_inline_style) - ultimate_style += inline_style - - return ultimate_style - - def change_attrs_with_corresponding_tags(self): - # adds , , , etc - to_remove = check_style_to_be_tag(self.style) - new_tags = [] - for i, (attr, value) in enumerate(to_remove): - s = f'{attr}:{value};' - self.style = self.style.replace(s, '') - self.style = self.style.strip() - if not i: - self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( - attr, value)] - new_tags.append(self.tag_with_inline_style) - else: - name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] - new_tag = BeautifulSoup(features='lxml').new_tag(name) - new_tags[-1].wrap(new_tag) - new_tags.append(new_tag) - - top_tag = self.tag_with_inline_style - - if new_tags: - tmp_attrs = top_tag.attrs.copy() - top_tag.attrs = {} - top_tag2 = BeautifulSoup(features='lxml').new_tag( - self.tag_initial_name) - top_tag2.attrs = tmp_attrs - if self.style: - top_tag2.attrs['style'] = self.style - new_tags[-1].wrap(top_tag2) - else: - top_tag.attrs['style'] = 
self.style - - return top_tag - - @staticmethod - def wrap_span_in_p_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in p -> span""" - if tag.name == 'p' and tag.attrs.get('style'): - styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS - if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] - p_style = '' - initial_style = tag.attrs['style'] - split_style = initial_style.replace('; ', ';').split(';') - possible_p_attrs_regexp = re.compile( - r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)') - for item in split_style: - has_p_style_attrs = re.search(possible_p_attrs_regexp, item) - if has_p_style_attrs: - p_style += item + ';' - initial_style = initial_style.replace(item + ';', '') - # here check that this style i exactly the same. - # Not 'align' when we have 'text-align', or 'border' when we have 'border-top' - styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( - '-' + attr not in initial_style) for attr in styles_cant_be_in_p] - if any(styles_to_be_saved_in_span): - # if find styles that cannot be in

-> wrap them in span - tag.name = 'span' - p_tag = BeautifulSoup(features='lxml').new_tag('p') - p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_p_style_attr = re.search(p_attrs_regexp, initial_style) - span_style = initial_style if not has_p_style_attr else initial_style.replace( - has_p_style_attr.group(1), '') - p_tag.attrs['style'] = p_style - tag.attrs['style'] = span_style - tag.wrap(p_tag) - else: - tag.attrs['style'] = p_style - - @staticmethod - def wrap_span_in_li_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in li -> span""" - if tag.name == 'li' and tag.attrs.get('style'): - styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if - attr not in ['text-align', 'list-style-type']] - - styles_to_be_saved_in_span = [attr in tag.attrs.get( - 'style') for attr in styles_cant_be_in_li] - if any(styles_to_be_saved_in_span): - tag.name = 'span' - li_tag = BeautifulSoup(features='lxml').new_tag('li') - span_style = tag.attrs['style'] - li_style = '' - for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), - re.compile(r'(list-style-type:(\w+);)')]: - has_li_style_attrs = re.search( - possible_li_attrs_regexp, span_style) - if has_li_style_attrs and has_li_style_attrs.group(1): - li_style += has_li_style_attrs.group(1) - span_style = span_style.replace( - has_li_style_attrs.group(1), '') - li_tag.attrs['style'] = li_style - tag.attrs['style'] = span_style - tag.wrap(li_tag) - - @staticmethod - def wrap_span_in_ul_ol_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in ul/ol -> span""" - if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): - styles_cant_be_in_ul_ol = [ - attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] - - styles_to_be_saved_in_span = [attr in tag.attrs.get('style') - for attr in styles_cant_be_in_ul_ol] - if any(styles_to_be_saved_in_span): - tag.name = 'span' - oul_tag = 
BeautifulSoup(features='lxml').new_tag(tag.name) - span_style = tag.attrs['style'] - - possible_uol_attrs_regexp = re.compile( - r'(list-style-type:(\w+);)') - has_uol_style_attrs = re.search( - possible_uol_attrs_regexp, span_style) - if has_uol_style_attrs and has_uol_style_attrs.group(1): - oul_style = has_uol_style_attrs.group(1) - span_style = span_style.replace(oul_style, '') - oul_tag.attrs['style'] = oul_style - tag.attrs['style'] = span_style - tag.wrap(oul_tag) - - @staticmethod - def wrap_span_in_h_to_save_style_attrs(tag): - """Function designed to save style attrs that cannot be in h -> span""" - h_regexp = re.compile('(^h[1-9]$)') - - if re.search(h_regexp, tag.name) and tag.attrs.get('style'): - h_tag = BeautifulSoup(features='lxml').new_tag(tag.name) - tag.name = 'span' - tag.wrap(h_tag) - style = tag.attrs['style'] - h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_h_style_attr = re.search(h_attrs_regexp, style) - tag.attrs['style'] = style if not has_h_style_attr else style.replace( - has_h_style_attr.group(1), '') - - def convert_initial_tag(self): - self.tag_with_inline_style = self.change_attrs_with_corresponding_tags() - self.wrap_span_in_p_to_save_style_attrs(self.tag_with_inline_style) - self.wrap_span_in_li_to_save_style_attrs(self.tag_with_inline_style) - self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_with_inline_style) - self.wrap_span_in_h_to_save_style_attrs(self.tag_with_inline_style) - return self.tag_with_inline_style - - -def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: - """Function adds styles from .css to inline style""" - css_text = css_text.replace( - '@namespace epub "http://www.idpf.org/2007/ops";', '') - livecarta_tmp_ids = [] - could_have_style_in_livecarta_regexp = re.compile( - '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') - tags_with_possible_style_attr = html_soup.find_all( - 
could_have_style_in_livecarta_regexp) - for i, x in enumerate(tags_with_possible_style_attr): - if i == 2: - pass - x.attrs['livecarta_id'] = i - livecarta_tmp_ids.append(i) - - # here we add css styles to inline style - html_with_css_styles: str = transform(str(html_soup), css_text=css_text, - remove_classes=False, - external_styles=False, - allow_network=False, - disable_validation=True, - ) - - inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') - - # go through tags with possible style attrs - for i in livecarta_tmp_ids: - tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i}) - tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i}) - del tag_with_initial_style.attrs['livecarta_id'] - if tag_with_ultimate_style.attrs.get('style'): - style_converter = TagStyleConverter( - tag_with_initial_style, tag_with_ultimate_style) - style_converter.convert_initial_tag() - - return html_soup - - -if __name__ == '__main__': - file = '../../epub/9781627222174.epub' - ebooklib_book = epub.read_epub(file) - css_ = ebooklib_book.get_item_with_href('css/epub.css') - css_ = css_.get_content().decode() - css_cleaned = build_css_content(css_) - html_ = ebooklib_book.get_item_with_href( - 'pr01s05.xhtml').get_body_content().decode() - html_soup = BeautifulSoup(html_, features='lxml') - - print(convert_html_soup_with_css_style(html_soup, css_cleaned)) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 7e5e389..2e40dcd 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -17,7 +17,8 @@ from bs4 import BeautifulSoup, Tag from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint -from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style +from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content +from 
src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ prepare_title, prepare_content, update_images_src_links, preprocess_footnotes @@ -68,6 +69,8 @@ class EpubConverter: BeautifulSoup] = self.build_href2soup_content() # TODO Presets + self.logger.log('Process CSS inline styles.') + self.process_inline_styles_in_html_soup() self.logger.log('CSS files processing.') self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() self.logger.log('CSS styles adding.') @@ -106,7 +109,7 @@ class EpubConverter: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: # using EpubElements - # for now just for HTML objects, as it is simplest chapter + # for now just for HTML objects, as it is the simplest chapter nodes = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): @@ -122,6 +125,7 @@ class EpubConverter: path_to_css_from_root = normpath( join(html_folder, path_to_css_from_html)).replace('\\', '/') css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) + # if in css file we import another css if "@import" in str(css_obj.content): path_to_css_from_root = "css/" + \ re.search('"(.*)"', str(css_obj.content)).group(1) @@ -131,12 +135,26 @@ class EpubConverter: css_content: str = css_obj.get_content().decode() return css_content + def process_inline_styles_in_html_soup(self): + """This function is designed to convert inline html styles""" + for html_href in self.html_href2html_body_soup: + html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] + could_have_style_in_livecarta_regexp = re.compile( + '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') + tags_with_inline_style = html_content.find_all(could_have_style_in_livecarta_regexp, + attrs={'style': re.compile('.*')}) + + for 
tag_initial_inline_style in tags_with_inline_style: + inline_style = tag_initial_inline_style.attrs['style'] + tag_initial_inline_style.attrs['style'] = \ + build_inline_style_content(inline_style) + def build_html_and_css_relations(self) -> tuple[dict, dict]: """ Function is designed to get 2 dictionaries: - The first is css_href2css_content. It is created to connect href of css to content of css - The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them + The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them ) which are used on this html + The second is css_href2css_content. It is created to connect href of css to content of css ...2... = key2value Returns ---------- @@ -154,26 +172,27 @@ class EpubConverter: soup_html_content = BeautifulSoup(html_content, features='lxml') # check if file links to css file for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): + # alternate page of original page (e.g. 
another language) if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): continue css_href = tag.attrs.get('href') html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict - css_href2css_content[css_href] = build_css_content( + css_href2css_content[css_href] = build_css_file_content( self.get_css_content(css_href, html_href)) for i, tag in enumerate(soup_html_content.find_all('style')): css_content = tag.string html_href2css_href[html_href].append(f'href{i}') - css_href2css_content[f'href{i}'] = build_css_content( + css_href2css_content[f'href{i}'] = build_css_file_content( css_content) return html_href2css_href, css_href2css_content def add_css_styles_to_html_soup(self): """ This function is designed to update html_href2html_body_soup - And add to html_inline_style css_style_content + - add to html_inline_style css_style_content """ for html_href in self.html_href2html_body_soup: @@ -181,9 +200,9 @@ class EpubConverter: css = '' for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] - content: BeautifulSoup = self.html_href2html_body_soup[html_href] - content = convert_html_soup_with_css_style(content, css) - self.html_href2html_body_soup[html_href] = content + html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] + html_content = convert_html_soup_with_css_style(html_content, css) + self.html_href2html_body_soup[html_href] = html_content def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): """ @@ -191,7 +210,7 @@ class EpubConverter: self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc key = -1 if root(top chapters), - value = None if leaf(least chapters) + value = None if leaf(the least chapters) Parameters ---------- element: [Link, tuple, list] @@ -299,8 +318,7 @@ class EpubConverter: # go to line structure for html_href in self.html_href2html_body_soup: 
soup = self.html_href2html_body_soup[html_href] - self.html_href2html_body_soup[html_href] = unwrap_structural_tags( - soup) + self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) @staticmethod def create_unique_id(href, id_): @@ -314,7 +332,7 @@ class EpubConverter: new_anchor_span.string = "\xa0" return new_anchor_span - def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str: + def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]: """ Function used to find full path to file that is parsed from tag link TOC: a/b/c.xhtml @@ -327,7 +345,7 @@ class EpubConverter: href_in_link: str filename got from tag link, like file1.xhtml internal_link_tag: Tag - tag object that is parsed now + object that is parsed now Returns ------- @@ -362,6 +380,10 @@ class EpubConverter: 1. rebuild ids to be unique in all documents 2a. process anchor which is a whole xhtml file 2b. process anchor which is an element in xhtml file + Returns + ------- + None + process links in html """ # 1. rebuild ids to be unique in all documents @@ -393,14 +415,14 @@ class EpubConverter: if new_id not in self.internal_anchors: anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] new_anchor_span = self.create_new_anchor_span(soup, new_id) - # insert a new span to the begin of the file + # insert a new span to the beginning of the file anchor_soup.insert(0, new_anchor_span) self.internal_anchors.add(new_id) del internal_link_tag.attrs['href'] # 2b. 
class TagStyleConverter:
    """
    Converts the inline style of a single tag to the LiveCarta convention.

    The tag is expected to carry a ``style`` attribute that already holds
    the declarations inlined from the css files together with the tag's own
    inline declarations. The processed style string is stored in
    ``self.style``; ``convert_initial_tag`` then rewrites the tag itself.
    """

    # One 'list-style-type' declaration. Values may be hyphenated
    # ('lower-alpha', 'decimal-leading-zero'), so a plain word match is not enough.
    _LIST_STYLE_TYPE_REGEXP = re.compile(r'(list-style-type:([\w-]+);)')
    # Indent declarations. The value may contain a decimal point ('12.5px').
    _MARGIN_LEFT_REGEXP = re.compile(r'((margin-left|margin): *(-*[\w.]+);*)')
    _TEXT_INDENT_REGEXP = re.compile(r'(text-indent: *(-*[\w.]+);*)')
    # First (possibly fractional) unsigned number inside a css value.
    _FIRST_NUMBER_REGEXP = re.compile(r'\d*\.?\d+')
    # Parents whose background is taken into account for white text.
    _TAGS_WITH_BG = ('span', 'td', 'tr', 'p')

    def __init__(self, tag_inline_style):
        # tag with inline style + style parsed from css file
        self.tag_inline_style = tag_inline_style
        self.style = self.process_inline_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """
        Function removes white text color if there is no background color
        Parameters
        ----------
        style_: str
            inline style of the tag
        tag: Tag
            tag that owns the style; only inspected when the text is white

        Returns
        -------
        style_: str
            style with 'background:' renamed to 'background-color:', or with
            the white text color removed, or unchanged

        """
        if 'background' in style_:
            # the tag has its own background: only normalise the property name
            return style_.replace('background:', 'background-color:')

        white_markers = ('color:#ffffff', 'color:#fff', 'color:white')
        if not any(marker in style_ for marker in white_markers):
            return style_

        # if bg color is inherited from a parent that is kept, leave the style as is
        # (white bg color is not checked because 'white bg color' is never written)
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get('style')
            has_bg = parent_style and ('background' in parent_style)
            if has_bg and parent_tag.name in TagStyleConverter._TAGS_WITH_BG:
                return style_

        # children with their own background get the white text color instead
        for child in tag.find_all():
            child_style = child.attrs.get('style')
            if child_style and ('background' in child_style):
                child.attrs['style'] = child_style + '; color:#fff; '

        # ... so this tag itself does not need the white color anymore
        for white_color in ('color:#fff;', 'color:#ffffff;', 'color:white;'):
            style_ = style_.replace(white_color, '')
        return style_

    @staticmethod
    def duplicate_styles_check(split_style: list) -> list:
        """
        Function removes duplicated properties from the list of declarations
        Parameters
        ----------
        split_style: list
            list of 'property:value' declarations

        Returns
        -------
        split_style: list
            one 'property:value' item per property, in order of first
            appearance; when a property is repeated the last value wins.
            Items without a property name or without ':' are dropped.

        """
        style_name2style_value = {}
        for declaration in split_style:
            # split on the FIRST ':' only - values like url(http://...) contain ':'
            name, separator, value = declaration.partition(":")
            name = name.strip()
            if not separator or not name:
                continue
            style_name2style_value[name] = value.strip()
        return [name + ":" + value
                for name, value in style_name2style_value.items()]

    @staticmethod
    def _indent_size(value: str) -> int:
        """Function returns the unsigned integer part of the first number in value (0 if none)"""
        found = TagStyleConverter._FIRST_NUMBER_REGEXP.search(value)
        return int(float(found.group())) if found else 0

    @staticmethod
    def indents_processing(split_style: list) -> str:
        """
        Function process indents from left using
        formula_of_indent: indent = abs(margin - text_indent)
        Parameters
        ----------
        split_style: list
            list of styles split by ';'

        Returns
        ----------
        processed_style: str
            processed style with counted indent

        """
        processed_style = ";".join(split_style)

        has_margin = TagStyleConverter._MARGIN_LEFT_REGEXP.search(processed_style)
        has_text_indent = TagStyleConverter._TEXT_INDENT_REGEXP.search(
            processed_style)

        if not has_margin and not has_text_indent:
            return processed_style

        if not has_margin:
            # only text-indent: rewrite it as an unsigned px value
            num_ti = TagStyleConverter._indent_size(has_text_indent.group(2))
            return processed_style.replace(
                has_text_indent.group(1), 'text-indent: ' + str(num_ti) + 'px; ')

        num_m = TagStyleConverter._indent_size(has_margin.group(3))
        if not has_text_indent:
            # only margin: it becomes the text-indent
            return processed_style.replace(
                has_margin.group(1), 'text-indent: ' + str(num_m) + 'px; ')

        # both: text-indent gets the combined value, the margin is dropped
        num_ti = TagStyleConverter._indent_size(has_text_indent.group(2))
        processed_style = processed_style.replace(
            has_text_indent.group(1),
            'text-indent: ' + str(abs(num_m - num_ti)) + 'px; ')
        return processed_style.replace(has_margin.group(1), '')

    def process_inline_style(self):
        """
        Function processes final(css+initial inline) inline style
        Steps
        ----------
        1. Remove white color if tag doesn't have background color in style
        2. Create list of styles from inline style
        3. Duplicate styles check - if the tag had duplicate styles
        4. Processing indents
        Returns
        -------
        inline_style: str
            processed inline style

        """
        # a missing style attribute is treated as an empty style
        inline_style = (self.tag_inline_style.attrs.get('style') or '') + ';'
        # 1. Remove white color if tag doesn't have background color in style
        inline_style = self.remove_white_if_no_bgcolor(
            inline_style, self.tag_inline_style)
        inline_style = inline_style.replace(
            'list-style-image', 'list-style-type')

        # 2. Create list of styles from inline style
        # replace all spaces between '; & letter' to ';'
        style = re.sub(r"; *", ";", inline_style)
        # splitting by ';' leaves empty strings (e.g. the last element) - remove them
        split_inline_style = [item for item in style.split(';') if item]

        # 3. Duplicate styles check - if the tag had duplicate styles
        split_inline_style = self.duplicate_styles_check(split_inline_style)

        # 4. Processing indents
        return self.indents_processing(split_inline_style)

    @staticmethod
    def check_style_to_be_tag(style: str) -> List[tuple]:
        """
        Function searches style properties that can be converted to tag.
        It searches for them and prepare list of properties to be removed from style string
        Parameters
        ----------
        style: str
            processed inline style

        Returns
        -------
        to_remove: list
            (property, value) keys of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
            whose 'property:value' text occurs in style

        """
        return [k for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
                if f'{k[0]}:{k[1]}' in style]

    def change_attrs_with_corresponding_tags(self, tag_initial_name: str):
        """
        Function replaces style declarations that have a tag equivalent
        (LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG) with the tags themselves
        Parameters
        ----------
        tag_initial_name: str
            name the tag had before conversion; used for the outer wrapper

        Returns
        -------
        top_tag: Tag
            the initial tag object (renamed to the first replacement tag and
            stripped of attributes when at least one declaration was converted)

        """
        to_remove = self.check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            self.style = self.style.replace(f'{attr}:{value};', '').strip()
            name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
                attr, value)]
            if not i:
                # the first replacement reuses the tag itself
                self.tag_inline_style.name = name
                new_tags.append(self.tag_inline_style)
            else:
                # every further replacement wraps the previous one
                new_tag = BeautifulSoup(features='lxml').new_tag(name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_inline_style

        if not new_tags:
            top_tag.attrs['style'] = self.style
            return top_tag

        # move the attributes to a wrapper that has the initial tag name
        tmp_attrs = top_tag.attrs.copy()
        top_tag.attrs = {}
        top_tag2 = BeautifulSoup(features='lxml').new_tag(tag_initial_name)
        top_tag2.attrs = tmp_attrs
        if self.style:
            top_tag2.attrs['style'] = self.style
        new_tags[-1].wrap(top_tag2)
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in p -> span"""
        if not (tag.name == 'p' and tag.attrs.get('style')):
            return

        styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                               if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
        p_style = ''
        initial_style = tag.attrs['style']
        split_style = initial_style.replace('; ', ';').split(';')
        possible_p_attrs_regexp = re.compile(
            r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
        for item in split_style:
            if re.search(possible_p_attrs_regexp, item):
                # declaration is allowed in <p>: move it from the span style
                p_style += item + ';'
                initial_style = initial_style.replace(item + ';', '')

        # here check that the property is exactly the same.
        # Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
        styles_to_be_saved_in_span = [
            ((attr + ':') in initial_style) and ('-' + attr not in initial_style)
            for attr in styles_cant_be_in_p]
        if not any(styles_to_be_saved_in_span):
            tag.attrs['style'] = p_style
            return

        # styles that cannot be in <p> were found -> keep them in a span inside a new <p>
        tag.name = 'span'
        p_tag = BeautifulSoup(features='lxml').new_tag('p')
        has_p_style_attr = TagStyleConverter._LIST_STYLE_TYPE_REGEXP.search(
            initial_style)
        span_style = initial_style if not has_p_style_attr else initial_style.replace(
            has_p_style_attr.group(1), '')
        p_tag.attrs['style'] = p_style
        tag.attrs['style'] = span_style
        tag.wrap(p_tag)

    @staticmethod
    def wrap_span_in_li_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in li -> span"""
        if not (tag.name == 'li' and tag.attrs.get('style')):
            return

        styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
                                attr not in ['text-align', 'list-style-type']]
        if not any(attr in tag.attrs['style'] for attr in styles_cant_be_in_li):
            return

        tag.name = 'span'
        li_tag = BeautifulSoup(features='lxml').new_tag('li')
        span_style = tag.attrs['style']
        li_style = ''
        # declarations that stay on <li>; everything else goes to the span
        for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
                                         TagStyleConverter._LIST_STYLE_TYPE_REGEXP]:
            has_li_style_attrs = re.search(
                possible_li_attrs_regexp, span_style)
            if has_li_style_attrs:
                li_style += has_li_style_attrs.group(1)
                span_style = span_style.replace(
                    has_li_style_attrs.group(1), '')
        li_tag.attrs['style'] = li_style
        tag.attrs['style'] = span_style
        tag.wrap(li_tag)

    @staticmethod
    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in ul/ol -> span"""
        if not (tag.name in ['ul', 'ol'] and tag.attrs.get('style')):
            return

        styles_cant_be_in_ul_ol = [
            attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
        if not any(attr in tag.attrs['style'] for attr in styles_cant_be_in_ul_ol):
            return

        # create the wrapper BEFORE renaming, otherwise it would be a 'span' too
        oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
        tag.name = 'span'
        span_style = tag.attrs['style']

        has_uol_style_attrs = TagStyleConverter._LIST_STYLE_TYPE_REGEXP.search(
            span_style)
        if has_uol_style_attrs:
            oul_style = has_uol_style_attrs.group(1)
            span_style = span_style.replace(oul_style, '')
            oul_tag.attrs['style'] = oul_style
        tag.attrs['style'] = span_style
        tag.wrap(oul_tag)

    @staticmethod
    def wrap_span_in_h_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in h -> span"""
        h_regexp = re.compile('(^h[1-9]$)')

        if not (re.search(h_regexp, tag.name) and tag.attrs.get('style')):
            return

        h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
        tag.name = 'span'
        tag.wrap(h_tag)
        style = tag.attrs['style']
        # the span keeps the whole style except 'list-style-type'
        has_h_style_attr = TagStyleConverter._LIST_STYLE_TYPE_REGEXP.search(
            style)
        tag.attrs['style'] = style if not has_h_style_attr else style.replace(
            has_h_style_attr.group(1), '')

    def convert_initial_tag(self):
        """
        Function applies all conversions to the tag
        Returns
        -------
        tag_inline_style: Tag
            the converted tag

        """
        self.tag_inline_style = self.change_attrs_with_corresponding_tags(
            self.tag_inline_style.name)
        self.wrap_span_in_p_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_li_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_h_to_save_style_attrs(self.tag_inline_style)
        return self.tag_inline_style


def convert_html_soup_with_css_style(html_soup: "BeautifulSoup", css_text: str) -> "BeautifulSoup":
    """
    Function adds styles from .css to inline style
    Parameters
    ----------
    html_soup: BeautifulSoup
        html content
    css_text: str
        css rules that belong to this html content

    Returns
    -------
    inline_soup: BeautifulSoup
        new soup, parsed from the html with inlined css, with every styled
        tag converted by TagStyleConverter

    """
    # remove this specification because it causes problems
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    # here we add css styles to inline style
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    # soup with converted styles from css
    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')

    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    tags_with_inline_style = inline_soup.find_all(could_have_style_in_livecarta_regexp,
                                                  attrs={'style': re.compile('.*')})

    # go through the tags with inline style + style parsed from css file
    for tag_inline_style in tags_with_inline_style:
        TagStyleConverter(tag_inline_style).convert_initial_tag()
    return inline_soup
0.69, 0.75, 0.81, 0.88, 0.94, 1.0, + 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, + 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, + 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] + + sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', + '17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px', + '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', + '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', + '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] + + list_types = ['circle', 'disc', 'armenian', 'decimal', + 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', + 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] + + """ + LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } + +