diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index 450dd40..f7828ef 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -12,7 +12,6 @@ from src.util.color_reader import str2hex from src.livecarta_config import LiveCartaConfig - cssutils.log.setLevel(CRITICAL) sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, @@ -57,26 +56,28 @@ def convert_font_size(value): except ValueError: return '' + def convert_indents(value): - # 30px = 3.2% = 1.25em = 23pt + # 30px = 3.2% = 1.25em = 23pt text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)') has_style_attrs = re.search(text_indent_regexp, value) if has_style_attrs: if has_style_attrs.group(1): value = value.replace(has_style_attrs.group(1), - str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) + - 'px') + str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) + + 'px') elif has_style_attrs.group(2): value = value.replace(has_style_attrs.group(2), - str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) + - 'px') + str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) + + 'px') elif has_style_attrs.group(4): value = value.replace(has_style_attrs.group(4), str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px') return value + """ LIVECARTA_STYLE_ATTRS = { css property: value } @@ -107,6 +108,7 @@ LIVECARTA_STYLE_ATTRS = { 'border-right-width': [], 'border-left-width': [], 'border-bottom-width': [], + 'border-top': [], 'border-bottom': [], 'list-style-type': [], 'list-style-image': [], @@ -132,6 +134,7 @@ def get_text_color(x): color = color if color not in ['#000000', '#000', 'black'] else '' return color + LIVECARTA_STYLE_ATTRS_MAPPING = { 'text-indent': convert_indents, 'font-variant': lambda x: x, @@ -147,6 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { 'border-right-width': lambda x: x if x != '0' else '', 'border-left-width': lambda x: x if x != '0' else '', 'border-bottom-width': lambda x: x if x != '0' else '', + 'border-top': lambda x: x if x != '0' else '', 'border-bottom': lambda x: x if x != '0' else '', 'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-image': lambda x: 'disc', @@ -182,30 +186,35 @@ def check_style_to_be_tag(style) -> List[tuple]: to_remove.append(k) return to_remove + def update_css_style_types_to_livecarta_convention(css_rule, style_type): if style_type.name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = '' return - cleaned_value = style_type.value.replace('\"', '') # value of style + cleaned_value = style_type.value.replace('\"', '') # value of style there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(style_type.name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[style_type.name] + value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ + style_type.name] if there_are_constraints_on_value and value_not_in_possible_values_list: # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = '' else: if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] # function that converts our data + # function that converts our data + func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] css_rule.style[style_type.name] = func(cleaned_value) + def build_css_content(css_content): sheet = cssutils.parseString(css_content, validate=False) for css_rule in sheet: if css_rule.type == css_rule.STYLE_RULE: for style_type in css_rule.style: - update_css_style_types_to_livecarta_convention(css_rule, style_type) + update_css_style_types_to_livecarta_convention( + css_rule, style_type) css_text = sheet._getCssText().decode() return css_text @@ -213,9 +222,11 @@ def build_css_content(css_content): class TagStyleConverter: def __init__(self, tag_with_inline_style, tag_with_ultimate_style): - self.tag_with_inline_style = tag_with_inline_style # tag with inline style to be updated with style attribute + # tag with inline style to be updated with style attribute + self.tag_with_inline_style = tag_with_inline_style self.tag_initial_name = tag_with_inline_style.name - self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file + # tag with inline style + style parsed from css file + self.tag_with_ultimate_style = tag_with_ultimate_style self.style = self.preprocess_style() @staticmethod @@ -230,7 +241,8 @@ class TagStyleConverter: # white bg color not need to be checked as we do not write 'white bg color' tag_with_bg = ['span', 'td', 'tr', 'p'] tag_will_be_saved = parent_tag.name in tag_with_bg - has_bg = parent_tag.attrs.get('style') and ('background' in parent_tag.attrs.get('style')) + has_bg = parent_tag.attrs.get('style') and ( + 'background' in parent_tag.attrs.get('style')) if has_bg and tag_will_be_saved: return style_ @@ -256,7 +268,7 @@ class TagStyleConverter: if item[0] in ['text-indent', 'margin-left']: item[1] = convert_indents(item[1]) clean_style += item[0] + ': ' + item[1] + '; ' - + margin_left_regexp = re.compile( r'(margin-left:( *-*\w+);*)') text_indent_regexp = re.compile( @@ -267,63 +279,70 @@ class TagStyleConverter: #formula_of_indent: indent = abs(margin_left - text_indent) if has_margin_left: num_ml = abs(int("".join( - filter(str.isdigit, str(has_margin_left.group(2)))))) + filter(str.isdigit, str(has_margin_left.group(2)))))) if has_text_indent: num_ti = abs(int("".join( filter(str.isdigit, str(has_text_indent.group(2)))))) clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(num_ml - num_ti)) + 'px; ') + str(abs(num_ml - num_ti)) + 'px; ') clean_style = clean_style.replace(has_margin_left.group(1), '') return clean_style clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' + - str(abs(num_ml)) + 'px; ') + str(abs(num_ml)) + 'px; ') return clean_style elif has_text_indent: clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(int("".join( - filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ') + str(abs(int("".join( + filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ') return clean_style return clean_style def preprocess_style(self): ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';' - ultimate_style = self.remove_white_if_no_bgcolor(ultimate_style, self.tag_with_ultimate_style) - ultimate_style = ultimate_style.replace('background:', 'background-color:') - ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type') + ultimate_style = self.remove_white_if_no_bgcolor( + ultimate_style, self.tag_with_ultimate_style) + ultimate_style = ultimate_style.replace( + 'background:', 'background-color:') + ultimate_style = ultimate_style.replace( + 'list-style-image', 'list-style-type') - split_ultimate_style = ultimate_style.replace('; ',';').split(';') + split_ultimate_style = ultimate_style.replace('; ', ';').split(';') # when we split style by ; and we have at the end ; that's why we have '' in list while '' in split_ultimate_style: split_ultimate_style.remove('') # replace all spaces between ': & letter' to ':' - split_ultimate_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style] + split_ultimate_style = [el.replace( + re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style] if self.tag_with_inline_style.attrs.get('style'): inline_style = self.tag_with_inline_style.attrs['style'] - split_inline_style = inline_style.replace('; ',';').split(';') + split_inline_style = inline_style.replace('; ', ';').split(';') # when we split style by ; and we have at the end ; that's why we have '' in list while '' in split_inline_style: split_inline_style.remove('') # replace all spaces between ': & letter' to ':' - split_inline_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style] + split_inline_style = [el.replace( + re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style] # repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css - repeat_styles = list(set(split_ultimate_style) & set(split_inline_style)) + repeat_styles = list(set(split_ultimate_style) + & set(split_inline_style)) for item in repeat_styles: split_inline_style.remove(item) if split_inline_style: # if inline style is not empty - start convert and add to ultimate style print('we enter repetition check', '\n') - inline_style: str = self.process_indents_to_px(split_inline_style) + inline_style: str = self.process_indents_to_px( + split_inline_style) ultimate_style += inline_style ultimate_style: str = self.process_indents_to_px(split_ultimate_style) @@ -338,7 +357,8 @@ class TagStyleConverter: self.style = self.style.replace(s, '') self.style = self.style.strip() if i == 0: - self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] + self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( + attr, value)] new_tags.append(self.tag_with_inline_style) else: name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] @@ -351,7 +371,8 @@ class TagStyleConverter: if new_tags: tmp_attrs = top_tag.attrs.copy() top_tag.attrs = {} - top_tag2 = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name) + top_tag2 = BeautifulSoup(features='lxml').new_tag( + self.tag_initial_name) top_tag2.attrs = tmp_attrs if self.style: top_tag2.attrs['style'] = self.style @@ -363,39 +384,36 @@ class TagStyleConverter: @staticmethod def wrap_span_in_p_to_save_style_attrs(tag): - '''Function designed to save style attrs that cannot be in p -> span - that cannot be in span -> p''' + '''Function designed to save style attrs that cannot be in p -> span''' if tag.name == 'p' and tag.attrs.get('style'): styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS - if attr not in ['text-align', 'text-indent', 'border-bottom']] + if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] + p_style = '' + initial_style = tag.attrs['style'] + split_style = initial_style.replace('; ', ';').split(';') + possible_p_attrs_regexp = re.compile( + r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)') + for item in split_style: + has_p_style_attrs = re.search(possible_p_attrs_regexp, item) + if has_p_style_attrs: + p_style += item + ';' + initial_style = initial_style.replace(item + ';', '') - styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p] - if any(styles_to_be_saved): + # here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top' + styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( + '-' + attr not in initial_style) for attr in styles_cant_be_in_p] + if any(styles_to_be_saved_in_span): + # if find styles that cannot be in
-> wrap them in span tag.name = 'span' p_tag = BeautifulSoup(features='lxml').new_tag('p') - span_style = tag.attrs['style'] - p_style = '' - possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)|(border-bottom:( *\w+);*)') - for i in range(span_style.count(';') + 1): - has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style) - if has_p_style_attrs: - if has_p_style_attrs.group(1): - p_style += has_p_style_attrs.group(1) - span_style = span_style.replace(has_p_style_attrs.group(1), '') - if has_p_style_attrs.group(3): - p_style += has_p_style_attrs.group(3) - span_style = span_style.replace(has_p_style_attrs.group(3), '') - if has_p_style_attrs.group(5): - p_style += span_style - span_style = span_style.replace(span_style, '') - - p_tag.attrs['style'] = p_style - - li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_li_style_attr = re.search(li_attrs_regexp, span_style) - span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '') + li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') + has_li_style_attr = re.search(li_attrs_regexp, initial_style) + span_style = initial_style if not has_li_style_attr else initial_style.replace( + has_li_style_attr.group(1), '') + p_tag.attrs['style'] = p_style tag.attrs['style'] = span_style tag.wrap(p_tag) + else: tag.attrs['style'] = p_style @staticmethod def wrap_span_in_li_to_save_style_attrs(tag): @@ -403,7 +421,8 @@ class TagStyleConverter: styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['text-align', 'list-style-type']] - styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_li] + styles_to_be_saved = [attr in tag.attrs.get( + 'style') for attr in styles_cant_be_in_li] if any(styles_to_be_saved): tag.name = 'span' li_tag = BeautifulSoup(features='lxml').new_tag('li') @@ -412,10 +431,12 @@ class TagStyleConverter: for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), re.compile(r'(list-style-type:(\w+);)')]: - has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style) + has_li_style_attrs = re.search( + possible_li_attrs_regexp, span_style) if has_li_style_attrs and has_li_style_attrs.group(1): li_style += has_li_style_attrs.group(1) - span_style = span_style.replace(has_li_style_attrs.group(1), '') + span_style = span_style.replace( + has_li_style_attrs.group(1), '') li_tag.attrs['style'] = li_style tag.attrs['style'] = span_style @@ -424,16 +445,20 @@ class TagStyleConverter: @staticmethod def wrap_span_in_ul_ol_to_save_style_attrs(tag): if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): - styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] + styles_cant_be_in_ul_ol = [ + attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] - check = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_ul_ol] + check = [attr in tag.attrs.get('style') + for attr in styles_cant_be_in_ul_ol] if any(check): tag.name = 'span' li_tag = BeautifulSoup(features='lxml').new_tag('ul') span_style = tag.attrs['style'] - possible_li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style) + possible_li_attrs_regexp = re.compile( + r'(list-style-type:(\w+);)') + has_li_style_attrs = re.search( + possible_li_attrs_regexp, span_style) if has_li_style_attrs and has_li_style_attrs.group(1): oul_style = has_li_style_attrs.group(1) span_style = span_style.replace(oul_style, '') @@ -452,7 +477,8 @@ class TagStyleConverter: style = tag.attrs['style'] li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') has_li_style_attr = re.search(li_attrs_regexp, style) - tag.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') + tag.attrs['style'] = style if not has_li_style_attr else style.replace( + has_li_style_attr.group(1), '') def convert_initial_tag(self): self.tag_with_inline_style = self.change_attrs_with_corresponding_tags() @@ -464,10 +490,13 @@ class TagStyleConverter: def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): - css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') + css_text = css_text.replace( + '@namespace epub "http://www.idpf.org/2007/ops";', '') livecarta_tmp_ids = [] - could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') - tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp) + could_have_style_in_livecarta_regexp = re.compile( + '(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') + tags_with_possible_style_attr = html_soup.find_all( + could_have_style_in_livecarta_regexp) for i, x in enumerate(tags_with_possible_style_attr): x.attrs['livecarta_id'] = i livecarta_tmp_ids.append(i) @@ -488,7 +517,8 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i}) del tag_with_initial_style.attrs['livecarta_id'] if tag_with_ultimate_style.attrs.get('style'): - style_converter = TagStyleConverter(tag_with_initial_style, tag_with_ultimate_style) + style_converter = TagStyleConverter( + tag_with_initial_style, tag_with_ultimate_style) style_converter.convert_initial_tag() return html_soup @@ -500,7 +530,8 @@ if __name__ == '__main__': css_ = ebooklib_book.get_item_with_href('css/epub.css') css_ = css_.get_content().decode() css_cleaned = build_css_content(css_) - html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode() + html_ = ebooklib_book.get_item_with_href( + 'pr01s05.xhtml').get_body_content().decode() html_soup = BeautifulSoup(html_, features='lxml') - print(convert_html_soup_with_css_style(html_soup, css_cleaned)) + print(convert_html_soup_with_css_style(html_soup, css_cleaned)) \ No newline at end of file