From 45c1931ab3f0fb8e367e0f6a7ebf8c3434cbd98a Mon Sep 17 00:00:00 2001 From: shirshasa Date: Mon, 28 Jun 2021 13:56:23 +0300 Subject: [PATCH] epub converter: fix bg-color --- src/css_reader.py | 32 ++++++++++++++++++++++++++------ src/html_epub_preprocessor.py | 24 +++++++++++++----------- src/util/color_reader.py | 2 +- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/css_reader.py b/src/css_reader.py index 12f1e49..4a4e724 100644 --- a/src/css_reader.py +++ b/src/css_reader.py @@ -95,13 +95,13 @@ to suit livecarta style convention. def get_bg_color(x): color = str2hex(x) - color = color if color not in ['#ffffff', '#fff'] else '' + color = color if color not in ['#ffffff', '#fff', 'white'] else '' return color def get_text_color(x): color = str2hex(x) - color = color if color not in ['#000000', '#000'] else '' + color = color if color not in ['#000000', '#000', 'black'] else '' return color @@ -180,7 +180,7 @@ def clean_css(css): def add_inline_style_to_html_soup(soup1, css_text): livecarta_tmp_ids = [] - h_regex = f'(^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$)' + h_regex = f'(^h[1-9]$)' could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp) for i, x in enumerate(elements_with_possible_style_attr): @@ -193,8 +193,28 @@ def add_inline_style_to_html_soup(soup1, css_text): disable_validation=True) soup2 = BeautifulSoup(html_with_inline_style, features='lxml') - def remove_white_if_no_bgcolor(style_): - if ('color:white' in style_) and ('background' not in style_): + def remove_white_if_no_bgcolor(style_, tag): + if 'background' in style_: + return style_ + + # if text color is white, check that we have bg-color + if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): + # if bg color is inherited, just return style as is + for parent_tag in tag.parents: + # white bg 
color does not need to be checked as we do not write 'white bg color' + if parent_tag.attrs.get('style') and ('background' in parent_tag.attrs.get('style')): + print(tag, parent_tag.attrs.get('style')) + return style_ + + children = tag.find_all() + for child in children: + if child.attrs.get('style') and ('background' in child.attrs.get('style')): + tmp_style = child.attrs['style'] + '; color:#fff; ' + child.attrs['style'] = tmp_style + + # for child with bg color we added white text color, so this tag doesn't need white color + style_ = style_.replace('color:#fff;', '') + style_ = style_.replace('color:#ffffff;', '') style_ = style_.replace('color:white;', '') return style_ @@ -209,7 +229,7 @@ def add_inline_style_to_html_soup(soup1, css_text): tag_with_style = soup2.find(attrs={'livecarta_id': i}) if tag_with_style.attrs.get('style'): style = tag_with_style.attrs.get('style') + ';' - style = remove_white_if_no_bgcolor(style) + style = remove_white_if_no_bgcolor(style, tag_with_style) style = style.replace('background:', 'background-color:') to_remove = check_style_to_be_tag(style) new_tags = [] diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index ab34ba1..4ce6ce9 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -97,7 +97,9 @@ def preprocess_table(body_tag: BeautifulSoup): if width: td.attrs['width'] = width - td.attrs['style'] = td.attrs.get('style').replace('border:0;', '') + + if td.attrs.get('style'): + td.attrs['style'] = td.attrs['style'].replace('border:0;', '') if border_sizes: border_size = sum(border_sizes) / len(border_sizes) @@ -270,16 +272,16 @@ def unwrap_structural_tags(body_tag): 'figure', 'footer', 'iframe', 'span', 'p' ] # should be before other tags processing, not to remove converter empty tags with id - for s in body_tag.find_all("span"): - if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'): - continue - if s.contents: - is_not_struct_tag = [child.name not in 
structural_tags_names for child in s.contents] - if all(is_not_struct_tag): - continue - - _add_span_to_save_ids_for_links(s) - s.unwrap() + # for s in body_tag.find_all("span"): + # if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'): + # continue + # if s.contents: + # is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents] + # if all(is_not_struct_tag): + # continue + # + # _add_span_to_save_ids_for_links(s) + # s.unwrap() for div in body_tag.find_all("div"): if div.contents: diff --git a/src/util/color_reader.py b/src/util/color_reader.py index e5a9263..d7a3d61 100644 --- a/src/util/color_reader.py +++ b/src/util/color_reader.py @@ -80,7 +80,7 @@ def str2closest_html_color_name(s: str): def str2hex(s: str): if '#' in s: - return s + return s.lower() if ('rgb' in s) and ('%' in s): match = re.search(r'rgba*\(((\d+)%, *(\d+)%, *(\d+)%(, \d\.\d+)*)\)', s)