import re
from logging import CRITICAL
from typing import List

import cssutils
from bs4 import BeautifulSoup, Tag
from premailer import transform

from src.livecarta_config import LiveCartaConfig

# silence cssutils: only CRITICAL messages are logged
cssutils.log.setLevel(CRITICAL)


class InlineStyleProcessor:
    """Normalises the inline ``style`` attribute of a single tag.

    On construction the tag's ``style`` attribute is replaced in place by the
    result of :meth:`process_inline_style`. :meth:`convert_initial_tag` then
    turns selected style declarations into wrapping tags / spans.
    """

    def __init__(self, tag_inline_style: Tag):
        # tag with inline style + style parsed from css file
        self.tag_inline_style = tag_inline_style
        self.tag_inline_style.attrs["style"] = self.process_inline_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
        """
        Function removes white text color if there is no background color

        Parameters
        ----------
        style_: str
            inline style of the tag
        tag: Tag
            tag that owns the style; its parents and descendants are inspected

        Returns
        ----------
        style_: str
            style with "background:" renamed to "background-color:" (when the
            style mentions a background), or with the white color declarations
            removed, or unchanged

        """
        # the tag has its own background: only normalise the property name
        if "background" in style_:
            return style_.replace("background:", "background-color:")

        has_white_text = (
            "color:#ffffff" in style_
            or "color:#fff" in style_
            or "color:white" in style_
        )
        if not has_white_text:
            return style_

        # if bg color is inherited from a parent, just return style as is;
        # a white bg color does not need to be checked as it is never written
        tags_with_bg = ["span", "td", "tr", "p"]
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get("style")
            has_bg = parent_style and ("background" in parent_style)
            if has_bg and parent_tag.name in tags_with_bg:
                return style_

        # children with their own bg color get the white text color explicitly
        for child in tag.find_all():
            child_style = child.attrs.get("style")
            if child_style and ("background" in child_style):
                child.attrs["style"] = child_style + "; color:#fff; "

        # the white color now lives on the children that have a bg color,
        # so this tag doesn't need it anymore
        for white_color in ("color:#fff;", "color:#ffffff;", "color:white;"):
            style_ = style_.replace(white_color, "")
        return style_

    @staticmethod
    def indents_processing(split_style: List[str]) -> str:
        """
        Function process indents from left using
        formula_of_indent: indent = closest_number(abs(margin - text_indent))

        Parameters
        ----------
        split_style: List[str]
            list of styles split by ";"

        Returns
        ----------
        processed_style: str
            processed style with counted indent, without leading/trailing ";"

        """
        def closest_number(value: float, m: int = 30) -> int:
            """Function to find the number closest to value and divisible by m"""
            return m * round(value / m)

        processed_style = ";".join(split_style) + ";"

        margin_left_regexp = re.compile(
            r"(margin(-left)?:\s*-?(\d+(\.\d+)?)(\w*)\s*;)")
        text_indent_regexp = re.compile(
            r"(text-indent:\s*-?(\d+(\.\d+)?)(\w*)\s*;)")

        has_margin = margin_left_regexp.search(processed_style)
        has_text_indent = text_indent_regexp.search(processed_style)

        if has_margin:
            # group(3) is the unsigned number of the margin, the unit is ignored
            num_m = abs(float(has_margin.group(3)))
            if has_text_indent:
                num_ti = abs(float(has_text_indent.group(2)))
                indent_value = str(closest_number(abs(num_m - num_ti)))
                processed_style = processed_style.replace(
                    has_text_indent.group(0), f"text-indent: {indent_value}px;")
            else:
                indent_value = str(closest_number(abs(num_m)))
                processed_style += f"text-indent: {indent_value}px;"
            # the margin is now expressed through text-indent, drop it
            processed_style = margin_left_regexp.sub("", processed_style)
        elif has_text_indent:
            num_ti = abs(float(has_text_indent.group(2)))
            indent_value = str(closest_number(num_ti))
            processed_style = text_indent_regexp.sub(
                f"text-indent: {indent_value}px;", processed_style)

        return processed_style.strip(";")

    def process_inline_style(self) -> str:
        """
        Function processes final(css+initial inline) inline style
        Steps
        ----------
        1. Remove white color if tag doesn't have background color in style
        2. Create list of styles from inline style
        3. Duplicate styles check (currently disabled, nothing is done)
        4. Processing indents

        Returns
        -------
        inline_style: str
            processed inline style, "" if the tag has no (or an empty) style

        """
        initial_style = self.tag_inline_style.attrs.get("style")
        if not initial_style:
            return ""

        # 1. Remove white color if tag doesn't have background color in style
        inline_style = self.remove_white_if_no_bgcolor(
            initial_style + ";", self.tag_inline_style)
        inline_style = inline_style.replace(
            "list-style-image", "list-style-type")

        # 2. Create list of styles from inline style
        # replace all spaces between "; & letter" to ";"
        style = re.sub(r"; *", ";", inline_style)
        # splitting by ";" produces empty strings (e.g. the last element), drop them
        split_inline_style = [item for item in style.split(";") if item]

        # 3. Duplicate styles check is disabled

        # 4. Processing indents
        return self.indents_processing(split_inline_style)

    @staticmethod
    def check_style_to_be_tag(style: str) -> List[tuple]:
        """
        Function searches style properties that can be converted to tag.
        It searches for them and prepare list of properties to be removed from style string

        Parameters
        ----------
        style: str
            inline style, declarations written as "name:value"

        Returns
        -------
        styles_to_remove: list
            keys of LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
            whose "name:value" text occurs in style

        """
        return [
            key
            for key in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
            if f"{key[0]}:{key[1]}" in style
        ]

    def change_attrs_with_corresponding_tags(self):
        """
        Function wraps the tag contents in the corresponding tags instead of styles

        For every (attr, value) found by check_style_to_be_tag the declaration
        is removed from the tag style and all tag contents are moved into a new
        tag named LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)].
        """
        styles_to_remove = self.check_style_to_be_tag(
            self.tag_inline_style.attrs["style"])
        for attr, value in styles_to_remove:
            # the processed style has no trailing ";", so the last declaration
            # must be matched at the end of the string as well
            declaration = re.compile(
                rf"{re.escape(attr)}:{re.escape(value)}(?:;|$)")
            self.tag_inline_style.attrs["style"] = declaration.sub(
                "", self.tag_inline_style.attrs["style"]).strip()

            corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
                attr, value)]
            correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
            # move the contents from last to first, inserting at the front
            # keeps their original order
            for content in reversed(self.tag_inline_style.contents):
                correspond_tag.insert(0, content.extract())
            self.tag_inline_style.append(correspond_tag)

    @staticmethod
    def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag):
        """
        Function designed to save style attrs that cannot be in tag.name -> span

        When the tag name matches a key of LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG
        and its style holds a declaration that is not allowed for this tag,
        the tag is renamed to "span" (keeping the not allowed styles) and is
        wrapped in a new tag with the initial name that gets the allowed styles.

        Parameters
        ----------
        initial_tag: Tag
            tag to check, modified in place

        """
        dictkeys_pattern = re.compile(
            "|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
        if not (dictkeys_pattern.search(initial_tag.name)
                and initial_tag.attrs.get("style")):
            return

        styles_can_be_in_tag = [
            allowed_style
            for tag_pattern, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items()
            if re.match(tag_pattern, initial_tag.name)
            for allowed_style in styles
        ]
        styles_cant_be_in_tag = [
            attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
            if attr not in styles_can_be_in_tag
        ]
        span_style = initial_tag.attrs["style"]

        # here check that this style is exactly the same.
        # Not "align" when we have "text-align", or "border" when we have "border-top"
        has_styles_for_span = any(
            (attr + ":") in span_style and ("-" + attr) not in span_style
            for attr in styles_cant_be_in_tag
        )
        if not has_styles_for_span:
            return

        # styles that cannot be in the initial tag were found -> keep them in span
        tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
        tag_style = ""
        for allowed_style in styles_can_be_in_tag:
            has_style_attrs = re.search(
                fr"({allowed_style}: *\w+;)", span_style)
            if has_style_attrs and has_style_attrs.group(1):
                # move the allowed declaration from the span to the wrapping tag
                tag_style += has_style_attrs.group(1)
                span_style = span_style.replace(
                    has_style_attrs.group(1), "")
        tag.attrs["style"] = tag_style
        initial_tag.name = "span"
        initial_tag.attrs["style"] = span_style
        initial_tag.wrap(tag)

    def convert_initial_tag(self) -> Tag:
        """Function converts styles to tags, wraps in span if needed and returns the tag"""
        self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
        return self.tag_inline_style


def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "") -> BeautifulSoup:
    """
    Function adds styles from .css to inline style.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css

    """
    # remove this specification because it causes problems
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')

    # here we add css styles to inline style
    html_with_css_styles: str = transform(
        str(html_soup),
        exclude_pseudoclasses=False,
        include_star_selectors=True,
        remove_classes=False,
        external_styles=False,
        css_text=css_text,
        disable_validation=True,
        allow_network=False,
    )

    # soup with converted styles from css
    inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")

    # every tag matched by the config filter that has a style attribute
    tags_with_inline_style = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": re.compile(".*")})

    # go through the tags with inline style + style parsed from css file
    for tag_inline_style in tags_with_inline_style:
        style_converter = InlineStyleProcessor(tag_inline_style)
        style_converter.convert_initial_tag()
    return inline_soup