# Forked from LiveCarta/BookConverter.
import re
|
|
import cssutils
|
|
from typing import List
|
|
from logging import CRITICAL
|
|
from premailer import transform
|
|
from bs4 import BeautifulSoup, Tag
|
|
|
|
from src.livecarta_config import LiveCartaConfig
|
|
|
|
# Raise the cssutils logger threshold to CRITICAL so that lower-severity
# messages from cssutils are not emitted.
cssutils.log.setLevel(CRITICAL)
|
|
|
|
|
|
class InlineStyleProcessor:
    """
    Post-processor for the inline ``style`` attribute of a single tag.

    Creating an instance immediately rewrites ``tag.attrs["style"]`` with the
    result of ``process_inline_style``. ``convert_initial_tag`` then replaces
    the declarations listed in
    ``LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG`` with wrapper tags
    and, where needed, splits the tag into an outer tag plus an inner ``span``.
    """

    # Single-valued ``margin`` / ``margin-left`` declaration; group 3 is the number.
    _MARGIN_LEFT_RE = re.compile(
        r"(margin(-left)?:\s*-?(\d+(\.\d+)?)(\w*)\s*;)")
    # ``text-indent`` declaration; group 2 is the number.
    _TEXT_INDENT_RE = re.compile(
        r"(text-indent:\s*-?(\d+(\.\d+)?)(\w*)\s*;)")
    # Parent tag names whose background is taken into account for white text.
    _TAGS_WITH_BG = ("span", "td", "tr", "p")
    # Spellings of "white text" looked for in a style string.
    _WHITE_COLOR_MARKERS = ("color:#ffffff", "color:#fff", "color:white")
    # Exact declarations removed when white text has no background behind it.
    _WHITE_COLOR_DECLARATIONS = ("color:#fff;", "color:#ffffff;", "color:white;")

    def __init__(self, tag_inline_style: Tag):
        # tag with inline style + style parsed from css file
        self.tag_inline_style = tag_inline_style
        self.tag_inline_style.attrs["style"] = self.process_inline_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
        """
        Function removes white text color if there is no background color
        Parameters
        ----------
        style_: str
            style string of ``tag``
        tag: Tag
            tag the style belongs to; the ``style`` of its descendants
            may be modified in place

        Returns
        -------
        style_: str
            style with ``background:`` renamed to ``background-color:``,
            or with the white color declarations removed, or unchanged

        """
        # the tag has its own background: only normalise the property name
        if "background" in style_:
            return style_.replace("background:", "background-color:")

        # nothing to do when the text is not white
        if not any(marker in style_
                   for marker in InlineStyleProcessor._WHITE_COLOR_MARKERS):
            return style_

        # if bg color is inherited, just return style as is
        # (white bg color is not checked as we do not write "white bg color")
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get("style")
            if (parent_style and "background" in parent_style
                    and parent_tag.name in InlineStyleProcessor._TAGS_WITH_BG):
                return style_

        # push the white text color down to descendants that have a background
        for child in tag.find_all():
            child_style = child.attrs.get("style")
            if child_style and "background" in child_style:
                child.attrs["style"] = child_style + "; color:#fff; "

        # children with bg color received white text color above,
        # so this tag doesn't need the white color itself
        for declaration in InlineStyleProcessor._WHITE_COLOR_DECLARATIONS:
            style_ = style_.replace(declaration, "")
        return style_

    @staticmethod
    def _closest_multiple(value: float, m: int = 30) -> int:
        """Return the multiple of ``m`` closest to ``value`` (uses built-in ``round``)."""
        return m * round(value / m)

    @staticmethod
    def indents_processing(split_style: List[str]) -> str:
        """
        Function processes indents from left using
        formula_of_indent: indent = closest_number(abs(margin - text_indent))
        Parameters
        ----------
        split_style: List[str]
            list of styles split by ";"

        Returns
        -------
        processed_style: str
            style joined by ";" (no trailing ";") where ``margin`` /
            ``margin-left`` is removed and ``text-indent`` holds the computed
            value in px; units of the source values are ignored

        """
        margin_re = InlineStyleProcessor._MARGIN_LEFT_RE
        indent_re = InlineStyleProcessor._TEXT_INDENT_RE
        closest = InlineStyleProcessor._closest_multiple

        # every declaration gets a terminating ";" so the patterns can rely on it
        processed_style = ";".join(split_style) + ";"

        has_margin = margin_re.search(processed_style)
        has_text_indent = indent_re.search(processed_style)

        if has_margin:
            num_m = abs(float(has_margin.group(3)))
            if has_text_indent:
                num_ti = abs(float(has_text_indent.group(2)))
                indent_value = closest(abs(num_m - num_ti))
                processed_style = processed_style.replace(
                    has_text_indent.group(0), f"text-indent: {indent_value}px;")
            else:
                indent_value = closest(num_m)
                processed_style += f"text-indent: {indent_value}px;"
            # the margin is now expressed through text-indent
            processed_style = margin_re.sub("", processed_style)
        elif has_text_indent:
            num_ti = abs(float(has_text_indent.group(2)))
            indent_value = closest(num_ti)
            processed_style = indent_re.sub(
                f"text-indent: {indent_value}px;", processed_style)

        return processed_style.strip(";")

    def process_inline_style(self) -> str:
        """
        Function processes final(css+initial inline) inline style
        Steps
        ----------
        1. Remove white color if tag doesn't have background color in style
        2. Create list of styles from inline style
        3. Processing indents

        NOTE: duplicate declarations are not collapsed; the former
        ``duplicate_styles_check`` step is disabled.

        Returns
        -------
        inline_style: str
            processed inline style, "" when the tag has no style

        """
        raw_style = self.tag_inline_style.attrs.get("style")
        if not raw_style:
            return ""

        inline_style = raw_style + ";"
        # 1. Remove white color if tag doesn't have background color in style
        inline_style = self.remove_white_if_no_bgcolor(
            inline_style, self.tag_inline_style)
        inline_style = inline_style.replace(
            "list-style-image", "list-style-type")
        # 2. Create list of styles from inline style
        # replace all spaces between "; & letter" to ";"
        style = re.sub(r"; *", ";", inline_style)
        # splitting by ";" leaves empty strings behind - drop them
        split_inline_style = [item for item in style.split(";") if item]
        # 3. Processing indents
        return self.indents_processing(split_inline_style)

    @staticmethod
    def check_style_to_be_tag(style: str) -> List[tuple]:
        """
        Function searches style properties that can be converted to tag.
        It searches for them and prepares list of properties to be removed from style string
        Parameters
        ----------
        style: str
            <tag style="...">

        Returns
        -------
        styles_to_remove: list
            keys of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG whose
            "<key[0]>:<key[1]>" text occurs in style

        """
        return [key for key in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
                if f"{key[0]}:{key[1]}" in style]

    @staticmethod
    def _drop_declaration(style: str, declaration: str) -> str:
        """
        Remove ``declaration`` ("property:value") from ``style``.

        Handles both the ";"-terminated form and the unterminated form at the
        very end of the string: processed styles have their trailing ";"
        stripped, so the last declaration carries no terminator.
        """
        style = style.replace(f"{declaration};", "").strip()
        if style.endswith(declaration):
            style = style[:-len(declaration)].rstrip("; ")
        return style

    def change_attrs_with_corresponding_tags(self):
        """
        Replace convertible style declarations with wrapper tags.

        For every match reported by ``check_style_to_be_tag`` the declaration
        is removed from the tag's style and the whole content of the tag is
        moved into a new tag named by LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG.
        """
        # adds <strong>, <u>, <sup> instead of styles
        tag = self.tag_inline_style
        styles_to_remove = self.check_style_to_be_tag(tag.attrs["style"])
        for attr, value in styles_to_remove:
            tag.attrs["style"] = self._drop_declaration(
                tag.attrs["style"], f"{attr}:{value}")
            corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
                attr, value)]
            correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
            # iterate over a copy: extract() mutates tag.contents;
            # inserting at 0 in reverse order keeps the original order
            for content in reversed(list(tag.contents)):
                correspond_tag.insert(0, content.extract())
            tag.append(correspond_tag)

    @staticmethod
    def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag):
        """
        Function designed to save style attrs that cannot be in tag.name -> span

        When the tag carries declarations that are not allowed on it by
        LIVECARTA_STYLES_CAN_BE_IN_TAG, the tag is renamed to ``span`` (keeping
        the remaining style) and wrapped into a new tag with the original name
        that receives the allowed declarations.
        """
        allowed_by_tag = LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG
        # NOTE(review): the keys are searched unanchored, so a tag name that
        # merely contains a key also passes this first check - confirm intended.
        dictkeys_pattern = re.compile("|".join(allowed_by_tag))
        if not (dictkeys_pattern.search(initial_tag.name)
                and initial_tag.attrs.get("style")):
            return

        styles_can_be_in_tag = [style
                                for tag_pattern, styles in allowed_by_tag.items()
                                if re.match(tag_pattern, initial_tag.name)
                                for style in styles]
        styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
                                 if attr not in styles_can_be_in_tag]
        span_style = initial_tag.attrs["style"]
        # here check that this style is exactly the same.
        # Not "align" when we have "text-align", or "border" when we have "border-top"
        needs_span = any((attr + ":") in span_style and ("-" + attr) not in span_style
                         for attr in styles_cant_be_in_tag)
        if not needs_span:
            return

        # styles that cannot be in <tag.name> were found -> wrap them in span
        tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
        outer_declarations = []
        for style_name in styles_can_be_in_tag:
            # the last declaration of a processed style has no ";" - accept end of string too
            found = re.search(fr"({style_name}: *\w+(?:;|$))", span_style)
            if not found:
                continue
            declaration = found.group(1)
            if declaration.endswith(";"):
                span_style = span_style.replace(declaration, "")
            else:
                # unterminated match is the tail of the string: cut it off
                span_style = span_style[:found.start(1)].rstrip(";")
                declaration += ";"
            outer_declarations.append(declaration)

        tag.attrs["style"] = "".join(outer_declarations)
        initial_tag.name = "span"
        initial_tag.attrs["style"] = span_style
        initial_tag.wrap(tag)

    def convert_initial_tag(self) -> Tag:
        """Apply both conversions to the tag and return the (possibly renamed) tag."""
        self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
        return self.tag_inline_style
|
|
|
|
|
|
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "") -> BeautifulSoup:
    """
    Function inlines styles from .css into the tags and post-processes them.
    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        new soup, parsed from the inlined markup, whose styled tags
        went through InlineStyleProcessor

    """
    # this namespace declaration causes problems, so it is dropped before inlining
    css_without_namespace = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')

    # options passed to premailer when merging css rules into inline styles
    premailer_options = dict(
        exclude_pseudoclasses=False,
        include_star_selectors=True,
        remove_classes=False,
        external_styles=False,
        css_text=css_without_namespace,
        disable_validation=True,
        allow_network=False,
    )
    inlined_markup: str = transform(str(html_soup), **premailer_options)

    # soup with converted styles from css
    inline_soup = BeautifulSoup(inlined_markup, features="lxml")

    # every tag matched by the config pattern that has a style attribute
    any_style_value = re.compile(".*")
    styled_tags = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": any_style_value})

    # each tag now holds inline style + style parsed from css file
    for styled_tag in styled_tags:
        InlineStyleProcessor(styled_tag).convert_initial_tag()
    return inline_soup
|