css processing formatting

This commit is contained in:
Kiryl
2022-07-07 19:31:16 +03:00
parent 114ac78eb0
commit 687c09417a
6 changed files with 231 additions and 302 deletions

View File

@@ -1,5 +1,5 @@
import re
from bs4 import BeautifulSoup, NavigableString, Tag
from bs4 import BeautifulSoup, NavigableString
@staticmethod
def _clean_footnote_content(content):

View File

@@ -1,237 +0,0 @@
import re
import cssutils
from ebooklib import epub
from bs4 import BeautifulSoup
from itertools import takewhile
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
def get_text_color(x):
    """Return ``str2hex(x)``, or "" when that result denotes black.

    "#000000", "#000" and "black" are all treated as black.
    """
    text_color = str2hex(x)
    if text_color in ["#000000", "#000", "black"]:
        return ""
    return text_color
def get_bg_color(x):
    """Return ``str2hex(x)``, or "" when that result denotes white.

    "#ffffff", "#fff" and "white" are all treated as white.
    """
    background_color = str2hex(x)
    if background_color in ["#ffffff", "#fff", "white"]:
        return ""
    return background_color
# "<number><unit>" with unit in %/em/pt: group 1 - signed number, group 2 - unit.
# At most one "-" and one "." are accepted, so group 1 is always parsable by float().
_SIZE_VALUE_RE = re.compile(r"^(-?\d*\.?\d+)(%|em|pt)$")


def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str:
    """
    Function converts values of tags from em/%/pt to px

    Parameters
    ----------
    size_value: str
        single css size, e.g. "1.5em", "120%", "12pt"
    is_indent: bool
        selects the indent multipliers (5.76 per % and 18 per em)
        instead of the default ones (0.16 per % and 16 per em)

    Returns
    -------
    size_value: str
        converted value with "px" suffix;
        values that are not a plain %/em/pt number are returned unchanged

    """
    size = _SIZE_VALUE_RE.match(size_value)
    if size is None:
        # other units, keywords and malformed numbers ("1..5em", "--2pt") are left as they are
        return size_value

    number, unit = float(size.group(1)), size.group(2)
    if unit == "%":
        pixels = number * (5.76 if is_indent else 0.16)
    elif unit == "em":
        pixels = number * (18 if is_indent else 16)
    else:
        # pt: 3pt == 4px
        pixels = number * 4 / 3
    return f"{pixels}px"
def convert_indents_tag_values(size_value: str) -> str:
    """
    Function converts values of ["text-indent", "margin-left", "margin"]

    The value is split by single spaces: when it has exactly three parts
    the middle one is converted, otherwise the last one.

    Parameters
    ----------
    size_value: str

    Returns
    -------
    size_value: str
        the chosen part converted with the indent multipliers

    """
    parts = size_value.split(" ")
    chosen = -2 if len(parts) == 3 else -1
    return convert_tag_style_values(parts[chosen], True)
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit LiveCarta css style convention.
If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
# Properties that are not keys of this dict are blanked by
# update_inline_styles_to_livecarta_convention / update_css_styles_to_livecarta_convention.
# An empty list is falsy, so such properties skip the "allowed values" check entirely.
LIVECARTA_STYLE_ATTRS = {
"text-indent": [],
"font-variant": ["small-caps"],
# every configured alignment except the default one
"text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
"align": [],
"font": [],
"font-family": [],
"font-size": [],
"font-weight": ["bold", "600", "700", "800", "900"], # <strong>
"font-style": ["italic"], # <i>
"text-decoration": ["underline", "line-through"], # <u> , <s>
"text-decoration-line": ["underline", "line-through"], # <u> , <s>
"vertical-align": ["super"], # <sup>
"color": [],
"background-color": [],
"background": [],
"width": [],
"border": [],
"border-top-width": [],
"border-right-width": [],
"border-left-width": [],
"border-bottom-width": [],
"border-top": [],
"border-bottom": [],
"list-style-type": [],
"list-style-image": [],
"margin-left": [],
"margin-top": [],
"margin": [],
}
"""
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit LiveCarta style convention.
"""
# Each function receives the value with double quotes already removed (see style_conditions).
# Properties of LIVECARTA_STYLE_ATTRS that have no entry here (e.g. "font-weight", "width")
# are not converted: the update_* functions only call a function when the name is a key below.
LIVECARTA_STYLE_ATTRS_MAPPING = {
"text-indent": convert_indents_tag_values,
"font-variant": lambda x: x,
"text-align": lambda x: x,
"font": lambda x: "",  # the shorthand is always blanked
"font-family": lambda x: x,
"font-size": convert_tag_style_values,
"color": get_text_color,
"background-color": get_bg_color,
"background": get_bg_color,
# border values: a bare "0" is blanked, anything else is kept
"border": lambda x: x if x != "0" else "",
"border-top-width": lambda x: x if x != "0" else "",
"border-right-width": lambda x: x if x != "0" else "",
"border-left-width": lambda x: x if x != "0" else "",
"border-bottom-width": lambda x: x if x != "0" else "",
"border-top": lambda x: x if x != "0" else "",
"border-bottom": lambda x: x if x != "0" else "",
# list types outside LiveCartaConfig.list_types fall back to "disc"
"list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
"list-style-image": lambda x: "disc",  # any value becomes "disc"
"margin-left": convert_indents_tag_values,
"margin-top": convert_tag_style_values,  # called without is_indent, i.e. default multipliers
"margin": convert_indents_tag_values,
}
def style_conditions(style_value, style_name):
    """
    Function collects the facts needed to decide whether a declaration is kept

    Parameters
    ----------
    style_value: str
        raw value of the property
    style_name: str
        property name, must be a key of LIVECARTA_STYLE_ATTRS (KeyError otherwise)

    Returns
    -------
    tuple
        (value without double quotes,
         list of allowed values for the property,
         True when the unquoted value is not in that list)

    """
    unquoted_value = style_value.replace('"', "")
    allowed_values = LIVECARTA_STYLE_ATTRS.get(style_name)
    is_outside_allowed = unquoted_value not in LIVECARTA_STYLE_ATTRS[style_name]
    return unquoted_value, allowed_values, is_outside_allowed
def update_inline_styles_to_livecarta_convention(split_style: list):
    """
    Function rewrites inline style declarations in place to fit LiveCarta convention

    Parameters
    ----------
    split_style: list
        declarations of one inline style, each as "name:value"

    Returns
    -------
    split_style: list
        the same list object; declarations that are removed are replaced with ""

    """
    for i, style in enumerate(split_style):
        # split on the first ":" only, values such as "url(http://...)" contain more of them
        style_name, _, style_value = style.partition(":")
        if style_name not in LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove from inline style
            # and go on with the next declaration
            split_style[i] = ""
            continue

        cleaned_value, constraints_on_value, value_not_in_possible_values_list =\
            style_conditions(style_value, style_name)
        if constraints_on_value and value_not_in_possible_values_list:
            # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from inline style
            split_style[i] = ""
            continue

        func = LIVECARTA_STYLE_ATTRS_MAPPING.get(style_name)
        if func is not None:
            # function that converts our data
            split_style[i] = style_name + ":" + func(cleaned_value)
    return split_style
def build_inline_style_content(style: str) -> str:
    """
    Build inline style with LiveCarta convention

    Parameters
    ----------
    style: str
        content of a "style" attribute

    Returns
    -------
    style: str
        converted declarations joined with "; "

    """
    # remove the spaces that follow every ";"
    style = re.sub(r"; *", ";", style)
    split_style: list = []
    # splitting by ";" leaves empty strings (e.g. after a trailing ";"), filter them out
    for declaration in filter(None, style.split(";")):
        separator = re.search(r"(:\s*)", declaration)
        if separator is None:
            # fragment without ":" (e.g. a line break left after the last ";")
            # is not a declaration, skip it instead of failing
            continue
        # replace all spaces between ": & letter" to ":"
        split_style.append(declaration.replace(separator.group(1), ":"))

    split_style = update_inline_styles_to_livecarta_convention(split_style)
    return "; ".join(split_style)
def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule,
                                              style_type: cssutils.css.property.Property):
    """
    Function rewrites one property of a css rule to fit LiveCarta convention

    Parameters
    ----------
    css_rule: cssutils.css.CSSStyleRule
        rule whose style declaration is modified in place
    style_type: cssutils.css.property.Property
        property of that rule to check and convert

    """
    property_name = style_type.name
    if property_name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[property_name] = ""
        return

    unquoted_value, allowed_values, is_outside_allowed = style_conditions(
        style_type.value, property_name)
    if allowed_values and is_outside_allowed:
        # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[property_name] = ""
        return

    converter = LIVECARTA_STYLE_ATTRS_MAPPING.get(property_name)
    if converter is not None:
        # function that converts our data
        css_rule.style[property_name] = converter(unquoted_value)
def build_css_file_content(css_content: str) -> str:
    """
    Build css content with LiveCarta convention

    Parameters
    ----------
    css_content: str
        text of a css file

    Returns
    -------
    css_text: str
        serialized stylesheet after every style rule was converted

    """
    sheet = cssutils.parseString(css_content, validate=False)

    for css_rule in sheet:
        if css_rule.type != css_rule.STYLE_RULE:
            continue
        # iterate over a snapshot: the callee reassigns properties of this very declaration
        for style_type in list(css_rule.style):
            update_css_styles_to_livecarta_convention(css_rule, style_type)

    # cssText is the public accessor of the serialized sheet (a byte string)
    css_text: str = sheet.cssText.decode()
    return css_text
if __name__ == "__main__":
    # Manual run against a local sample book: convert its css file and parse one html page.
    file = "../../epub/9781627222174.epub"
    ebooklib_book = epub.read_epub(file)

    css_ = ebooklib_book.get_item_with_href("css/epub.css").get_content().decode()
    css_cleaned = build_css_file_content(css_)

    html_item = ebooklib_book.get_item_with_href("pr01s05.xhtml")
    html_ = html_item.get_body_content().decode()
    html_soup = BeautifulSoup(html_, features="lxml")

View File

@@ -0,0 +1,186 @@
import re
import cssutils
from src.util.helpers import BookLogger
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
class CSSPreprocessor:
    """
    Class converts css (css files and inline styles) to LiveCarta convention

    Properties that are not keys of LiveCartaConfig.LIVECARTA_STYLE_ATTRS are blanked,
    values outside the allowed list of a property are blanked,
    the rest is converted with functions from LIVECARTA_STYLE_ATTRS_MAPPING.
    """

    # "<number><unit>" with unit in %/em/pt: group 1 - signed number, group 2 - unit.
    # At most one "-" and one "." are accepted, so group 1 is always parsable by float().
    _SIZE_VALUE_RE = re.compile(r"^(-?\d*\.?\d+)(%|em|pt)$")

    def __init__(self, logger=None):
        self.logger: BookLogger = logger
        # Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
        # Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING
        # should be updated to suit LiveCarta style convention.
        # Properties without an entry here keep their value unchanged.
        self.LIVECARTA_STYLE_ATTRS_MAPPING = {
            "text-indent": self.convert_indents_tag_values,
            "font-variant": lambda x: x,
            "text-align": lambda x: x,
            "font": lambda x: "",
            "font-family": lambda x: x,
            "font-size": self.convert_tag_style_values,
            "color": self.get_text_color,
            "background-color": self.get_bg_color,
            "background": self.get_bg_color,
            "border": self._blank_if_zero,
            "border-top-width": self._blank_if_zero,
            "border-right-width": self._blank_if_zero,
            "border-left-width": self._blank_if_zero,
            "border-bottom-width": self._blank_if_zero,
            "border-top": self._blank_if_zero,
            "border-bottom": self._blank_if_zero,
            "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
            "list-style-image": lambda x: "disc",
            "margin-left": self.convert_indents_tag_values,
            "margin-top": self.convert_tag_style_values,
            "margin": self.convert_indents_tag_values,
        }

    @staticmethod
    def _blank_if_zero(x):
        """Return "" for the value "0", any other value unchanged"""
        return x if x != "0" else ""

    @staticmethod
    def get_text_color(x):
        """Return ``str2hex(x)``, or "" when that result denotes black"""
        color = str2hex(x)
        return color if color not in ["#000000", "#000", "black"] else ""

    @staticmethod
    def get_bg_color(x):
        """Return ``str2hex(x)``, or "" when that result denotes white"""
        color = str2hex(x)
        return color if color not in ["#ffffff", "#fff", "white"] else ""

    @staticmethod
    def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str:
        """
        Function converts values of tags from em/%/pt to px

        Parameters
        ----------
        size_value: str
            single css size, e.g. "1.5em", "120%", "12pt"
        is_indent: bool
            selects the indent multipliers (5.76 per % and 18 per em)
            instead of the default ones (0.16 per % and 16 per em)

        Returns
        -------
        size_value: str
            converted value with "px" suffix;
            values that are not a plain %/em/pt number are returned unchanged

        """
        size = CSSPreprocessor._SIZE_VALUE_RE.match(size_value)
        if size is None:
            # other units, keywords and malformed numbers ("1..5em", "--2pt") are left as they are
            return size_value

        number, unit = float(size.group(1)), size.group(2)
        if unit == "%":
            pixels = number * (5.76 if is_indent else 0.16)
        elif unit == "em":
            pixels = number * (18 if is_indent else 16)
        else:
            # pt: 3pt == 4px
            pixels = number * 4 / 3
        return f"{pixels}px"

    def convert_indents_tag_values(self, size_value: str) -> str:
        """
        Function converts values of ["text-indent", "margin-left", "margin"]

        The value is split by single spaces: when it has exactly three parts
        the middle one is converted, otherwise the last one.

        Parameters
        ----------
        size_value: str

        Returns
        -------
        size_value: str
            the chosen part converted with the indent multipliers

        """
        parts = size_value.split(" ")
        chosen = -2 if len(parts) == 3 else -1
        return self.convert_tag_style_values(parts[chosen], True)

    @staticmethod
    def style_conditions(style_value, style_name):
        """
        Function collects the facts needed to decide whether a declaration is kept

        Parameters
        ----------
        style_value: str
            raw value of the property
        style_name: str
            property name, must be a key of LiveCartaConfig.LIVECARTA_STYLE_ATTRS

        Returns
        -------
        tuple
            (value without double quotes,
             list of allowed values for the property,
             True when the unquoted value is not in that list)

        """
        cleaned_value = style_value.replace("\"", "")
        constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
            style_name)
        value_not_in_possible_values_list = \
            cleaned_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[style_name]
        return cleaned_value, constraints_on_value, value_not_in_possible_values_list

    def update_inline_styles_to_livecarta_convention(self, split_style: list):
        """
        Function rewrites inline style declarations in place to fit LiveCarta convention

        Parameters
        ----------
        split_style: list
            declarations of one inline style, each as "name:value"

        Returns
        -------
        split_style: list
            the same list object; declarations that are removed are replaced with ""

        """
        for i, style in enumerate(split_style):
            # split on the first ":" only, values such as "url(http://...)" contain more of them
            style_name, _, style_value = style.partition(":")
            if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
                # property not in LIVECARTA_STYLE_ATTRS, remove from inline style
                # and go on with the next declaration
                split_style[i] = ""
                continue

            cleaned_value, constraints_on_value, value_not_in_possible_values_list =\
                self.style_conditions(style_value, style_name)
            if constraints_on_value and value_not_in_possible_values_list:
                # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from inline style
                split_style[i] = ""
                continue

            func = self.LIVECARTA_STYLE_ATTRS_MAPPING.get(style_name)
            if func is not None:
                # function that converts our data
                split_style[i] = style_name + ":" + func(cleaned_value)
        return split_style

    def build_inline_style_content(self, style: str) -> str:
        """
        Build inline style with LiveCarta convention

        Parameters
        ----------
        style: str
            content of a "style" attribute

        Returns
        -------
        style: str
            converted declarations joined with "; "

        """
        # remove the spaces that follow every ";"
        style = re.sub(r"; *", ";", style)
        split_style: list = []
        # splitting by ";" leaves empty strings (e.g. after a trailing ";"), filter them out
        for declaration in filter(None, style.split(";")):
            separator = re.search(r"(:\s*)", declaration)
            if separator is None:
                # fragment without ":" (e.g. a line break left after the last ";")
                # is not a declaration, skip it instead of failing
                continue
            # replace all spaces between ": & letter" to ":"
            split_style.append(declaration.replace(separator.group(1), ":"))

        split_style = self.update_inline_styles_to_livecarta_convention(split_style)
        return "; ".join(split_style)

    def update_css_styles_to_livecarta_convention(self, css_rule: "cssutils.css.CSSStyleRule",
                                                  style_type: "cssutils.css.property.Property"):
        """
        Function rewrites one property of a css rule to fit LiveCarta convention

        Parameters
        ----------
        css_rule: cssutils.css.CSSStyleRule
            rule whose style declaration is modified in place
        style_type: cssutils.css.property.Property
            property of that rule to check and convert

        """
        property_name = style_type.name
        if property_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove from css file
            css_rule.style[property_name] = ""
            return

        cleaned_value, constraints_on_value, value_not_in_possible_values_list =\
            self.style_conditions(style_type.value, property_name)
        if constraints_on_value and value_not_in_possible_values_list:
            # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
            css_rule.style[property_name] = ""
            return

        func = self.LIVECARTA_STYLE_ATTRS_MAPPING.get(property_name)
        if func is not None:
            # function that converts our data
            css_rule.style[property_name] = func(cleaned_value)

    def build_css_file_content(self, css_content: str) -> str:
        """
        Build css content with LiveCarta convention

        Parameters
        ----------
        css_content: str
            text of a css file

        Returns
        -------
        css_text: str
            serialized stylesheet after every style rule was converted

        """
        sheet = cssutils.parseString(css_content, validate=False)

        for css_rule in sheet:
            if css_rule.type != css_rule.STYLE_RULE:
                continue
            # iterate over a snapshot: the callee reassigns properties of this very declaration
            for style_type in list(css_rule.style):
                self.update_css_styles_to_livecarta_convention(
                    css_rule, style_type)

        # cssText is the public accessor of the serialized sheet (a byte string)
        css_text: str = sheet.cssText.decode()
        return css_text

View File

@@ -1,5 +1,5 @@
import re
from typing import Tuple
from bs4 import BeautifulSoup, Tag
@@ -84,4 +84,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
footnote_tag = footnote_tag.find(
attrs={"role": "doc-backlink"}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
return footnotes, new_noterefs_tags, new_footnotes_tags

View File

@@ -4,15 +4,13 @@ from typing import List
from logging import CRITICAL
from bs4 import BeautifulSoup
from premailer import transform
from src.livecarta_config import LiveCartaConfig
from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS
cssutils.log.setLevel(CRITICAL)
class TagStyleConverter:
class TagInlineStyleProcessor:
def __init__(self, tag_inline_style):
# tag with inline style + style parsed from css file
self.tag_inline_style = tag_inline_style
@@ -190,7 +188,7 @@ class TagStyleConverter:
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items()
if re.match(tag, initial_tag.name)
for style in styles]
styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS
styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag]
span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same.
@@ -218,41 +216,3 @@ class TagStyleConverter:
self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css

    """
    # the epub namespace declaration causes problems, so it is cut out of the css
    epub_namespace_rule = '@namespace epub "http://www.idpf.org/2007/ops";'
    css_without_namespace = css_text.replace(epub_namespace_rule, '')

    # merge css styles into the inline styles of the page
    html_with_css_styles: str = transform(
        str(html_soup),
        css_text=css_without_namespace,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )

    # soup with converted styles from css
    inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")

    # every matching tag that carries a "style" attribute (inline style + style parsed from css file)
    any_style_value = re.compile(".*")
    for tag_inline_style in inline_soup.find_all(
            LiveCartaConfig.could_have_style_in_livecarta_regexp,
            attrs={"style": any_style_value}):
        TagStyleConverter(tag_inline_style).convert_initial_tag()
    return inline_soup

View File

@@ -101,25 +101,39 @@ class LiveCartaConfig:
r"(^h[1-9]$)": ["list-style-type"]
}
# Keys are tuples of tag names; a value is either a list of attribute names
# or an (attribute name, regex) pair.
# NOTE(review): the consumer is not visible here - presumably a tag is wrapped in a table
# when it has one of these attributes / a matching class; confirm against the caller.
WRAP_TAGS_WITH_TABLE = {
("div",): ["width", "border", "bgcolor"],
("section", "blockquote",): ("class", r"feature[1234]"),
}
"""('what to replace', 'parent tag', 'child tag')"""
# Keys are tuples of tag-name regexes (one element is itself a ("child", ..., ...) tuple);
# the value is a tag name - presumably the replacement tag, TODO confirm.
REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS = {
(r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p",
("^aside$",): "blockquote",
("^header$", "^footer$", ("child", ":not(pre)", "code, kbd, var")): "span",
("^b$",): "strong",
# (("parent", ":not(pre)", "code")): "p",
}
""" > == in (p in li)"""
# Per the note above, "li > p" stands for a <p> placed in a <li>.
TAGS_TO_UNWRAP = [
"section", "article", "figcaption", "main", "body", "html", "li > p",
]
INSERT_TAG_IN_PARENT_TAG = {
("pre", "code, kbd, var"): "code",
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit LiveCarta css style convention.
If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
    # indents and alignment
    "text-indent": [],
    "font-variant": ["small-caps"],
    # all alignments except "left"
    "text-align": ["justify", "right", "center"],
    "align": [],
    # fonts
    "font": [],
    "font-family": [],
    "font-size": [],
    "font-weight": ["bold", "600", "700", "800", "900"],  # <strong>
    "font-style": ["italic"],  # <i>
    "text-decoration": ["underline", "line-through"],  # <u> , <s>
    "text-decoration-line": ["underline", "line-through"],  # <u> , <s>
    "vertical-align": ["super"],  # <sup>
    # colors
    "color": [],
    "background-color": [],
    "background": [],
    # sizes and borders
    "width": [],
    "border": [],
    "border-top-width": [],
    "border-right-width": [],
    "border-left-width": [],
    "border-bottom-width": [],
    "border-top": [],
    "border-bottom": [],
    # lists
    "list-style-type": [],
    "list-style-image": [],
    # margins
    "margin-left": [],
    "margin-top": [],
    "margin": [],
}