From 687c09417a5cd4359d2348c9c06db2c114470b02 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 7 Jul 2022 19:31:16 +0300 Subject: [PATCH] css processing formatting --- src/docx_converter/footnotes_processing.py | 2 +- src/epub_converter/css_preprocessing.py | 237 ------------------ src/epub_converter/css_preprocessor.py | 186 ++++++++++++++ src/epub_converter/footnotes_processing.py | 8 +- ...erter.py => tag_inline_style_processor.py} | 44 +--- src/livecarta_config.py | 56 +++-- 6 files changed, 231 insertions(+), 302 deletions(-) delete mode 100644 src/epub_converter/css_preprocessing.py create mode 100644 src/epub_converter/css_preprocessor.py rename src/epub_converter/{tag_css_style_converter.py => tag_inline_style_processor.py} (84%) diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index 84861d7..beb6d15 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -1,5 +1,5 @@ import re -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString @staticmethod def _clean_footnote_content(content): diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py deleted file mode 100644 index 0ad0ff7..0000000 --- a/src/epub_converter/css_preprocessing.py +++ /dev/null @@ -1,237 +0,0 @@ -import re -import cssutils - -from ebooklib import epub -from bs4 import BeautifulSoup -from itertools import takewhile - -from src.util.color_reader import str2hex -from src.livecarta_config import LiveCartaConfig - - -def get_text_color(x): - color = str2hex(x) - color = color if color not in ["#000000", "#000", "black"] else "" - return color - - -def get_bg_color(x): - color = str2hex(x) - color = color if color not in ["#ffffff", "#fff", "white"] else "" - return color - - -def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str: - """ - Function - - converts values of tags from em/%/pt to px - - find closest font-size px - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - converted value size - """ - size_regexp = re.compile( - r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") - has_style_attrs = re.search(size_regexp, size_value) - if has_style_attrs: - if has_style_attrs.group(1): - multiplier = 5.76 if is_indent else 0.16 - size_value = float(size_value.replace("%", "")) * multiplier - return str(size_value)+'px' - elif has_style_attrs.group(3): - multiplier = 18 if is_indent else 16 - size_value = float(size_value.replace("em", "")) * multiplier - return str(size_value)+'px' - elif has_style_attrs.group(5): - size_value = float(size_value.replace("pt", "")) * 4/3 - return str(size_value)+'px' - else: - return "" - return size_value - - -def convert_indents_tag_values(size_value: str) -> str: - """ - Function converts values of ["text-indent", "margin-left", "margin"] - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - - """ - if len(size_value.split(" ")) == 3: - size_value = convert_tag_style_values(size_value.split( - " ")[-2], True) # returns middle value - else: - size_value = convert_tag_style_values(size_value.split( - " ")[-1], True) # returns last value - return size_value - - -""" -Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } -Style properties that can be used to fit LiveCarta css style convention. -If property has empty list, it means that any value can be converted. -If property has not empty list, it means that only certain property-value combinations can be transformed. -""" -LIVECARTA_STYLE_ATTRS = { - "text-indent": [], - "font-variant": ["small-caps"], - "text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - "align": [], - "font": [], - "font-family": [], - "font-size": [], - "font-weight": ["bold", "600", "700", "800", "900"], # - "font-style": ["italic"], # - "text-decoration": ["underline", "line-through"], # , - "text-decoration-line": ["underline", "line-through"], # , - "vertical-align": ["super"], # - "color": [], - "background-color": [], - "background": [], - "width": [], - "border": [], - "border-top-width": [], - "border-right-width": [], - "border-left-width": [], - "border-bottom-width": [], - "border-top": [], - "border-bottom": [], - "list-style-type": [], - "list-style-image": [], - "margin-left": [], - "margin-top": [], - "margin": [], -} - -""" -Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } - -Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated -to suit LiveCarta style convention. -""" -LIVECARTA_STYLE_ATTRS_MAPPING = { - "text-indent": convert_indents_tag_values, - "font-variant": lambda x: x, - "text-align": lambda x: x, - "font": lambda x: "", - "font-family": lambda x: x, - "font-size": convert_tag_style_values, - "color": get_text_color, - "background-color": get_bg_color, - "background": get_bg_color, - "border": lambda x: x if x != "0" else "", - "border-top-width": lambda x: x if x != "0" else "", - "border-right-width": lambda x: x if x != "0" else "", - "border-left-width": lambda x: x if x != "0" else "", - "border-bottom-width": lambda x: x if x != "0" else "", - "border-top": lambda x: x if x != "0" else "", - "border-bottom": lambda x: x if x != "0" else "", - "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", - "list-style-image": lambda x: "disc", - "margin-left": convert_indents_tag_values, - "margin-top": convert_tag_style_values, - "margin": convert_indents_tag_values, -} - - -def style_conditions(style_value, style_name): - cleaned_value = style_value.replace("\"", "") - constraints_on_value = LIVECARTA_STYLE_ATTRS.get( - style_name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_name] - return cleaned_value, constraints_on_value, value_not_in_possible_values_list - - -def update_inline_styles_to_livecarta_convention(split_style: list): - for i, style in enumerate(split_style): - style_name, style_value = style.split(":") - if style_name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" - return split_style - - cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ - style_conditions(style_value, style_name) - if constraints_on_value and value_not_in_possible_values_list: - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" - else: - if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name] - style_value = func(cleaned_value) - split_style[i] = style_name + ":" + style_value - return split_style - - -def build_inline_style_content(style: str) -> str: - """Build inline style with LiveCarta convention""" - # replace all spaces between "; & letter" to ";" - style = re.sub(r"; *", ";", style) - # when we split style by ";", last element of the list is "" - None - # remove it - split_style: list = list(filter(None, style.split(";"))) - # replace all spaces between ": & letter" to ":" - split_style = [el.replace( - re.search(r"(:\s*)", el).group(1), ":") for el in split_style] - - split_style = update_inline_styles_to_livecarta_convention(split_style) - style = "; ".join(split_style) - return style - - -def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule, - style_type: cssutils.css.property.Property): - if style_type.name == "font-family": - pass - if style_type.name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = "" - return - - cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ - style_conditions(style_type.value, style_type.name) - if constraints_on_value and value_not_in_possible_values_list: - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = "" - else: - if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] - css_rule.style[style_type.name] = func(cleaned_value) - - -def build_css_file_content(css_content: str) -> str: - """Build css content with LiveCarta convention""" - sheet = cssutils.parseString(css_content, validate=False) - - for css_rule in sheet: - if css_rule.type == css_rule.STYLE_RULE: - for style_type in css_rule.style: - update_css_styles_to_livecarta_convention( - css_rule, style_type) - - css_text: str = sheet._getCssText().decode() - return css_text - - -if __name__ == "__main__": - file = "../../epub/9781627222174.epub" - ebooklib_book = epub.read_epub(file) - css_ = ebooklib_book.get_item_with_href("css/epub.css") - css_ = css_.get_content().decode() - css_cleaned = build_css_file_content(css_) - html_ = ebooklib_book.get_item_with_href( - "pr01s05.xhtml").get_body_content().decode() - html_soup = BeautifulSoup(html_, features="lxml") diff --git a/src/epub_converter/css_preprocessor.py b/src/epub_converter/css_preprocessor.py new file mode 100644 index 0000000..57c0388 --- /dev/null +++ b/src/epub_converter/css_preprocessor.py @@ -0,0 +1,186 @@ +import re +import cssutils + +from src.util.helpers import BookLogger +from src.util.color_reader import str2hex +from src.livecarta_config import LiveCartaConfig + + +class CSSPreprocessor: + def __init__(self, logger=None): + self.logger: BookLogger = logger + """ + Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } + + Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated + to suit LiveCarta style convention. + """ + self.LIVECARTA_STYLE_ATTRS_MAPPING = { + "text-indent": self.convert_indents_tag_values, + "font-variant": lambda x: x, + "text-align": lambda x: x, + "font": lambda x: "", + "font-family": lambda x: x, + "font-size": self.convert_tag_style_values, + "color": self.get_text_color, + "background-color": self.get_bg_color, + "background": self.get_bg_color, + "border": lambda x: x if x != "0" else "", + "border-top-width": lambda x: x if x != "0" else "", + "border-right-width": lambda x: x if x != "0" else "", + "border-left-width": lambda x: x if x != "0" else "", + "border-bottom-width": lambda x: x if x != "0" else "", + "border-top": lambda x: x if x != "0" else "", + "border-bottom": lambda x: x if x != "0" else "", + "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", + "list-style-image": lambda x: "disc", + "margin-left": self.convert_indents_tag_values, + "margin-top": self.convert_tag_style_values, + "margin": self.convert_indents_tag_values, + } + + @staticmethod + def get_text_color(x): + color = str2hex(x) + color = color if color not in ["#000000", "#000", "black"] else "" + return color + + @staticmethod + def get_bg_color(x): + color = str2hex(x) + color = color if color not in ["#ffffff", "#fff", "white"] else "" + return color + + @staticmethod + def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str: + """ + Function + - converts values of tags from em/%/pt to px + - find closest font-size px + Parameters + ---------- + size_value: str + + is_indent: bool + + Returns + ------- + size_value: str + converted value size + """ + size_regexp = re.compile( + r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") + has_style_attrs = re.search(size_regexp, size_value) + if has_style_attrs: + if has_style_attrs.group(1): + multiplier = 5.76 if is_indent else 0.16 + size_value = float(size_value.replace("%", "")) * multiplier + return str(size_value)+'px' + elif has_style_attrs.group(3): + multiplier = 18 if is_indent else 16 + size_value = float(size_value.replace("em", "")) * multiplier + return str(size_value)+'px' + elif has_style_attrs.group(5): + size_value = float(size_value.replace("pt", "")) * 4/3 + return str(size_value)+'px' + else: + return "" + return size_value + + def convert_indents_tag_values(self, size_value: str) -> str: + """ + Function converts values of ["text-indent", "margin-left", "margin"] + Parameters + ---------- + size_value: str + + Returns + ------- + size_value: str + + """ + if len(size_value.split(" ")) == 3: + size_value = self.convert_tag_style_values(size_value.split( + " ")[-2], True) # returns middle value + else: + size_value = self.convert_tag_style_values(size_value.split( + " ")[-1], True) # returns last value + return size_value + + @staticmethod + def style_conditions(style_value, style_name): + cleaned_value = style_value.replace("\"", "") + constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( + style_name) + value_not_in_possible_values_list = cleaned_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ + style_name] + return cleaned_value, constraints_on_value, value_not_in_possible_values_list + + def update_inline_styles_to_livecarta_convention(self, split_style: list): + for i, style in enumerate(split_style): + style_name, style_value = style.split(":") + if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = "" + return split_style + + cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ + self.style_conditions(style_value, style_name) + if constraints_on_value and value_not_in_possible_values_list: + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = "" + else: + if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name] + style_value = func(cleaned_value) + split_style[i] = style_name + ":" + style_value + return split_style + + def build_inline_style_content(self, style: str) -> str: + """Build inline style with LiveCarta convention""" + # replace all spaces between "; & letter" to ";" + style = re.sub(r"; *", ";", style) + # when we split style by ";", last element of the list is "" - None + # remove it + split_style: list = list(filter(None, style.split(";"))) + # replace all spaces between ": & letter" to ":" + split_style = [el.replace( + re.search(r"(:\s*)", el).group(1), ":") for el in split_style] + + split_style = self.update_inline_styles_to_livecarta_convention(split_style) + style = "; ".join(split_style) + return style + + def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, + style_type: cssutils.css.property.Property): + if style_type.name == "font-family": + pass + if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = "" + return + + cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ + self.style_conditions(style_type.value, style_type.name) + if constraints_on_value and value_not_in_possible_values_list: + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = "" + else: + if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] + css_rule.style[style_type.name] = func(cleaned_value) + + def build_css_file_content(self, css_content: str) -> str: + """Build css content with LiveCarta convention""" + sheet = cssutils.parseString(css_content, validate=False) + + for css_rule in sheet: + if css_rule.type == css_rule.STYLE_RULE: + for style_type in css_rule.style: + self.update_css_styles_to_livecarta_convention( + css_rule, style_type) + + css_text: str = sheet._getCssText().decode() + return css_text diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index ef2eac0..ae568e0 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -1,5 +1,5 @@ +import re from typing import Tuple - from bs4 import BeautifulSoup, Tag @@ -84,4 +84,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note footnote_tag = footnote_tag.find( attrs={"role": "doc-backlink"}) or footnote_tag new_footnotes_tags.append(footnote_tag) + + for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)): + noteref.attrs["data-id"] = i + 1 + noteref.attrs["id"] = f"footnote-{i + 1}" + footnote.attrs["href"] = f"#footnote-{i + 1}" + return footnotes, new_noterefs_tags, new_footnotes_tags diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_inline_style_processor.py similarity index 84% rename from src/epub_converter/tag_css_style_converter.py rename to src/epub_converter/tag_inline_style_processor.py index 1032d49..c4e0b45 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_inline_style_processor.py @@ -4,15 +4,13 @@ from typing import List from logging import CRITICAL from bs4 import BeautifulSoup -from premailer import transform from src.livecarta_config import LiveCartaConfig -from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS cssutils.log.setLevel(CRITICAL) -class TagStyleConverter: +class TagInlineStyleProcessor: def __init__(self, tag_inline_style): # tag with inline style + style parsed from css file self.tag_inline_style = tag_inline_style @@ -190,7 +188,7 @@ class TagStyleConverter: for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items() if re.match(tag, initial_tag.name) for style in styles] - styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS + styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS if attr not in styles_can_be_in_tag] span_style = initial_tag.attrs["style"] # here check that this style is exactly the same. @@ -218,41 +216,3 @@ class TagStyleConverter: self.change_attrs_with_corresponding_tags() self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) return self.tag_inline_style - - -def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: - """ - Function adds styles from .css to inline style. - Parameters - ---------- - html_soup: BeautifulSoup - html page with inline style - css_text: str - css content from css file - Returns - ------- - inline_soup: BeautifulSoup - soup with styles from css - - """ - # remove this specification because it causes problems - css_text = css_text.replace( - '@namespace epub "http://www.idpf.org/2007/ops";', '') - # here we add css styles to inline style - html_with_css_styles: str = transform(str(html_soup), css_text=css_text, - remove_classes=False, - external_styles=False, - allow_network=False, - disable_validation=True, - ) - # soup with converted styles from css - inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") - - tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={"style": re.compile(".*")}) - - # go through the tags with inline style + style parsed from css file - for tag_inline_style in tags_with_inline_style: - style_converter = TagStyleConverter(tag_inline_style) - style_converter.convert_initial_tag() - return inline_soup diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 9a94545..9ae2d40 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -101,25 +101,39 @@ class LiveCartaConfig: r"(^h[1-9]$)": ["list-style-type"] } - WRAP_TAGS_WITH_TABLE = { - ("div",): ["width", "border", "bgcolor"], - ("section", "blockquote",): ("class", r"feature[1234]"), - } - - """('what to replace', 'parent tag', 'child tag')""" - REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS = { - (r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p", - ("^aside$",): "blockquote", - ("^header$", "^footer$", ("child", ":not(pre)", "code, kbd, var")): "span", - ("^b$",): "strong", - # (("parent", ":not(pre)", "code")): "p", - } - - """ > == in (p in li)""" - TAGS_TO_UNWRAP = [ - "section", "article", "figcaption", "main", "body", "html", "li > p", - ] - - INSERT_TAG_IN_PARENT_TAG = { - ("pre", "code, kbd, var"): "code", + """ + Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } + Style properties that can be used to fit LiveCarta css style convention. + If property has empty list, it means that any value can be converted. + If property has not empty list, it means that only certain property-value combinations can be transformed. + """ + LIVECARTA_STYLE_ATTRS = { + "text-indent": [], + "font-variant": ["small-caps"], + "text-align": [x for x in ["justify", "right", "center", "left"] if x != "left"], + "align": [], + "font": [], + "font-family": [], + "font-size": [], + "font-weight": ["bold", "600", "700", "800", "900"], # + "font-style": ["italic"], # + "text-decoration": ["underline", "line-through"], # , + "text-decoration-line": ["underline", "line-through"], # , + "vertical-align": ["super"], # + "color": [], + "background-color": [], + "background": [], + "width": [], + "border": [], + "border-top-width": [], + "border-right-width": [], + "border-left-width": [], + "border-bottom-width": [], + "border-top": [], + "border-bottom": [], + "list-style-type": [], + "list-style-image": [], + "margin-left": [], + "margin-top": [], + "margin": [], }