Fix span wrapper processing

This commit is contained in:
Kibzik
2023-03-15 20:00:32 +03:00
parent b5ad043335
commit 047bfeca20
7 changed files with 82 additions and 75 deletions

View File

@@ -33,7 +33,7 @@ class BookSolver:
self.status_wrapper = BookStatusWrapper( self.status_wrapper = BookStatusWrapper(
access, self.book_logger, book_id) access, self.book_logger, book_id)
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ assert LiveCartaConfig.NUM_SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADER_TAGS), \
"Length of headers doesn't match allowed levels." "Length of headers doesn't match allowed levels."
def save_file(self, content: bytes, path_to_save: str, file_type: str) -> str: def save_file(self, content: bytes, path_to_save: str, file_type: str) -> str:

View File

@@ -67,14 +67,14 @@ class ChapterItem:
for i in self.sub_items: for i in self.sub_items:
sub_dicts.append(i.to_dict(lvl + 1)) sub_dicts.append(i.to_dict(lvl + 1))
if lvl > LiveCartaConfig.SUPPORTED_LEVELS: if lvl > LiveCartaConfig.NUM_SUPPORTED_LEVELS:
return { return {
"title": self.title, "title": self.title,
"contents": [self.content] + [x['contents'] for x in sub_dicts], "contents": [self.content] + [x['contents'] for x in sub_dicts],
"sub_items": [] "sub_items": []
} }
if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts: if (lvl == LiveCartaConfig.NUM_SUPPORTED_LEVELS) and sub_dicts:
return { return {
"title": self.title, "title": self.title,
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]), "contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),

View File

@@ -49,7 +49,7 @@ class LibreHtml2JsonConverter:
result, ind result, ind
""" """
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
title = str(self.content[ind]) title = str(self.content[ind])
title = title.replace(f"<{self.content[ind].name}>", "") title = title.replace(f"<{self.content[ind].name}>", "")
title = title.replace(f"</{self.content[ind].name}>", "") title = title.replace(f"</{self.content[ind].name}>", "")
@@ -67,7 +67,7 @@ class LibreHtml2JsonConverter:
while ind < len(self.content): while ind < len(self.content):
# 1. next tag is a header # 1. next tag is a header
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
outline = int(re.sub(r"^h", "", self.content[ind].name)) outline = int(re.sub(r"^h", "", self.content[ind].name))
# - recursion step until h_i > h_initial # - recursion step until h_i > h_initial
if outline > curr_outline: if outline > curr_outline:
@@ -116,13 +116,13 @@ class LibreHtml2JsonConverter:
while ind < len(self.content): while ind < len(self.content):
res = {} res = {}
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
res, ind = self.header_to_livecarta_chapter_item(ind) res, ind = self.header_to_livecarta_chapter_item(ind)
else: else:
chapter_title = f"Untitled chapter {ch_num}" chapter_title = f"Untitled chapter {ch_num}"
chapter = [] chapter = []
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
if not self._is_empty_p_tag(self.content[ind]): if not self._is_empty_p_tag(self.content[ind]):
chapter.append(self.format_html( chapter.append(self.format_html(
str(self.content[ind]))) str(self.content[ind])))

View File

@@ -564,7 +564,7 @@ class EpubConverter:
indent: str = " " * lvl indent: str = " " * lvl
self.book_logger.log(indent + f"Chapter: {title} is processing.") self.book_logger.log(indent + f"Chapter: {title} is processing.")
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS is_chapter: bool = lvl <= LiveCartaConfig.NUM_SUPPORTED_LEVELS
self.book_logger.log(indent + "Process title.") self.book_logger.log(indent + "Process title.")
title_preprocessed: str = self.html_processor.prepare_title(title) title_preprocessed: str = self.html_processor.prepare_title(title)
self.book_logger.log(indent + "Process content.") self.book_logger.log(indent + "Process content.")

View File

@@ -171,7 +171,7 @@ class InlineStyleProcessor:
""" """
styles_to_remove = [] styles_to_remove = []
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: for k in LiveCartaConfig.STYLE_ATTRS_TO_TAGS:
if f"{k[0]}:{k[1]}" in style: if f"{k[0]}:{k[1]}" in style:
styles_to_remove.append(k) styles_to_remove.append(k)
return styles_to_remove return styles_to_remove
@@ -182,7 +182,7 @@ class InlineStyleProcessor:
for i, (attr, value) in enumerate(styles_to_remove): for i, (attr, value) in enumerate(styles_to_remove):
self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\ self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
.replace(f"{attr}:{value};", "").strip() .replace(f"{attr}:{value};", "").strip()
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( corr_tag_name = LiveCartaConfig.STYLE_ATTRS_TO_TAGS[(
attr, value)] attr, value)]
correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name) correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
for content in reversed(self.tag_inline_style.contents): for content in reversed(self.tag_inline_style.contents):
@@ -190,41 +190,56 @@ class InlineStyleProcessor:
self.tag_inline_style.append(correspond_tag) self.tag_inline_style.append(correspond_tag)
@staticmethod @staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag): def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag) -> Tag:
"""Function designed to save style attrs that cannot be in tag.name -> span""" """Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
# Compile a regex pattern to match tag names that can have certain style attributes
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG))
# Check if the tag's name is one of the allowed tags, and it has a "style" attribute
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"): if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
styles_can_be_in_tag = [style # Get a list of style attributes that can be in the tag
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items() styles_can_be_in_tag = [
if re.match(tag, initial_tag.name) style for tag, styles in LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG.items()
for style in styles] if re.match(tag, initial_tag.name) for style in styles
styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS ]
if attr not in styles_can_be_in_tag] # Get a list of style attributes that cannot be in the tag
styles_cant_be_in_tag = [
attr for attr in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT if attr not in styles_can_be_in_tag
]
# Get the "style" attribute value of the initial tag
span_style = initial_tag.attrs["style"] span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same. # Check that the style attributes in "style" are exactly the same and wrap them in a <span> tag
# Not "align" when we have "text-align", or "border" when we have "border-top" styles_to_be_saved_in_span = [
styles_to_be_saved_in_span = [((attr + ":") in span_style) & ( (attr + ":") in span_style and ("-" + attr) not in span_style for attr in styles_cant_be_in_tag
"-" + attr not in span_style) for attr in styles_cant_be_in_tag] ]
if any(styles_to_be_saved_in_span): if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in <tag.name> -> wrap them in span # Create a new <tag> element with the same tag name as the initial tag
tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") tag = BeautifulSoup(features="lxml").new_tag(initial_tag.name)
# Create an empty string for the style attribute
style = "" style = ""
possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag] # Compile a list of regex patterns to match style attributes that can be in the tag
possible_attrs_regexp = [
re.compile(fr"({style}: *[#a-zA-Z\d]+;)", re.IGNORECASE) for style in styles_can_be_in_tag
]
# Iterate over the list of regex patterns and search for matching style attributes
for possible_attr_regexp in possible_attrs_regexp: for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search( has_style_attrs = re.search(possible_attr_regexp, span_style)
possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1): if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1) style += has_style_attrs.group(1)
span_style = span_style.replace( span_style = span_style.replace(has_style_attrs.group(1), "")
has_style_attrs.group(1), "") # Add the style attribute to the new <tag> element if it exists
tag.attrs["style"] = style if style:
initial_tag.name = "span" tag.attrs["style"] = style
# Set the "style" attribute of the initial tag to the remaining style attributes
initial_tag.attrs["style"] = span_style initial_tag.attrs["style"] = span_style
# Wrap the new <tag> element around the initial tag and return it
initial_tag.wrap(tag) initial_tag.wrap(tag)
return initial_tag
def convert_initial_tag(self) -> Tag: def convert_initial_tag(self) -> Tag:
self.change_attrs_with_corresponding_tags() self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) self.tag_inline_style = self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style return self.tag_inline_style
@@ -259,7 +274,7 @@ def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "
# soup with converted styles from css # soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR,
attrs={"style": re.compile(".*")}) attrs={"style": re.compile(".*")})
# go through the tags with inline style + style parsed from css file # go through the tags with inline style + style parsed from css file

View File

@@ -2,24 +2,18 @@ import re
class LiveCartaConfig: class LiveCartaConfig:
"""Class of values that LiveCarta platform using and supports""" """Class of values that LiveCarta platform supports"""
# tag with inline style to be updated with style attribute
SUPPORTED_LEVELS = 5
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
list_types = ["circle", "disc", "armenian", "decimal", NUM_SUPPORTED_LEVELS = 5
"decimal-leading-zero", "georgian", "lower-alpha", "lower-latin", SUPPORTED_HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5"}
"lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
could_have_style_in_livecarta_regexp = re.compile( # Regular expression to match HTML tags that can have a style attribute
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)") REGEX_TAGS_WITH_STYLE_ATTR = re.compile(
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)"
)
""" # Dictionary mapping CSS style attribute-value pairs to HTML tags
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } STYLE_ATTRS_TO_TAGS = {
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
("font-weight", "bold"): "strong", ("font-weight", "bold"): "strong",
("font-weight", "600"): "strong", ("font-weight", "600"): "strong",
("font-weight", "700"): "strong", ("font-weight", "700"): "strong",
@@ -33,28 +27,26 @@ class LiveCartaConfig:
("vertical-align", "super"): "sup" ("vertical-align", "super"): "sup"
} }
LIVECARTA_STYLES_CAN_BE_IN_TAG = { # Dictionary mapping HTML tags to CSS style attributes that can be contained within them
"p": ["text-align", "text-indent", "border-bottom", "border-top"], TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG = {
"li": ["text-align", "list-style-type"], "^p$": ["text-align", "text-indent", "border-bottom", "border-top", "border-left", "border-right",
"ul": ["list-style-type"], "background-color"],
"ol": ["list-style-type"], "^li$": ["text-align", "list-style-type"],
r"(^h[1-9]$)": ["list-style-type"] "^ul$": ["list-style-type"],
"^ol$": ["list-style-type"],
r"(^h[1-9]$)": ["list-style-type", "border-bottom", "border-top", "border-left", "border-right",
"background-color", "color"]
} }
""" # Dictionary mapping CSS style attribute names to names that should replace them
Dictionary LIVECARTA_STYLE_ATTRS_REPLACE = { css property: css property to replace with } STYLE_ATTR_TO_REPLACEMENT = {
"""
LIVECARTA_STYLE_ATTRS_REPLACE = {
"list-style": "list-style-type", "list-style": "list-style-type",
} }
""" # Dictionary mapping CSS style attribute names to lists of allowed values
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } # If an empty list is provided, any value is allowed for the attribute
Style properties that can be used to fit LiveCarta css style convention. # If a non-empty list is provided, only values in the list are allowed for the attribute
If property has empty list, it means that any value can be converted. STYLE_ATTR_TO_VALUE_LIMIT = {
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
"align": [], "align": [],
"font": [], "font": [],
"font-family": [], "font-family": [],

View File

@@ -111,24 +111,24 @@ class StyleReader:
for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]: for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]:
cleaned_value = re.sub( cleaned_value = re.sub(
re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value) re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value)
cleaned_value = replace_str(cleaned_value, LiveCartaConfig.LIVECARTA_STYLE_ATTRS[style_name]) cleaned_value = replace_str(cleaned_value, LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[style_name])
return cleaned_value return cleaned_value
@staticmethod @staticmethod
def style_conditions(style_value: str, style_name: str) -> Tuple[bool, bool]: def style_conditions(style_value: str, style_name: str) -> Tuple[bool, bool]:
constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( constraints_on_value = LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT.get(
style_name) style_name)
value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ value_not_in_possible_values_list = style_value not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[
style_name] style_name]
return constraints_on_value, value_not_in_possible_values_list return constraints_on_value, value_not_in_possible_values_list
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
try: try:
for i, style in reversed(list(enumerate(split_style))): for i, style in reversed(list(enumerate(split_style))):
if style.split(":")[0] in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE: if style.split(":")[0] in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT:
style = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style.split(":")[0]] + ":" + style.split(":")[1] style = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style.split(":")[0]] + ":" + style.split(":")[1]
style_name, style_value = style.split(":") style_name, style_value = style.split(":")
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: if style_name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT:
# property not in LIVECARTA_STYLE_ATTRS, remove # property not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style) split_style.remove(style)
continue continue
@@ -165,7 +165,7 @@ class StyleReader:
def process_inline_styles_in_html_soup(self, html_content): def process_inline_styles_in_html_soup(self, html_content):
"""This function is designed to convert inline html styles""" """This function is designed to convert inline html styles"""
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, tags_with_inline_style = html_content.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR,
attrs={"style": re.compile(".*")}) attrs={"style": re.compile(".*")})
for tag_initial_inline_style in tags_with_inline_style: for tag_initial_inline_style in tags_with_inline_style:
@@ -194,10 +194,10 @@ class StyleReader:
def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
style_type: cssutils.css.property.Property): style_type: cssutils.css.property.Property):
if style_type.name in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE: if style_type.name in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT:
# attributes to replace # attributes to replace
style_type.name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style_type.name] style_type.name = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style_type.name]
if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: if style_type.name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file # property not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = "" css_rule.style[style_type.name] = ""
return return