Fix span wrapper processing

This commit is contained in:
Kibzik
2023-03-15 20:00:32 +03:00
parent b5ad043335
commit 047bfeca20
7 changed files with 82 additions and 75 deletions

View File

@@ -33,7 +33,7 @@ class BookSolver:
self.status_wrapper = BookStatusWrapper(
access, self.book_logger, book_id)
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
assert LiveCartaConfig.NUM_SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADER_TAGS), \
"Length of headers doesn't match allowed levels."
def save_file(self, content: bytes, path_to_save: str, file_type: str) -> str:

View File

@@ -67,14 +67,14 @@ class ChapterItem:
for i in self.sub_items:
sub_dicts.append(i.to_dict(lvl + 1))
if lvl > LiveCartaConfig.SUPPORTED_LEVELS:
if lvl > LiveCartaConfig.NUM_SUPPORTED_LEVELS:
return {
"title": self.title,
"contents": [self.content] + [x['contents'] for x in sub_dicts],
"sub_items": []
}
if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
if (lvl == LiveCartaConfig.NUM_SUPPORTED_LEVELS) and sub_dicts:
return {
"title": self.title,
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),

View File

@@ -49,7 +49,7 @@ class LibreHtml2JsonConverter:
result, ind
"""
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
title = str(self.content[ind])
title = title.replace(f"<{self.content[ind].name}>", "")
title = title.replace(f"</{self.content[ind].name}>", "")
@@ -67,7 +67,7 @@ class LibreHtml2JsonConverter:
while ind < len(self.content):
# 1. next tag is a header
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
outline = int(re.sub(r"^h", "", self.content[ind].name))
# - recursion step until h_i > h_initial
if outline > curr_outline:
@@ -116,13 +116,13 @@ class LibreHtml2JsonConverter:
while ind < len(self.content):
res = {}
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
res, ind = self.header_to_livecarta_chapter_item(ind)
else:
chapter_title = f"Untitled chapter {ch_num}"
chapter = []
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
if not self._is_empty_p_tag(self.content[ind]):
chapter.append(self.format_html(
str(self.content[ind])))

View File

@@ -564,7 +564,7 @@ class EpubConverter:
indent: str = " " * lvl
self.book_logger.log(indent + f"Chapter: {title} is processing.")
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
is_chapter: bool = lvl <= LiveCartaConfig.NUM_SUPPORTED_LEVELS
self.book_logger.log(indent + "Process title.")
title_preprocessed: str = self.html_processor.prepare_title(title)
self.book_logger.log(indent + "Process content.")

View File

@@ -171,7 +171,7 @@ class InlineStyleProcessor:
"""
styles_to_remove = []
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
for k in LiveCartaConfig.STYLE_ATTRS_TO_TAGS:
if f"{k[0]}:{k[1]}" in style:
styles_to_remove.append(k)
return styles_to_remove
@@ -182,7 +182,7 @@ class InlineStyleProcessor:
for i, (attr, value) in enumerate(styles_to_remove):
self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
.replace(f"{attr}:{value};", "").strip()
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
corr_tag_name = LiveCartaConfig.STYLE_ATTRS_TO_TAGS[(
attr, value)]
correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
for content in reversed(self.tag_inline_style.contents):
@@ -190,41 +190,56 @@ class InlineStyleProcessor:
self.tag_inline_style.append(correspond_tag)
@staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag):
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag) -> Tag:
"""Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
# Compile a regex pattern to match tag names that can have certain style attributes
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG))
# Check if the tag's name is one of the allowed tags, and it has a "style" attribute
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
styles_can_be_in_tag = [style
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items()
if re.match(tag, initial_tag.name)
for style in styles]
styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag]
# Get a list of style attributes that can be in the tag
styles_can_be_in_tag = [
style for tag, styles in LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG.items()
if re.match(tag, initial_tag.name) for style in styles
]
# Get a list of style attributes that cannot be in the tag
styles_cant_be_in_tag = [
attr for attr in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT if attr not in styles_can_be_in_tag
]
# Get the "style" attribute value of the initial tag
span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same.
# Not "align" when we have "text-align", or "border" when we have "border-top"
styles_to_be_saved_in_span = [((attr + ":") in span_style) & (
"-" + attr not in span_style) for attr in styles_cant_be_in_tag]
# For each attribute that cannot stay on the tag, flag whether it occurs in "style" as an exact property name (e.g. "align", not the tail of "text-align")
styles_to_be_saved_in_span = [
(attr + ":") in span_style and ("-" + attr) not in span_style for attr in styles_cant_be_in_tag
]
if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in <tag.name> -> wrap them in span
tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
# Create a new <tag> element with the same tag name as the initial tag
tag = BeautifulSoup(features="lxml").new_tag(initial_tag.name)
# Create an empty string for the style attribute
style = ""
possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag]
# Compile a list of regex patterns to match style attributes that can be in the tag
possible_attrs_regexp = [
re.compile(fr"({style}: *[#a-zA-Z\d]+;)", re.IGNORECASE) for style in styles_can_be_in_tag
]
# Iterate over the list of regex patterns and search for matching style attributes
for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search(
possible_attr_regexp, span_style)
has_style_attrs = re.search(possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1)
span_style = span_style.replace(
has_style_attrs.group(1), "")
tag.attrs["style"] = style
initial_tag.name = "span"
span_style = span_style.replace(has_style_attrs.group(1), "")
# Add the style attribute to the new <tag> element if it exists
if style:
tag.attrs["style"] = style
# Set the "style" attribute of the initial tag to the remaining style attributes
initial_tag.attrs["style"] = span_style
# Wrap the new <tag> element around the initial tag and return it
initial_tag.wrap(tag)
return initial_tag
def convert_initial_tag(self) -> Tag:
self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
self.tag_inline_style = self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style
@@ -259,7 +274,7 @@ def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "
# soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR,
attrs={"style": re.compile(".*")})
# go through the tags with inline style + style parsed from css file

View File

@@ -2,24 +2,18 @@ import re
class LiveCartaConfig:
"""Class of values that LiveCarta platform using and supports"""
# tag with inline style to be updated with style attribute
SUPPORTED_LEVELS = 5
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
"""Class of values that LiveCarta platform supports"""
list_types = ["circle", "disc", "armenian", "decimal",
"decimal-leading-zero", "georgian", "lower-alpha", "lower-latin",
"lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
NUM_SUPPORTED_LEVELS = 5
SUPPORTED_HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5"}
could_have_style_in_livecarta_regexp = re.compile(
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)")
# Regular expression to match HTML tags that can have a style attribute
REGEX_TAGS_WITH_STYLE_ATTR = re.compile(
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)"
)
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
# Dictionary mapping CSS style attribute-value pairs to HTML tags
STYLE_ATTRS_TO_TAGS = {
("font-weight", "bold"): "strong",
("font-weight", "600"): "strong",
("font-weight", "700"): "strong",
@@ -33,28 +27,26 @@ class LiveCartaConfig:
("vertical-align", "super"): "sup"
}
LIVECARTA_STYLES_CAN_BE_IN_TAG = {
"p": ["text-align", "text-indent", "border-bottom", "border-top"],
"li": ["text-align", "list-style-type"],
"ul": ["list-style-type"],
"ol": ["list-style-type"],
r"(^h[1-9]$)": ["list-style-type"]
# Dictionary mapping tag-name regex patterns to the CSS style attributes that may stay on a matching tag
TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG = {
"^p$": ["text-align", "text-indent", "border-bottom", "border-top", "border-left", "border-right",
"background-color"],
"^li$": ["text-align", "list-style-type"],
"^ul$": ["list-style-type"],
"^ol$": ["list-style-type"],
r"(^h[1-9]$)": ["list-style-type", "border-bottom", "border-top", "border-left", "border-right",
"background-color", "color"]
}
"""
Dictionary LIVECARTA_STYLE_ATTRS_REPLACE = { css property: css property to replace with }
"""
LIVECARTA_STYLE_ATTRS_REPLACE = {
# Dictionary mapping CSS style attribute names to names that should replace them
STYLE_ATTR_TO_REPLACEMENT = {
"list-style": "list-style-type",
}
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit LiveCarta css style convention.
If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
# Dictionary mapping CSS style attribute names to lists of allowed values
# If an empty list is provided, any value is allowed for the attribute
# If a non-empty list is provided, only values in the list are allowed for the attribute
STYLE_ATTR_TO_VALUE_LIMIT = {
"align": [],
"font": [],
"font-family": [],

View File

@@ -111,24 +111,24 @@ class StyleReader:
for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]:
cleaned_value = re.sub(
re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value)
cleaned_value = replace_str(cleaned_value, LiveCartaConfig.LIVECARTA_STYLE_ATTRS[style_name])
cleaned_value = replace_str(cleaned_value, LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[style_name])
return cleaned_value
@staticmethod
def style_conditions(style_value: str, style_name: str) -> Tuple[bool, bool]:
constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
constraints_on_value = LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT.get(
style_name)
value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[
value_not_in_possible_values_list = style_value not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[
style_name]
return constraints_on_value, value_not_in_possible_values_list
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
try:
for i, style in reversed(list(enumerate(split_style))):
if style.split(":")[0] in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE:
style = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style.split(":")[0]] + ":" + style.split(":")[1]
if style.split(":")[0] in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT:
style = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style.split(":")[0]] + ":" + style.split(":")[1]
style_name, style_value = style.split(":")
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
if style_name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT:
# property not in STYLE_ATTR_TO_VALUE_LIMIT, remove
split_style.remove(style)
continue
@@ -165,7 +165,7 @@ class StyleReader:
def process_inline_styles_in_html_soup(self, html_content):
"""This function is designed to convert inline html styles"""
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
tags_with_inline_style = html_content.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR,
attrs={"style": re.compile(".*")})
for tag_initial_inline_style in tags_with_inline_style:
@@ -194,10 +194,10 @@ class StyleReader:
def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
style_type: cssutils.css.property.Property):
if style_type.name in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE:
if style_type.name in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT:
# attributes to replace
style_type.name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style_type.name]
if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
style_type.name = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style_type.name]
if style_type.name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT:
# property not in STYLE_ATTR_TO_VALUE_LIMIT, remove from css file
css_rule.style[style_type.name] = ""
return