forked from LiveCarta/BookConverter
Fix span wrapper processing
This commit is contained in:
@@ -33,7 +33,7 @@ class BookSolver:
|
||||
self.status_wrapper = BookStatusWrapper(
|
||||
access, self.book_logger, book_id)
|
||||
|
||||
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
|
||||
assert LiveCartaConfig.NUM_SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADER_TAGS), \
|
||||
"Length of headers doesn't match allowed levels."
|
||||
|
||||
def save_file(self, content: bytes, path_to_save: str, file_type: str) -> str:
|
||||
|
||||
@@ -67,14 +67,14 @@ class ChapterItem:
|
||||
for i in self.sub_items:
|
||||
sub_dicts.append(i.to_dict(lvl + 1))
|
||||
|
||||
if lvl > LiveCartaConfig.SUPPORTED_LEVELS:
|
||||
if lvl > LiveCartaConfig.NUM_SUPPORTED_LEVELS:
|
||||
return {
|
||||
"title": self.title,
|
||||
"contents": [self.content] + [x['contents'] for x in sub_dicts],
|
||||
"sub_items": []
|
||||
}
|
||||
|
||||
if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
|
||||
if (lvl == LiveCartaConfig.NUM_SUPPORTED_LEVELS) and sub_dicts:
|
||||
return {
|
||||
"title": self.title,
|
||||
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
|
||||
|
||||
@@ -49,7 +49,7 @@ class LibreHtml2JsonConverter:
|
||||
result, ind
|
||||
|
||||
"""
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
|
||||
title = str(self.content[ind])
|
||||
title = title.replace(f"<{self.content[ind].name}>", "")
|
||||
title = title.replace(f"</{self.content[ind].name}>", "")
|
||||
@@ -67,7 +67,7 @@ class LibreHtml2JsonConverter:
|
||||
|
||||
while ind < len(self.content):
|
||||
# 1. next tag is a header
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
|
||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||
# - recursion step until h_i > h_initial
|
||||
if outline > curr_outline:
|
||||
@@ -116,13 +116,13 @@ class LibreHtml2JsonConverter:
|
||||
while ind < len(self.content):
|
||||
res = {}
|
||||
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
|
||||
res, ind = self.header_to_livecarta_chapter_item(ind)
|
||||
|
||||
else:
|
||||
chapter_title = f"Untitled chapter {ch_num}"
|
||||
chapter = []
|
||||
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADER_TAGS:
|
||||
if not self._is_empty_p_tag(self.content[ind]):
|
||||
chapter.append(self.format_html(
|
||||
str(self.content[ind])))
|
||||
|
||||
@@ -564,7 +564,7 @@ class EpubConverter:
|
||||
|
||||
indent: str = " " * lvl
|
||||
self.book_logger.log(indent + f"Chapter: {title} is processing.")
|
||||
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
is_chapter: bool = lvl <= LiveCartaConfig.NUM_SUPPORTED_LEVELS
|
||||
self.book_logger.log(indent + "Process title.")
|
||||
title_preprocessed: str = self.html_processor.prepare_title(title)
|
||||
self.book_logger.log(indent + "Process content.")
|
||||
|
||||
@@ -171,7 +171,7 @@ class InlineStyleProcessor:
|
||||
|
||||
"""
|
||||
styles_to_remove = []
|
||||
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
||||
for k in LiveCartaConfig.STYLE_ATTRS_TO_TAGS:
|
||||
if f"{k[0]}:{k[1]}" in style:
|
||||
styles_to_remove.append(k)
|
||||
return styles_to_remove
|
||||
@@ -182,7 +182,7 @@ class InlineStyleProcessor:
|
||||
for i, (attr, value) in enumerate(styles_to_remove):
|
||||
self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
|
||||
.replace(f"{attr}:{value};", "").strip()
|
||||
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
|
||||
corr_tag_name = LiveCartaConfig.STYLE_ATTRS_TO_TAGS[(
|
||||
attr, value)]
|
||||
correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
|
||||
for content in reversed(self.tag_inline_style.contents):
|
||||
@@ -190,41 +190,56 @@ class InlineStyleProcessor:
|
||||
self.tag_inline_style.append(correspond_tag)
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag):
|
||||
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag) -> Tag:
|
||||
"""Function designed to save style attrs that cannot be in tag.name -> span"""
|
||||
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
|
||||
|
||||
# Compile a regex pattern to match tag names that can have certain style attributes
|
||||
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG))
|
||||
|
||||
# Check if the tag's name is one of the allowed tags, and it has a "style" attribute
|
||||
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
|
||||
styles_can_be_in_tag = [style
|
||||
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items()
|
||||
if re.match(tag, initial_tag.name)
|
||||
for style in styles]
|
||||
styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
|
||||
if attr not in styles_can_be_in_tag]
|
||||
# Get a list of style attributes that can be in the tag
|
||||
styles_can_be_in_tag = [
|
||||
style for tag, styles in LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG.items()
|
||||
if re.match(tag, initial_tag.name) for style in styles
|
||||
]
|
||||
# Get a list of style attributes that cannot be in the tag
|
||||
styles_cant_be_in_tag = [
|
||||
attr for attr in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT if attr not in styles_can_be_in_tag
|
||||
]
|
||||
# Get the "style" attribute value of the initial tag
|
||||
span_style = initial_tag.attrs["style"]
|
||||
# here check that this style is exactly the same.
|
||||
# Not "align" when we have "text-align", or "border" when we have "border-top"
|
||||
styles_to_be_saved_in_span = [((attr + ":") in span_style) & (
|
||||
"-" + attr not in span_style) for attr in styles_cant_be_in_tag]
|
||||
# Flag each attribute that cannot stay in the tag and occurs in "style" under its exact name (not "align" inside "text-align", or "border" inside "border-top")
|
||||
styles_to_be_saved_in_span = [
|
||||
(attr + ":") in span_style and ("-" + attr) not in span_style for attr in styles_cant_be_in_tag
|
||||
]
|
||||
if any(styles_to_be_saved_in_span):
|
||||
# if we find styles that cannot be in <tag.name> -> wrap them in span
|
||||
tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
|
||||
# Create a new <tag> element with the same tag name as the initial tag
|
||||
tag = BeautifulSoup(features="lxml").new_tag(initial_tag.name)
|
||||
# Create an empty string for the style attribute
|
||||
style = ""
|
||||
possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag]
|
||||
# Compile a list of regex patterns to match style attributes that can be in the tag
|
||||
possible_attrs_regexp = [
|
||||
re.compile(fr"({style}: *[#a-zA-Z\d]+;)", re.IGNORECASE) for style in styles_can_be_in_tag
|
||||
]
|
||||
# Iterate over the list of regex patterns and search for matching style attributes
|
||||
for possible_attr_regexp in possible_attrs_regexp:
|
||||
has_style_attrs = re.search(
|
||||
possible_attr_regexp, span_style)
|
||||
has_style_attrs = re.search(possible_attr_regexp, span_style)
|
||||
if has_style_attrs and has_style_attrs.group(1):
|
||||
style += has_style_attrs.group(1)
|
||||
span_style = span_style.replace(
|
||||
has_style_attrs.group(1), "")
|
||||
tag.attrs["style"] = style
|
||||
initial_tag.name = "span"
|
||||
span_style = span_style.replace(has_style_attrs.group(1), "")
|
||||
# Add the style attribute to the new <tag> element if it exists
|
||||
if style:
|
||||
tag.attrs["style"] = style
|
||||
# Set the "style" attribute of the initial tag to the remaining style attributes
|
||||
initial_tag.attrs["style"] = span_style
|
||||
# Wrap the new <tag> element around the initial tag and return it
|
||||
initial_tag.wrap(tag)
|
||||
return initial_tag
|
||||
|
||||
def convert_initial_tag(self) -> Tag:
|
||||
self.change_attrs_with_corresponding_tags()
|
||||
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
||||
self.tag_inline_style = self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
||||
return self.tag_inline_style
|
||||
|
||||
|
||||
@@ -259,7 +274,7 @@ def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "
|
||||
# soup with converted styles from css
|
||||
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
|
||||
|
||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
# go through the tags with inline style + style parsed from css file
|
||||
|
||||
@@ -2,24 +2,18 @@ import re
|
||||
|
||||
|
||||
class LiveCartaConfig:
|
||||
"""Class of values that LiveCarta platform using and supports"""
|
||||
# tag with inline style to be updated with style attribute
|
||||
SUPPORTED_LEVELS = 5
|
||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
|
||||
"""Class of values that LiveCarta platform supports"""
|
||||
|
||||
list_types = ["circle", "disc", "armenian", "decimal",
|
||||
"decimal-leading-zero", "georgian", "lower-alpha", "lower-latin",
|
||||
"lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
|
||||
NUM_SUPPORTED_LEVELS = 5
|
||||
SUPPORTED_HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5"}
|
||||
|
||||
could_have_style_in_livecarta_regexp = re.compile(
|
||||
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)")
|
||||
# Regular expression to match HTML tags that can have a style attribute
|
||||
REGEX_TAGS_WITH_STYLE_ATTR = re.compile(
|
||||
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)"
|
||||
)
|
||||
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
||||
|
||||
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
||||
# Dictionary mapping CSS style attribute-value pairs to HTML tags
|
||||
STYLE_ATTRS_TO_TAGS = {
|
||||
("font-weight", "bold"): "strong",
|
||||
("font-weight", "600"): "strong",
|
||||
("font-weight", "700"): "strong",
|
||||
@@ -33,28 +27,26 @@ class LiveCartaConfig:
|
||||
("vertical-align", "super"): "sup"
|
||||
}
|
||||
|
||||
LIVECARTA_STYLES_CAN_BE_IN_TAG = {
|
||||
"p": ["text-align", "text-indent", "border-bottom", "border-top"],
|
||||
"li": ["text-align", "list-style-type"],
|
||||
"ul": ["list-style-type"],
|
||||
"ol": ["list-style-type"],
|
||||
r"(^h[1-9]$)": ["list-style-type"]
|
||||
# Dictionary mapping HTML tags to CSS style attributes that can be contained within them
|
||||
TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG = {
|
||||
"^p$": ["text-align", "text-indent", "border-bottom", "border-top", "border-left", "border-right",
|
||||
"background-color"],
|
||||
"^li$": ["text-align", "list-style-type"],
|
||||
"^ul$": ["list-style-type"],
|
||||
"^ol$": ["list-style-type"],
|
||||
r"(^h[1-9]$)": ["list-style-type", "border-bottom", "border-top", "border-left", "border-right",
|
||||
"background-color", "color"]
|
||||
}
|
||||
|
||||
"""
|
||||
Dictionary LIVECARTA_STYLE_ATTRS_REPLACE = { css property: css property to replace with }
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_REPLACE = {
|
||||
# Dictionary mapping CSS style attribute names to names that should replace them
|
||||
STYLE_ATTR_TO_REPLACEMENT = {
|
||||
"list-style": "list-style-type",
|
||||
}
|
||||
|
||||
"""
|
||||
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||
Style properties that can be used to fit LiveCarta css style convention.
|
||||
If property has empty list, it means that any value can be converted.
|
||||
If property has not empty list, it means that only certain property-value combinations can be transformed.
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS = {
|
||||
# Dictionary mapping CSS style attribute names to lists of allowed values
|
||||
# If an empty list is provided, any value is allowed for the attribute
|
||||
# If a non-empty list is provided, only values in the list are allowed for the attribute
|
||||
STYLE_ATTR_TO_VALUE_LIMIT = {
|
||||
"align": [],
|
||||
"font": [],
|
||||
"font-family": [],
|
||||
|
||||
@@ -111,24 +111,24 @@ class StyleReader:
|
||||
for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]:
|
||||
cleaned_value = re.sub(
|
||||
re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value)
|
||||
cleaned_value = replace_str(cleaned_value, LiveCartaConfig.LIVECARTA_STYLE_ATTRS[style_name])
|
||||
cleaned_value = replace_str(cleaned_value, LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[style_name])
|
||||
return cleaned_value
|
||||
|
||||
@staticmethod
|
||||
def style_conditions(style_value: str, style_name: str) -> Tuple[bool, bool]:
|
||||
constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
|
||||
constraints_on_value = LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT.get(
|
||||
style_name)
|
||||
value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[
|
||||
value_not_in_possible_values_list = style_value not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[
|
||||
style_name]
|
||||
return constraints_on_value, value_not_in_possible_values_list
|
||||
|
||||
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
|
||||
try:
|
||||
for i, style in reversed(list(enumerate(split_style))):
|
||||
if style.split(":")[0] in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE:
|
||||
style = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style.split(":")[0]] + ":" + style.split(":")[1]
|
||||
if style.split(":")[0] in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT:
|
||||
style = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style.split(":")[0]] + ":" + style.split(":")[1]
|
||||
style_name, style_value = style.split(":")
|
||||
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
|
||||
if style_name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT:
|
||||
# property not in STYLE_ATTR_TO_VALUE_LIMIT, remove
|
||||
split_style.remove(style)
|
||||
continue
|
||||
@@ -165,7 +165,7 @@ class StyleReader:
|
||||
|
||||
def process_inline_styles_in_html_soup(self, html_content):
|
||||
"""This function is designed to convert inline html styles"""
|
||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
@@ -194,10 +194,10 @@ class StyleReader:
|
||||
|
||||
def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
|
||||
style_type: cssutils.css.property.Property):
|
||||
if style_type.name in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE:
|
||||
if style_type.name in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT:
|
||||
# attributes to replace
|
||||
style_type.name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style_type.name]
|
||||
if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
|
||||
style_type.name = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style_type.name]
|
||||
if style_type.name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT:
|
||||
# property not in STYLE_ATTR_TO_VALUE_LIMIT, remove from css file
|
||||
css_rule.style[style_type.name] = ""
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user