From 047bfeca205e268fb2c0b75c6b8cba45705daf34 Mon Sep 17 00:00:00 2001 From: Kibzik Date: Wed, 15 Mar 2023 20:00:32 +0300 Subject: [PATCH] Fix span wrapper processing --- src/book_solver.py | 2 +- src/data_objects.py | 4 +- .../libre_html2json_converter.py | 8 +-- src/epub_converter/epub_converter.py | 2 +- src/inline_style_processor.py | 65 ++++++++++++------- src/livecarta_config.py | 56 +++++++--------- src/style_reader.py | 20 +++--- 7 files changed, 82 insertions(+), 75 deletions(-) diff --git a/src/book_solver.py b/src/book_solver.py index 8c8e1b8..5a88d20 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -33,7 +33,7 @@ class BookSolver: self.status_wrapper = BookStatusWrapper( access, self.book_logger, book_id) - assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ + assert LiveCartaConfig.NUM_SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADER_TAGS), \ "Length of headers doesn't match allowed levels." def save_file(self, content: bytes, path_to_save: str, file_type: str) -> str: diff --git a/src/data_objects.py b/src/data_objects.py index f1ca163..80f1b12 100644 --- a/src/data_objects.py +++ b/src/data_objects.py @@ -67,14 +67,14 @@ class ChapterItem: for i in self.sub_items: sub_dicts.append(i.to_dict(lvl + 1)) - if lvl > LiveCartaConfig.SUPPORTED_LEVELS: + if lvl > LiveCartaConfig.NUM_SUPPORTED_LEVELS: return { "title": self.title, "contents": [self.content] + [x['contents'] for x in sub_dicts], "sub_items": [] } - if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts: + if (lvl == LiveCartaConfig.NUM_SUPPORTED_LEVELS) and sub_dicts: return { "title": self.title, "contents": [self.content] + flatten([x['contents'] for x in sub_dicts]), diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py index 948ecf1..d746624 100644 --- a/src/docx_converter/libre_html2json_converter.py +++ b/src/docx_converter/libre_html2json_converter.py @@ -49,7 +49,7 @@ class LibreHtml2JsonConverter: result, ind """ - if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS: title = str(self.content[ind]) title = title.replace(f"<{self.content[ind].name}>", "") title = title.replace(f"", "") @@ -67,7 +67,7 @@ class LibreHtml2JsonConverter: while ind < len(self.content): # 1. next tag is a header - if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS: outline = int(re.sub(r"^h", "", self.content[ind].name)) # - recursion step until h_i > h_initial if outline > curr_outline: @@ -116,13 +116,13 @@ class LibreHtml2JsonConverter: while ind < len(self.content): res = {} - if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADER_TAGS: res, ind = self.header_to_livecarta_chapter_item(ind) else: chapter_title = f"Untitled chapter {ch_num}" chapter = [] - while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: + while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADER_TAGS: if not self._is_empty_p_tag(self.content[ind]): chapter.append(self.format_html( str(self.content[ind]))) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 461b203..9eec34e 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -564,7 +564,7 @@ class EpubConverter: indent: str = " " * lvl self.book_logger.log(indent + f"Chapter: {title} is processing.") - is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS + is_chapter: bool = lvl <= LiveCartaConfig.NUM_SUPPORTED_LEVELS self.book_logger.log(indent + "Process title.") title_preprocessed: str = self.html_processor.prepare_title(title) self.book_logger.log(indent + "Process content.") diff --git a/src/inline_style_processor.py b/src/inline_style_processor.py index 8985206..06c5fae 100644 --- a/src/inline_style_processor.py +++ b/src/inline_style_processor.py @@ -171,7 +171,7 @@ class InlineStyleProcessor: """ styles_to_remove = [] - for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: + for k in LiveCartaConfig.STYLE_ATTRS_TO_TAGS: if f"{k[0]}:{k[1]}" in style: styles_to_remove.append(k) return styles_to_remove @@ -182,7 +182,7 @@ class InlineStyleProcessor: for i, (attr, value) in enumerate(styles_to_remove): self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\ .replace(f"{attr}:{value};", "").strip() - corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( + corr_tag_name = LiveCartaConfig.STYLE_ATTRS_TO_TAGS[( attr, value)] correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name) for content in reversed(self.tag_inline_style.contents): @@ -190,41 +190,56 @@ class InlineStyleProcessor: self.tag_inline_style.append(correspond_tag) @staticmethod - def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag): + def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag) -> Tag: """Function designed to save style attrs that cannot be in tag.name -> span""" - dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG)) + + # Compile a regex pattern to match tag names that can have certain style attributes + dictkeys_pattern = re.compile("|".join(LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG)) + + # Check if the tag's name is one of the allowed tags, and it has a "style" attribute if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"): - styles_can_be_in_tag = [style - for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items() - if re.match(tag, initial_tag.name) - for style in styles] - styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS - if attr not in styles_can_be_in_tag] + # Get a list of style attributes that can be in the tag + styles_can_be_in_tag = [ + style for tag, styles in LiveCartaConfig.TAGS_TO_STYLE_ATTRS_CAN_BE_IN_TAG.items() + if re.match(tag, initial_tag.name) for style in styles + ] + # Get a list of style attributes that cannot be in the tag + styles_cant_be_in_tag = [ + attr for attr in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT if attr not in styles_can_be_in_tag + ] + # Get the "style" attribute value of the initial tag span_style = initial_tag.attrs["style"] - # here check that this style is exactly the same. - # Not "align" when we have "text-align", or "border" when we have "border-top" - styles_to_be_saved_in_span = [((attr + ":") in span_style) & ( - "-" + attr not in span_style) for attr in styles_cant_be_in_tag] + # Check that the style attributes in "style" are exactly the same and wrap them in a tag + styles_to_be_saved_in_span = [ + (attr + ":") in span_style and ("-" + attr) not in span_style for attr in styles_cant_be_in_tag + ] if any(styles_to_be_saved_in_span): - # if we find styles that cannot be in -> wrap them in span - tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") + # Create a new element with the same tag name as the initial tag + tag = BeautifulSoup(features="lxml").new_tag(initial_tag.name) + # Create an empty string for the style attribute style = "" - possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag] + # Compile a list of regex patterns to match style attributes that can be in the tag + possible_attrs_regexp = [ + re.compile(fr"({style}: *[#a-zA-Z\d]+;)", re.IGNORECASE) for style in styles_can_be_in_tag + ] + # Iterate over the list of regex patterns and search for matching style attributes for possible_attr_regexp in possible_attrs_regexp: - has_style_attrs = re.search( - possible_attr_regexp, span_style) + has_style_attrs = re.search(possible_attr_regexp, span_style) if has_style_attrs and has_style_attrs.group(1): style += has_style_attrs.group(1) - span_style = span_style.replace( - has_style_attrs.group(1), "") - tag.attrs["style"] = style - initial_tag.name = "span" + span_style = span_style.replace(has_style_attrs.group(1), "") + # Add the style attribute to the new element if it exists + if style: + tag.attrs["style"] = style + # Set the "style" attribute of the initial tag to the remaining style attributes initial_tag.attrs["style"] = span_style + # Wrap the new element around the initial tag and return it initial_tag.wrap(tag) + return initial_tag def convert_initial_tag(self) -> Tag: self.change_attrs_with_corresponding_tags() - self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) + self.tag_inline_style = self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) return self.tag_inline_style @@ -259,7 +274,7 @@ def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = " # soup with converted styles from css inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") - tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR, attrs={"style": re.compile(".*")}) # go through the tags with inline style + style parsed from css file diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 0d5f0b5..528c514 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -2,24 +2,18 @@ import re class LiveCartaConfig: - """Class of values that LiveCarta platform using and supports""" - # tag with inline style to be updated with style attribute - SUPPORTED_LEVELS = 5 - SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"} + """Class of values that LiveCarta platform supports""" - list_types = ["circle", "disc", "armenian", "decimal", - "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin", - "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"] + NUM_SUPPORTED_LEVELS = 5 + SUPPORTED_HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5"} - could_have_style_in_livecarta_regexp = re.compile( - "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)") + # Regular expression to match HTML tags that can have a style attribute + REGEX_TAGS_WITH_STYLE_ATTR = re.compile( + "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)" + ) - """ - LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } - -

Tuple[bool, bool]: - constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( + constraints_on_value = LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT.get( style_name) - value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ + value_not_in_possible_values_list = style_value not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT[ style_name] return constraints_on_value, value_not_in_possible_values_list def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: try: for i, style in reversed(list(enumerate(split_style))): - if style.split(":")[0] in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE: - style = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style.split(":")[0]] + ":" + style.split(":")[1] + if style.split(":")[0] in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT: + style = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style.split(":")[0]] + ":" + style.split(":")[1] style_name, style_value = style.split(":") - if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + if style_name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT: # property not in LIVECARTA_STYLE_ATTRS, remove split_style.remove(style) continue @@ -165,7 +165,7 @@ class StyleReader: def process_inline_styles_in_html_soup(self, html_content): """This function is designed to convert inline html styles""" - tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + tags_with_inline_style = html_content.find_all(LiveCartaConfig.REGEX_TAGS_WITH_STYLE_ATTR, attrs={"style": re.compile(".*")}) for tag_initial_inline_style in tags_with_inline_style: @@ -194,10 +194,10 @@ class StyleReader: def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, style_type: cssutils.css.property.Property): - if style_type.name in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE: + if style_type.name in LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT: # attributes to replace - style_type.name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_REPLACE[style_type.name] - if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + style_type.name = LiveCartaConfig.STYLE_ATTR_TO_REPLACEMENT[style_type.name] + if style_type.name not in LiveCartaConfig.STYLE_ATTR_TO_VALUE_LIMIT: # property not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = "" return