diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py index 8650865..05c413e 100644 --- a/src/docx_converter/html_docx_processor.py +++ b/src/docx_converter/html_docx_processor.py @@ -14,13 +14,12 @@ class HtmlDocxProcessor: def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor): self.logger = logger self.html_soup = html_soup - self.body_tag = self.html_soup.body self.html_preprocessor = html_preprocessor self.style_preprocessor = style_preprocessor self.content: List[Tag] = [] def _font_to_span(self): - for font in self.body_tag.find_all("font"): + for font in self.html_soup.find_all("font"): font.name = "span" @@ -226,10 +225,12 @@ class HtmlDocxProcessor: self.logger.log("Inline style reading.") self.style_preprocessor.process_inline_styles_in_html_soup( - self.body_tag) + self.html_soup) self.logger.log("Inline style processing.") - modify_html_soup_with_css_styles(self.body_tag) + self.html_soup = modify_html_soup_with_css_styles(self.html_soup) + + self.body_tag = self.html_soup.body self.logger.log("Image processing.") images = process_images(access, path_to_html=html_path, diff --git a/src/inline_style_processor.py b/src/inline_style_processor.py index d63122a..cc7c14d 100644 --- a/src/inline_style_processor.py +++ b/src/inline_style_processor.py @@ -14,7 +14,7 @@ class InlineStyleProcessor: def __init__(self, tag_inline_style: Tag): # tag with inline style + style parsed from css file self.tag_inline_style = tag_inline_style - self.tag_inline_style.attrs['style']: str = self.process_inline_style() + self.tag_inline_style.attrs["style"]: str = self.process_inline_style() @staticmethod def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str: @@ -80,19 +80,19 @@ class InlineStyleProcessor: processed_style = ";".join(split_style)+';' margin_left_regexp = re.compile( - r"((margin-left|margin): *(-*\w+);*)") + r"((margin-left|margin): *-*((\d*)\.*\d+)\w+;*)") text_indent_regexp = re.compile( - r"(text-indent: *(-*\w+);*)") + r"(text-indent: *-*((\d*)\.*\d+)\w+;*)") has_margin = re.search(margin_left_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style) if has_margin: num_m = abs(int("0" + "".join( - filter(str.isdigit, str(has_margin.group(3)))))) + filter(str.isdigit, str(has_margin.group(4)))))) if has_text_indent: num_ti = abs(int("0" + "".join( - filter(str.isdigit, str(has_text_indent.group(2)))))) + filter(str.isdigit, str(has_text_indent.group(3)))))) processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(num_m - num_ti)) + "px; ") processed_style = processed_style.replace( @@ -106,7 +106,7 @@ class InlineStyleProcessor: elif has_text_indent: processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(int("0" + "".join( - filter(str.isdigit, str(has_text_indent.group(2))))))) + filter(str.isdigit, str(has_text_indent.group(3))))))) + "px; ") return processed_style return processed_style @@ -127,22 +127,25 @@ class InlineStyleProcessor: processed inline style """ - inline_style = self.tag_inline_style.attrs.get("style") + ";" - # 1. Remove white color if tag doesn"t have background color in style - inline_style = self.remove_white_if_no_bgcolor( - inline_style, self.tag_inline_style) - inline_style = inline_style.replace( - "list-style-image", "list-style-type") - # 2. Create list of styles from inline style - # replace all spaces between "; & letter" to ";" - style = re.sub(r"; *", ";", inline_style) - # when we split style by ";", last element of the list is "" - None (remove it) - split_inline_style: list = list(filter(None, style.split(";"))) - # 3. Duplicate styles check - if the tag had duplicate styles - # split_inline_style = self.duplicate_styles_check(split_inline_style) - # 4. Processing indents - inline_style: str = self.indents_processing(split_inline_style) - return inline_style + if self.tag_inline_style.attrs.get("style"): + inline_style = self.tag_inline_style.attrs.get("style") + ";" + # 1. Remove white color if tag doesn't have background color in style + inline_style = self.remove_white_if_no_bgcolor( + inline_style, self.tag_inline_style) + inline_style = inline_style.replace( + "list-style-image", "list-style-type") + # 2. Create list of styles from inline style + # replace all spaces between "; & letter" to ";" + style = re.sub(r"; *", ";", inline_style) + # when we split style by ";", last element of the list is "" - None (remove it) + split_inline_style: list = list(filter(None, style.split(";"))) + # 3. Duplicate styles check - if the tag had duplicate styles + # split_inline_style = self.duplicate_styles_check(split_inline_style) + # 4. Processing indents + inline_style: str = self.indents_processing(split_inline_style) + return inline_style + else: + return "" @staticmethod def check_style_to_be_tag(style: str) -> List[tuple]: diff --git a/src/style_reader.py b/src/style_reader.py index 08b0809..bda5912 100644 --- a/src/style_reader.py +++ b/src/style_reader.py @@ -126,17 +126,18 @@ class StyleReader: return constraints_on_value, value_not_in_possible_values_list def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: - for i, style in enumerate(split_style): + for i, style in reversed(list(enumerate(split_style))): style_name, style_value = style.split(":") if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" - return split_style + # property not in LIVECARTA_STYLE_ATTRS, remove + split_style.remove(style) + continue cleaned_value = self.clean_value(style_value, style_name) if all(self.style_conditions(cleaned_value, style_name)): - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove + split_style.remove(style) + continue else: if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data @@ -157,7 +158,7 @@ class StyleReader: split_style = self.update_inline_styles_to_livecarta_convention( split_style) - style = "; ".join(split_style) + style = "; ".join(split_style) if split_style else "" return style def process_inline_styles_in_html_soup(self, html_content):