From 9e31d3152c46992215d2685b3e16c2ba0630c9de Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 5 Sep 2022 17:23:03 +0300 Subject: [PATCH] Add replacer(-two_columns,-font_to_span, preproc_headings) --- presets/docx_presets.json | 70 ++++++++-- src/docx_converter/html_docx_processor.py | 157 +++++++--------------- 2 files changed, 106 insertions(+), 121 deletions(-) diff --git a/presets/docx_presets.json b/presets/docx_presets.json index 2d6e141..16f55c6 100644 --- a/presets/docx_presets.json +++ b/presets/docx_presets.json @@ -11,12 +11,63 @@ { "name": "title", "value": "footer" + }, + { + "name": "id", + "value": "^Table of Contents\\d+" } ] } } ] }, + { + "preset_name": "replacer", + "rules": [ + { + "tags": ["^font$"], + "condition": null, + "tag_to_replace": "span" + }, + { + "tags": ["^h[6-9]$"], + "condition": null, + "tag_to_replace": "p" + }, + { + "tags": ["^div$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ + { + "name": "style", + "value": "column-count: 2" + } + ] + }, + "tag_to_replace": "p" + } + ] + }, + { + "preset_name": "attr_replacer", + "rules": [ + { + "attr": { + "name": "style", + "value": "column-count: 2" + }, + "condition": { + "tags": ["^p$"] + }, + "attr_to_replace": { + "name": null, + "value": "columns2" + } + } + ] + }, { "preset_name": "unwrapper", "rules": [ @@ -33,16 +84,7 @@ { "name": "lang", "value": "^ru-RU$" - } - ] - } - }, - { - "tags": ["^font$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ + }, { "name": "face", "value": "^Times New Roman[\\w, ]+$" @@ -53,7 +95,7 @@ { "tags": ["^p$"], "condition": { - "parent_tags": "li", + "parent_tags": ":is(li)", "child_tags": null, "attrs": null } @@ -74,8 +116,8 @@ { "tags": ["^u$"], "condition": { - "parent_tags": "a", - "child_tags": "a", + "parent_tags": ":is(a)", + "child_tags": ":is(a)", "attrs": null } }, @@ -89,4 +131,4 @@ } ] } -] \ No newline at end of file +] diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py index 9ea15c2..213d2fc 100644 --- a/src/docx_converter/html_docx_processor.py +++ b/src/docx_converter/html_docx_processor.py @@ -22,6 +22,8 @@ class HTMLDocxProcessor: self.style_processor = style_processor self.name2action = { "decomposer": self._decompose_tag, + "replacer": self._replace_tag, + "attr_replacer": self._replace_attr, "unwrapper": self._unwrap_tag } self.top_level_headers = None @@ -59,15 +61,34 @@ class HTMLDocxProcessor: f"Tag name: {tag.name}") @staticmethod - def _decompose_tag(tag): - tag.decompose() + def _decompose_tag(**kwargs): + kwargs["tag"].decompose() @staticmethod - def _unwrap_tag(tag): - tag.unwrap() + def _replace_tag(**kwargs): + tag_to_replace: str = kwargs["rule"]["tag_to_replace"] + kwargs["tag"].name = tag_to_replace @staticmethod - def _process_tags(body_tag: BeautifulSoup, + def _replace_attr(**kwargs): + attr, attr_value =\ + kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"] + attr_to_replace, attr_value_to_replace =\ + kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] + if attr_to_replace: + kwargs["tag"][attr_to_replace] = kwargs["tag"][attr] + if attr_value_to_replace: + kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace + del kwargs["tag"][attr] + elif attr_value_to_replace: + kwargs["tag"].attrs[attr] = attr_value_to_replace + + @staticmethod + def _unwrap_tag(**kwargs): + kwargs["tag"].unwrap() + + @staticmethod + def _process_tags(body_tag: Tag, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], action): """ @@ -87,104 +108,32 @@ class HTMLDocxProcessor: """ for rule in rules: - tags: List[str] = rule["tags"] + tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"] if rule["condition"]: for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): if condition_on_tag[0] == "parent_tags": for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag) for tag in tags])): tag.parent.attrs.update(tag.attrs) - action(tag) + action(body_tag=body_tag, tag=tag, rule=rule) elif condition_on_tag[0] == "child_tags": for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1] for tag in tags])): - action(tag) + action(body_tag=body_tag, tag=tag, rule=rule) elif condition_on_tag[0] == "attrs": for attr in rule["condition"]["attrs"]: for tag in body_tag.find_all([re.compile(tag) for tag in tags], {attr["name"]: re.compile(fr"{attr['value']}")}): - action(tag) + action(body_tag=body_tag, tag=tag, rule=rule) + # attr replacer + elif condition_on_tag[0] == "tags": + attr = rule["attr"] + for tag in body_tag.find_all([re.compile(tag) for tag in tags], + {attr['name']: re.compile(fr"{attr['value']}")}): + action(body_tag=body_tag, tag=tag, rule=rule) else: for tag in body_tag.find_all([re.compile(tag) for tag in tags]): - action(tag) - - @classmethod - def convert_pt_to_px(cls, value: float) -> float: - value = float(value) - if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE: - return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE - else: - return value - - @classmethod - def convert_font_pt_to_px(cls, style: str) -> str: - """ - Function converts point in the font-size to pixels. - Parameters - ---------- - style: str - str with style to proces - - Returns - ------- - : str - str with converted style - - """ - size = re.search(r"font-size: (\d{1,3})pt", style) - if size is None: - return style - size = size.group(1) - new_size = cls.convert_pt_to_px(size) - if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: - return "" - return re.sub(size + "pt", str(new_size) + "px", style) - - def _font_to_span(self): - """ - Function to convert tag to . - If font style is default, then remove this tag. - """ - fonts = self.body_tag.find_all("font") - for font in fonts: - face, style, color =\ - font.get("face"), font.get("style"), font.get("color") - - font.attrs, font.name = {}, "span" - if style: - style = self.convert_font_pt_to_px(style) - if style != "": - if color and color in LiveCartaConfig.COLORS_MAP: - style += f"; color: {color};" - font.attrs["style"] = style - elif color and color in LiveCartaConfig.COLORS_MAP: - font.attrs["style"] = f"color: {color};" - - if len(font.attrs) == 0: - font.unwrap() - - # on this step there should be no more tags - assert len(self.body_tag.find_all("font")) == 0 - - def clean_trash(self): - """Function to remove all styles and tags we don"t need.""" - # todo replacer - self._font_to_span() - - # replace toc with empty tag - tables = self.body_tag.find_all( - "div", id=re.compile(r"^Table of Contents\d+")) - for table in tables: - table.wrap(self.html_soup.new_tag("TOC")) - table.decompose() - - def _preprocessing_headings(self): - # todo replacer - """Function to convert all lower level headings to p tags""" - pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$" - header_tags = self.body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = "p" + action(body_tag=body_tag, tag=tag, rule=rule) def _process_paragraph(self): """Function to process

tags (text-align and text-indent value).""" @@ -237,16 +186,6 @@ class HTMLDocxProcessor: if style: p.attrs["style"] = style - def _process_two_columns(self): - """Function to process paragraphs which has two columns layout.""" - # todo replacer - two_columns = self.body_tag.find_all("div", style="column-count: 2") - for div in two_columns: - for child in div.children: - if child.name == "p": - child["class"] = "columns2" - div.unwrap() - def _process_quotes(self): """ Function to process block quotes. @@ -285,6 +224,14 @@ class HTMLDocxProcessor: table.replaceWith(new_div) + @staticmethod + def convert_pt_to_px(value: float) -> float: + value = float(value) + if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE: + return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE + else: + return value + def _process_tables(self): """Function to process tables. Set "border" attribute.""" tables = self.body_tag.find_all("table") @@ -490,24 +437,20 @@ class HTMLDocxProcessor: self.logger.log(f"Processing TOC and headers.") self._process_toc_links() + for rule in self.preset: + self.logger.log(rule["preset_name"] + " process.") + action = self.name2action[rule["preset_name"]] + self._process_tags(self.body_tag, rule["rules"], action) + self.logger.log("CSS inline style preprocessing.") self.style_processor.process_inline_styles_in_html_soup(self.html_soup) self.logger.log("CSS inline style processing.") modify_html_soup_with_css_styles(self.html_soup) - for rule in self.preset: - self.logger.log(rule["preset_name"] + " process.") - action = self.name2action[rule["preset_name"]] - self._process_tags(self.body_tag, rule["rules"], action) - - self.clean_trash() - # process main elements of the .html doc self.logger.log(f"Processing main elements of html.") - self._preprocessing_headings() self._process_paragraph() - self._process_two_columns() self.logger.log("Block quotes processing.") self._process_quotes()