diff --git a/presets/docx_presets.json b/presets/docx_presets.json index 96f861b..6d5613b 100644 --- a/presets/docx_presets.json +++ b/presets/docx_presets.json @@ -1,5 +1,5 @@ [ - { + { "preset_name": "wrapper", "rules": [ { @@ -34,7 +34,17 @@ { "name": "title", "value": "footer" - }, + } + ], + "text": null + } + }, + { + "tags": ["^div$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ { "name": "id", "value": "^Table of Contents\\d+" @@ -104,15 +114,44 @@ "condition": { "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)", "child_tags": null, + "attrs": null, + "text": null + } + }, + { + "tags": ["^span$"], + "condition": { + "parent_tags": null, + "child_tags": null, "attrs": [ { "name": "style", "value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)" - }, + } + ], + "text": null + } + }, + { + "tags": ["^span$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ { "name": "lang", "value": "^ru-RU$" - }, + } + ], + "text": null + } + }, + { + "tags": ["^span$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ { "name": "face", "value": "^Times New Roman[\\w, ]+$" @@ -148,6 +187,15 @@ "tags": ["^u$"], "condition": { "parent_tags": ":is(a)", + "child_tags": null, + "attrs": null, + "text": null + } + }, + { + "tags": ["^u$"], + "condition": { + "parent_tags": null, "child_tags": ":is(a)", "attrs": null, "text": null diff --git a/presets/epub_presets.json b/presets/epub_presets.json index 7c8f672..d30e619 100644 --- a/presets/epub_presets.json +++ b/presets/epub_presets.json @@ -11,15 +11,42 @@ { "name": "width", "value": ".*" - }, + } + ] + } + }, + { + "tags": ["^div$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ { "name": "border", "value": ".*" - }, + } + ] + } + }, + { + "tags": ["^div$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ { "name": "style", "value": "border.*" - }, + } + ] + } + }, + { + "tags": ["^div$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": [ { "name": "bgcolor", "value": ".*" @@ -69,7 +96,7 @@ { "tags": ["^code$", "^kbd$", "^var$"], "condition": { - "parent_tags": ":not(pre)", + "parent_tags": ":not(pre, span)", "child_tags": null, "attrs": null }, @@ -99,6 +126,15 @@ } } ] + }, + { + "preset_name": "attrs_remover", + "rules": [ + { + "tags": ["^sup$"], + "condition": null + } + ] }, { "preset_name": "attr_replacer", @@ -171,4 +207,4 @@ } ] } -] \ No newline at end of file +] diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 5a6a9b5..3106797 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,5 +1,6 @@ import json import codecs +import logging from src.book_solver import BookSolver from src.util.helpers import BookLogger @@ -30,11 +31,19 @@ class EpubBook(BookSolver): json for LiveCarta platform """ - html_preprocessor = HtmlPresetsProcessor( - logger=self.logger_object, preset_path="presets/epub_presets.json") style_preprocessor = StyleReader() - html_processor = HtmlEpubProcessor(logger=self.logger_object, - html_preprocessor=html_preprocessor) + # Parses and cleans html, gets list of tags, gets footnotes + try: + html_preprocessor = HtmlPresetsProcessor( + logger=self.logger_object, preset_path="presets/epub_presets.json") + html_processor = HtmlEpubProcessor(logger=self.logger_object, + html_preprocessor=html_preprocessor) + except Exception as exc: + self.logger_object.log( + "Error has occurred while processing .html", logging.ERROR) + self.logger_object.log_error_to_main_log() + self.status_wrapper.set_error() + raise exc json_converter = EpubConverter( self.book_path, access=self.access, logger=self.logger_object, style_processor=style_preprocessor, html_processor=html_processor) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index fd29817..711406d 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -107,12 +107,10 @@ class HtmlEpubProcessor: len(text_preparing(tag)) != 0 and re.findall(r"^h[1-5]$", tag.name or chapter_tag.name)) if title_in_text: - self.html_preprocessor._add_span_to_save_ids_for_links( - title_in_text[-1], chapter_tag) + self.html_preprocessor.add_span_to_save_ids_for_links(title_in_text[-1], chapter_tag) title_in_text[-1].extract() elif text_in_title: - [self.html_preprocessor._add_span_to_save_ids_for_links( - tag, chapter_tag) for tag in text_in_title] + [self.html_preprocessor.add_span_to_save_ids_for_links(tag, chapter_tag) for tag in text_in_title] [tag.extract() for tag in text_in_title] @staticmethod @@ -135,12 +133,12 @@ class HtmlEpubProcessor: and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): del tag.attrs["class"] - def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: + def prepare_content(self, title: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: """ Function finalise processing/cleaning content Parameters ---------- - title_str: str + title: str chapter_tag: BeautifulSoup, soup object @@ -170,7 +168,7 @@ class HtmlEpubProcessor: self._wrap_strings_with_p(chapter_tag) # 3. if remove_title_from_chapter: - self._remove_headings_content(chapter_tag, title_str) + self._remove_headings_content(chapter_tag, title) # 4. _process_presets( html_preprocessor=self.html_preprocessor, html_soup=chapter_tag) diff --git a/src/html_presets_processor.py b/src/html_presets_processor.py index c908ccb..7de4c05 100644 --- a/src/html_presets_processor.py +++ b/src/html_presets_processor.py @@ -2,7 +2,7 @@ import re import json from bs4 import BeautifulSoup, Tag from bs4.element import PageElement -from typing import List, Dict, Union +from typing import List, Set, Dict, Union from src.util.helpers import BookLogger @@ -16,15 +16,60 @@ class HtmlPresetsProcessor: "table_wrapper": self._process_tag_using_table, "decomposer": self._decompose_tag, "replacer": self._replace_tag, + "attrs_remover": self._remove_attrs, "attr_replacer": self._replace_attr, "unwrapper": self._unwrap_tag, "inserter": self._insert_tag, "text_replacer": self._replace_text } + self.conditions = { + "parent_tags": self._tags_with_parent_condition, + "child_tags": self._tags_with_child_condition, + "attrs": self._tags_with_attrs_condition, + "text": self._tags_with_text_condition + } + + @staticmethod + def _tags_with_parent_condition(**kwargs): + found_tags: Set[Tag] = set() + for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]): + for tag in parent_tag.find_all([re.compile(tag) for tag in kwargs["tags"]]): + found_tags.add(tag) + return len(found_tags) != 0, list(found_tags) + + @staticmethod + def _tags_with_child_condition(**kwargs): + found_tags: Set[Tag] = set() + for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]]): + if tag.select(kwargs["family_condition"]): + found_tags.add(tag) + return len(found_tags) != 0, list(found_tags) + + @staticmethod + def _tags_with_attrs_condition(**kwargs): + found_tags: Set[Tag] = set() + names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]] + values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]] + attr_conditions: Dict[str, str] = dict(zip(names, values)) + for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]], + attr_conditions): + found_tags.add(tag) + return len(found_tags) != 0, list(found_tags) + + @staticmethod + def _tags_with_text_condition(**kwargs): + # find all tags that are in List of tags and tags that contains required text + found_tags: Set[Tag] = set() + for tag in kwargs["body_tag"].find_all( + lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))", + t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]), + t.text)): + found_tags.add(tag) + return len(found_tags) != 0, list(found_tags) @staticmethod def _wrap_tag(**kwargs): - kwargs["tag"].wrap(kwargs["body_tag"].new_tag( + kwargs["found_tag"].wrap(kwargs["body_tag"].new_tag( kwargs["rule"]["tag_to_wrap"]["name"])) @staticmethod @@ -34,13 +79,13 @@ class HtmlPresetsProcessor: parent_tag.attrs[key] = tag.attrs[key] def _decompose_tag(self, **kwargs): - if kwargs["tag"].parent: - self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent) - kwargs["tag"].decompose() + if kwargs["found_tag"].parent: + self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent) + kwargs["found_tag"].decompose() @staticmethod - def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup], - chapter_tag: BeautifulSoup): + def add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup], + chapter_tag: BeautifulSoup): """ Function adds span with id from tag_to_be_removed because this tag will be removed(unwrapped/extract) @@ -82,29 +127,33 @@ class HtmlPresetsProcessor: kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag( "tr"), kwargs["body_tag"].new_tag("td") td.attrs["bgcolor"] = bg_color - kwargs["tag"].wrap(td) + kwargs["found_tag"].wrap(td) td.wrap(tr) tr.wrap(tbody) tbody.wrap(table) table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) return table _wrap_tag_with_table( - width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get( + width=kwargs["found_tag"].attrs["width"] if kwargs["found_tag"].attrs.get( "width") else "100", - border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get( + border=kwargs["found_tag"].attrs["border"] if kwargs["found_tag"].attrs.get( "border") else None, - bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None) - self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["body_tag"]) - kwargs["tag"].unwrap() + bg_color=kwargs["found_tag"].attrs["bgcolor"] if kwargs["found_tag"].attrs.get("bgcolor") else None) + self.add_span_to_save_ids_for_links(kwargs["found_tag"], kwargs["body_tag"]) + kwargs["found_tag"].unwrap() @staticmethod def _replace_tag(**kwargs): tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"] - kwargs["tag"].name = tag_to_replace + kwargs["found_tag"].name = tag_to_replace if kwargs["rule"]["tag_to_replace"].get("attrs"): dict_attributes = {attr["name"]: attr["value"] for attr in kwargs["rule"]["tag_to_replace"]["attrs"]} - kwargs["tag"].attrs = dict_attributes + kwargs["found_tag"].attrs = dict_attributes + + @staticmethod + def _remove_attrs(**kwargs): + kwargs["found_tag"].attrs = {} @staticmethod def _replace_attr(**kwargs): @@ -114,21 +163,21 @@ class HtmlPresetsProcessor: attr_to_replace, attr_value_to_replace =\ kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] if attr_to_replace: - kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \ - if kwargs["tag"].get(attr_name)\ + kwargs["found_tag"][attr_to_replace] = kwargs["found_tag"][attr_name] \ + if kwargs["found_tag"].get(attr_name)\ else "" if attr_value_to_replace: - kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace - del kwargs["tag"][attr_name] + kwargs["found_tag"].attrs[attr_to_replace] = attr_value_to_replace + del kwargs["found_tag"][attr_name] elif attr_value_to_replace: - kwargs["tag"].attrs[attr_name] = attr_value_to_replace + kwargs["found_tag"].attrs[attr_name] = attr_value_to_replace elif attr_name: - del kwargs["tag"][attr_name] + del kwargs["found_tag"][attr_name] def _unwrap_tag(self, **kwargs): - if kwargs["tag"].parent: - self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent) - kwargs["tag"].unwrap() + if kwargs["found_tag"].parent: + self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent) + kwargs["found_tag"].unwrap() @staticmethod def _insert_tag(**kwargs): @@ -138,29 +187,29 @@ class HtmlPresetsProcessor: kwargs["body_tag"].new_tag( kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes) # insert all items that was in tag to subtag and remove from tag - for content in reversed(kwargs["tag"].contents): + for content in reversed(kwargs["found_tag"].contents): tag_to_insert.insert(0, content.extract()) # wrap subtag with items - kwargs["tag"].append(tag_to_insert) + kwargs["found_tag"].append(tag_to_insert) @staticmethod def _replace_text(**kwargs): - if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string): + if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["found_tag"].string): new_text = re.sub(re.compile( - kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string) - kwargs["tag"].string.replace_with(new_text) + kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["found_tag"].string) + kwargs["found_tag"].string.replace_with(new_text) - @staticmethod - def _process_tags(body_tag: BeautifulSoup, - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], - action): + def process_tags(self, + body_tag: BeautifulSoup, + preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], + action): """ Function does action with tags Parameters ---------- body_tag: BeautifulSoup Tag & contents of the body tag - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] + preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] list of conditions when fire function action: function action what to do with tag @@ -170,39 +219,34 @@ class HtmlPresetsProcessor: Body Tag with processed certain tags """ - for rule in rules: - tags: List[str] = rule["tags"] if rule.get( - "tags") else rule["condition"]["tags"] - if rule["condition"]: - for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): - if condition_on_tag[0] == "parent_tags": - for parent_tag in body_tag.select(condition_on_tag[1]): - for tag in parent_tag.find_all([re.compile(tag) for tag in tags]): - # parent_tag != tag.parent - action(body_tag=body_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "child_tags": - for tag in body_tag.find_all([re.compile(tag) for tag in tags]): - if tag.select(condition_on_tag[1]): - action(body_tag=body_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "attrs": - for attr in rule["condition"]["attrs"]: - for tag in body_tag.find_all([re.compile(tag) for tag in tags], - {attr["name"]: re.compile(fr"{attr['value']}")}): - action(body_tag=body_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "text": - # find all tags that are in List of tags and tags that contains required text - for tag in body_tag.find_all( - lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))", - t.name) and re.search(re.compile(rule["condition"]["text"]), - t.text)): - action(body_tag=body_tag, tag=tag, rule=rule) + for preset_rule in preset_rules: + tags: List[str] = preset_rule["tags"] if preset_rule.get( + "tags") else preset_rule["condition"]["tags"] + found_tags: List[Tag] = [] + if preset_rule["condition"]: + conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v) + for condition_on_tag in conditions_on_tag: + condition_func = self.conditions[condition_on_tag[0]] + was_found, f_tags = condition_func(body_tag=body_tag, + tags=tags, + rule=preset_rule, + family_condition=condition_on_tag[1]) + found_tags = found_tags + f_tags if was_found else [] + if not was_found: + break + # if there are several conditions on tags and found_tags isn't empty + if len(conditions_on_tag) > 1 and found_tags: + # tags satisfying all conditions(>1) + found_tags = [tag for tag in found_tags if found_tags.count(tag) > 1] + for found_tag in found_tags: + action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule) else: - for tag in body_tag.find_all([re.compile(tag) for tag in tags]): - action(body_tag=body_tag, tag=tag, rule=rule) + for found_tag in body_tag.find_all([re.compile(tag) for tag in tags]): + action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule) def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup): - for rule in html_preprocessor.preset: + for preset in html_preprocessor.preset: # html_preprocessor.logger.log(rule["preset_name"].title() + " process.") - action = html_preprocessor.name2action[rule["preset_name"]] - html_preprocessor._process_tags(html_soup, rule["rules"], action) + action = html_preprocessor.name2action[preset["preset_name"]] + html_preprocessor.process_tags(html_soup, preset["rules"], action) diff --git a/src/style_reader.py b/src/style_reader.py index daa2c3e..8831f9a 100644 --- a/src/style_reader.py +++ b/src/style_reader.py @@ -109,24 +109,27 @@ class StyleReader: return constraints_on_value, value_not_in_possible_values_list def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: - for i, style in reversed(list(enumerate(split_style))): - style_name, style_value = style.split(":") - if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove - split_style.remove(style) - continue + try: + for i, style in reversed(list(enumerate(split_style))): + style_name, style_value = style.split(":") + if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove + split_style.remove(style) + continue - cleaned_value = self.clean_value(style_value, style_name) - if all(self.style_conditions(cleaned_value, style_name)): - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove - split_style.remove(style) - continue - else: - if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name] - style_value = func(cleaned_value) - split_style[i] = style_name + ":" + style_value + cleaned_value = self.clean_value(style_value, style_name) + if all(self.style_conditions(cleaned_value, style_name)): + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove + split_style.remove(style) + continue + else: + if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name] + style_value = func(cleaned_value) + split_style[i] = style_name + ":" + style_value + except ValueError as ve: + print(f"Style value isn't correct.") return split_style def build_inline_style_content(self, style: str) -> str: