From 256a1abba08142e293e14043a9f542eead6965b2 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 9 Dec 2022 16:24:22 +0300 Subject: [PATCH] LAW-5957 --- preset/epub_presets.json | 56 ++----------------------------- src/epub_converter/epub_solver.py | 3 +- src/html_presets_processor.py | 35 +++++++++++-------- src/livecarta_config.py | 2 ++ src/style_reader.py | 2 ++ 5 files changed, 29 insertions(+), 69 deletions(-) diff --git a/preset/epub_presets.json b/preset/epub_presets.json index d30e619..b4a5ce5 100644 --- a/preset/epub_presets.json +++ b/preset/epub_presets.json @@ -2,58 +2,6 @@ { "preset_name": "table_wrapper", "rules": [ - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "width", - "value": ".*" - } - ] - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "border", - "value": ".*" - } - ] - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "style", - "value": "border.*" - } - ] - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "bgcolor", - "value": ".*" - } - ] - } - }, { "tags": ["^section$", "^blockquote$"], "condition": { @@ -73,7 +21,7 @@ "preset_name": "replacer", "rules": [ { - "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"], + "tags": ["^h[6-9]$", "^figure$", "^section$", "blockquote"], "condition": null, "tag_to_replace": { "name": "p" @@ -127,7 +75,7 @@ } ] }, - { + { "preset_name": "attrs_remover", "rules": [ { diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 470d307..1992aa3 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -52,12 +52,13 @@ class EpubBook(BookSolver): if __name__ == "__main__": - epub_file_path = f"../../books/epub/9781614382264.epub" + epub_file_path = f"../../books/epub/Deep_Learning_with_Python_Second_Editio.epub" logger_object = BookLogger(name="epub") logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1]) html_preprocessor = HtmlPresetsProcessor( + logger=logger_object, preset_path="../../preset/epub_presets.json") style_preprocessor = StyleReader() html_processor = HtmlEpubProcessor(logger=logger_object, diff --git a/src/html_presets_processor.py b/src/html_presets_processor.py index 7de4c05..78ad9cf 100644 --- a/src/html_presets_processor.py +++ b/src/html_presets_processor.py @@ -2,7 +2,7 @@ import re import json from bs4 import BeautifulSoup, Tag from bs4.element import PageElement -from typing import List, Set, Dict, Union +from typing import Union from src.util.helpers import BookLogger @@ -29,42 +29,49 @@ class HtmlPresetsProcessor: "text": self._tags_with_text_condition } + @staticmethod def _tags_with_parent_condition(**kwargs): - found_tags: Set[Tag] = set() + found_tags: list[Tag] = list() + # add unique id in order not to add duplicates to the + # found_tags(because tag with subtag could duplicate found_tag) + u_id = 0 for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]): for tag in parent_tag.find_all([re.compile(tag) for tag in kwargs["tags"]]): - found_tags.add(tag) + if not tag.attrs.get("unique_id"): + tag.attrs["unique_id"] = u_id + u_id += 1 + found_tags.append(tag) return len(found_tags) != 0, list(found_tags) @staticmethod def _tags_with_child_condition(**kwargs): - found_tags: Set[Tag] = set() + found_tags: list[Tag] = list() for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]]): if tag.select(kwargs["family_condition"]): - found_tags.add(tag) + found_tags.append(tag) return len(found_tags) != 0, list(found_tags) @staticmethod def _tags_with_attrs_condition(**kwargs): - found_tags: Set[Tag] = set() + found_tags: list[Tag] = list() names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]] values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]] - attr_conditions: Dict[str, str] = dict(zip(names, values)) + attr_conditions: dict[str, re] = dict(zip(names, values)) for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]], attr_conditions): - found_tags.add(tag) + found_tags.append(tag) return len(found_tags) != 0, list(found_tags) @staticmethod def _tags_with_text_condition(**kwargs): # find all tags that are in List of tags and tags that contains required text - found_tags: Set[Tag] = set() + found_tags: list[Tag] = list() for tag in kwargs["body_tag"].find_all( lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))", t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]), t.text)): - found_tags.add(tag) + found_tags.append(tag) return len(found_tags) != 0, list(found_tags) @staticmethod @@ -104,7 +111,7 @@ class HtmlPresetsProcessor: def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, - class_: Union[List[str], str]): + class_: Union[list[str], str]): """Function inserts span before tag aren't supported by LiveCarta""" new_tag: Tag = chapter_tag.new_tag("span") new_tag.attrs["id"] = id_ or "" @@ -201,7 +208,7 @@ class HtmlPresetsProcessor: def process_tags(self, body_tag: BeautifulSoup, - preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], + preset_rules: list[dict[str, Union[list[str], str, dict[str, Union[list[dict[str, str]], int, str]]]]], action): """ Function does action with tags @@ -220,9 +227,9 @@ class HtmlPresetsProcessor: """ for preset_rule in preset_rules: - tags: List[str] = preset_rule["tags"] if preset_rule.get( + tags: list[str] = preset_rule["tags"] if preset_rule.get( "tags") else preset_rule["condition"]["tags"] - found_tags: List[Tag] = [] + found_tags: list[Tag] = [] if preset_rule["condition"]: conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v) for condition_on_tag in conditions_on_tag: diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 8f1dc24..3d671c1 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -131,6 +131,8 @@ class LiveCartaConfig: "border-left-width": [], "border-bottom-width": [], "border-top": [], + "border-right": [], + "border-left": [], "border-bottom": [], "list-style-type": [], "list-style-image": [], diff --git a/src/style_reader.py b/src/style_reader.py index d178e32..c3f56bb 100644 --- a/src/style_reader.py +++ b/src/style_reader.py @@ -32,6 +32,8 @@ class StyleReader: "border-left-width": self.convert_tag_style_values, "border-bottom-width": self.convert_tag_style_values, "border-top": self.convert_tag_style_values, + "border-right": self.convert_tag_style_values, + "border-left": self.convert_tag_style_values, "border-bottom": self.convert_tag_style_values, "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", "list-style-image": lambda x: "disc",