From 0481f5e98f15c0a45ed583d5a8e735f3bfce2d28 Mon Sep 17 00:00:00 2001 From: Kibzik Date: Tue, 23 May 2023 14:18:36 +0300 Subject: [PATCH] LAW-6736|Fix duplicate parent-child tags in attr condition --- consumer.py | 2 +- src/book_solver.py | 2 +- src/docx_converter/docx_solver.py | 4 ++-- src/epub_converter/epub_solver.py | 2 +- src/html_presets_processor.py | 27 ++++++++++++++++----------- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/consumer.py b/consumer.py index b5fc79a..bcb31ec 100644 --- a/consumer.py +++ b/consumer.py @@ -18,7 +18,7 @@ from src.epub_converter.epub_solver import EpubBook def local_convert_book(book_type: [DocxBook, EpubBook], book_id: int, main_logger: logging.Logger, params: dict): main_logger.info(f"Start processing book-{book_id}.") try: - json_file_path = "books/json/9781264269044.json" + json_file_path = "books/json/9781839211973.json" book = book_type(book_id=book_id, main_logger=main_logger, **params) book.conversion_local(json_file_path) except Exception as exc: diff --git a/src/book_solver.py b/src/book_solver.py index 7ca8b28..9901268 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -168,7 +168,7 @@ class BookSolver: self.status_wrapper.set_processing() content_dict: Dict[str, List[Dict[Union[str, List]]]] = self.get_converted_book() [os.remove(path) for path in [self.book_path, - # self.preset_path + self.preset_path ]] self.book_logger.log("Beginning of processing .json output.") self.status_wrapper.set_generating() diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 9193224..b6531f7 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -80,7 +80,7 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = f"../../books/docx/3cd6f561b8d7ee6a510c783784c9d018.docx" + docx_file_path = f"../../books/docx/Land_Use_Law_book_2023_(new).docx" book_logger = BookLogger(name="epub") book_logger.configure_book_logger(book_id=docx_file_path.split("/")[-1]) @@ -91,7 +91,7 @@ if __name__ == "__main__": html_converter = Docx2LibreHtml(file_path=docx_file_path, logger=book_logger, libre_locker=locker) html_preset_processor = HtmlPresetsProcessor( - logger=book_logger, preset_path="../../preset/docx_presets.json") + logger=book_logger, preset_path="../../preset/presets.json") style_preprocessor = StyleReader() html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=book_logger, html_preprocessor=html_preset_processor, style_preprocessor=style_preprocessor) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 2b949f3..42bcd0b 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -52,7 +52,7 @@ class EpubBook(BookSolver): if __name__ == "__main__": - epub_file_path = f"../../books/epub/9781284127362.epub" + epub_file_path = f"../../books/epub/9781801815093.epub" logger_object = BookLogger(name="epub") logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1]) diff --git a/src/html_presets_processor.py b/src/html_presets_processor.py index e0ed486..65a447c 100644 --- a/src/html_presets_processor.py +++ b/src/html_presets_processor.py @@ -55,21 +55,24 @@ class HtmlPresetsProcessor: names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]] values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]] attr_conditions: dict[str, re] = dict(zip(names, values)) - for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]], - attr_conditions): - found_tags.append(tag) + found_tags = kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]], + attr_conditions) return len(found_tags) != 0, list(found_tags) @staticmethod def _tags_with_text_condition(**kwargs): - # find all tags that are in List of tags and tags that contains required text - found_tags: list[Tag] = list() - for tag in kwargs["body_tag"].find_all( - lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))", - t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]), - t.text)): - found_tags.append(tag) - return len(found_tags) != 0, list(found_tags) + # find all tags that are in List of tags and tags that contain required text + found_tags = kwargs["body_tag"].find_all( + lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))", t.name) and re.search( + re.compile(kwargs["rule"]["condition"]["text"]), t.text)) + + # Get the parent tags in the tag_list + parent_tags = set([tag.find_parent() for tag in found_tags if tag.find_parent()]) + + # Filter out tags that are parents to other tags + filtered_tags = [tag for tag in found_tags if tag not in parent_tags] + + return len(filtered_tags) != 0, list(filtered_tags) @staticmethod def _wrap_tag(**kwargs): @@ -308,6 +311,8 @@ class HtmlPresetsProcessor: if len(conditions_on_tag) > 1 and found_tags: # tags satisfying all conditions((more than 1 condition) found_tags = [tag for tag in found_tags if found_tags.count(tag) == len(conditions_on_tag)] + + # Make an action with necessary tags for found_tag in found_tags: action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule) else: