forked from LiveCarta/BookConverter
LAW-6736|Fix duplicate parent-child tags in attr condition
This commit is contained in:
@@ -18,7 +18,7 @@ from src.epub_converter.epub_solver import EpubBook
|
|||||||
def local_convert_book(book_type: [DocxBook, EpubBook], book_id: int, main_logger: logging.Logger, params: dict):
|
def local_convert_book(book_type: [DocxBook, EpubBook], book_id: int, main_logger: logging.Logger, params: dict):
|
||||||
main_logger.info(f"Start processing book-{book_id}.")
|
main_logger.info(f"Start processing book-{book_id}.")
|
||||||
try:
|
try:
|
||||||
json_file_path = "books/json/9781264269044.json"
|
json_file_path = "books/json/9781839211973.json"
|
||||||
book = book_type(book_id=book_id, main_logger=main_logger, **params)
|
book = book_type(book_id=book_id, main_logger=main_logger, **params)
|
||||||
book.conversion_local(json_file_path)
|
book.conversion_local(json_file_path)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|||||||
@@ -168,7 +168,7 @@ class BookSolver:
|
|||||||
self.status_wrapper.set_processing()
|
self.status_wrapper.set_processing()
|
||||||
content_dict: Dict[str, List[Dict[Union[str, List]]]] = self.get_converted_book()
|
content_dict: Dict[str, List[Dict[Union[str, List]]]] = self.get_converted_book()
|
||||||
[os.remove(path) for path in [self.book_path,
|
[os.remove(path) for path in [self.book_path,
|
||||||
# self.preset_path
|
self.preset_path
|
||||||
]]
|
]]
|
||||||
self.book_logger.log("Beginning of processing .json output.")
|
self.book_logger.log("Beginning of processing .json output.")
|
||||||
self.status_wrapper.set_generating()
|
self.status_wrapper.set_generating()
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
docx_file_path = f"../../books/docx/3cd6f561b8d7ee6a510c783784c9d018.docx"
|
docx_file_path = f"../../books/docx/Land_Use_Law_book_2023_(new).docx"
|
||||||
|
|
||||||
book_logger = BookLogger(name="epub")
|
book_logger = BookLogger(name="epub")
|
||||||
book_logger.configure_book_logger(book_id=docx_file_path.split("/")[-1])
|
book_logger.configure_book_logger(book_id=docx_file_path.split("/")[-1])
|
||||||
@@ -91,7 +91,7 @@ if __name__ == "__main__":
|
|||||||
html_converter = Docx2LibreHtml(file_path=docx_file_path,
|
html_converter = Docx2LibreHtml(file_path=docx_file_path,
|
||||||
logger=book_logger, libre_locker=locker)
|
logger=book_logger, libre_locker=locker)
|
||||||
html_preset_processor = HtmlPresetsProcessor(
|
html_preset_processor = HtmlPresetsProcessor(
|
||||||
logger=book_logger, preset_path="../../preset/docx_presets.json")
|
logger=book_logger, preset_path="../../preset/presets.json")
|
||||||
style_preprocessor = StyleReader()
|
style_preprocessor = StyleReader()
|
||||||
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=book_logger,
|
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=book_logger,
|
||||||
html_preprocessor=html_preset_processor, style_preprocessor=style_preprocessor)
|
html_preprocessor=html_preset_processor, style_preprocessor=style_preprocessor)
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class EpubBook(BookSolver):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
epub_file_path = f"../../books/epub/9781284127362.epub"
|
epub_file_path = f"../../books/epub/9781801815093.epub"
|
||||||
|
|
||||||
logger_object = BookLogger(name="epub")
|
logger_object = BookLogger(name="epub")
|
||||||
logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1])
|
logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1])
|
||||||
|
|||||||
@@ -55,21 +55,24 @@ class HtmlPresetsProcessor:
|
|||||||
names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]]
|
names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]]
|
||||||
values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]]
|
values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]]
|
||||||
attr_conditions: dict[str, re] = dict(zip(names, values))
|
attr_conditions: dict[str, re] = dict(zip(names, values))
|
||||||
for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]],
|
found_tags = kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]],
|
||||||
attr_conditions):
|
attr_conditions)
|
||||||
found_tags.append(tag)
|
|
||||||
return len(found_tags) != 0, list(found_tags)
|
return len(found_tags) != 0, list(found_tags)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _tags_with_text_condition(**kwargs):
|
def _tags_with_text_condition(**kwargs):
|
||||||
# find all tags that are in List of tags and tags that contains required text
|
# find all tags that are in List of tags and tags that contain required text
|
||||||
found_tags: list[Tag] = list()
|
found_tags = kwargs["body_tag"].find_all(
|
||||||
for tag in kwargs["body_tag"].find_all(
|
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))", t.name) and re.search(
|
||||||
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))",
|
re.compile(kwargs["rule"]["condition"]["text"]), t.text))
|
||||||
t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]),
|
|
||||||
t.text)):
|
# Get the parent tags in the tag_list
|
||||||
found_tags.append(tag)
|
parent_tags = set([tag.find_parent() for tag in found_tags if tag.find_parent()])
|
||||||
return len(found_tags) != 0, list(found_tags)
|
|
||||||
|
# Filter out tags that are parents to other tags
|
||||||
|
filtered_tags = [tag for tag in found_tags if tag not in parent_tags]
|
||||||
|
|
||||||
|
return len(filtered_tags) != 0, list(filtered_tags)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _wrap_tag(**kwargs):
|
def _wrap_tag(**kwargs):
|
||||||
@@ -308,6 +311,8 @@ class HtmlPresetsProcessor:
|
|||||||
if len(conditions_on_tag) > 1 and found_tags:
|
if len(conditions_on_tag) > 1 and found_tags:
|
||||||
# tags satisfying all conditions((more than 1 condition)
|
# tags satisfying all conditions((more than 1 condition)
|
||||||
found_tags = [tag for tag in found_tags if found_tags.count(tag) == len(conditions_on_tag)]
|
found_tags = [tag for tag in found_tags if found_tags.count(tag) == len(conditions_on_tag)]
|
||||||
|
|
||||||
|
# Make an action with necessary tags
|
||||||
for found_tag in found_tags:
|
for found_tag in found_tags:
|
||||||
action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
|
action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user