From 5a57bf4d32b34237ab5578e89acf999a838fe48d Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 13 Oct 2022 11:05:32 +0300 Subject: [PATCH] docx presets improves --- presets/docx_presets.json | 86 +++++++++++++++++++++++++------ src/docx_converter/docx_solver.py | 4 +- 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/presets/docx_presets.json b/presets/docx_presets.json index fed2d62..96f861b 100644 --- a/presets/docx_presets.json +++ b/presets/docx_presets.json @@ -12,9 +12,13 @@ "name": "id", "value": "^Table of Contents\\d+" } - ] + ], + "text": null }, - "tag_to_wrap": "TOC" + "tag_to_wrap": { + "name": "TOC", + "attrs": [] + } } ] }, @@ -35,7 +39,8 @@ "name": "id", "value": "^Table of Contents\\d+" } - ] + ], + "text": null } } ] @@ -46,7 +51,10 @@ { "tags": ["^h[6-9]$"], "condition": null, - "tag_to_replace": "p" + "tag_to_replace": { + "name": "p", + "attrs": null + } }, { "tags": ["^div$"], @@ -58,9 +66,13 @@ "name": "style", "value": "column-count: 2" } - ] + ], + "text": null }, - "tag_to_replace": "p" + "tag_to_replace": { + "name": "p", + "attrs": null + } } ] }, @@ -68,12 +80,14 @@ "preset_name": "attr_replacer", "rules": [ { - "attr": { - "name": "style", - "value": "column-count: 2" - }, + "tags": ["^p$"], "condition": { - "tags": ["^p$"] + "attrs": [ + { + "name": "style", + "value": "column-count: 2" + } + ] }, "attr_to_replace": { "name": "class", @@ -103,7 +117,8 @@ "name": "face", "value": "^Times New Roman[\\w, ]+$" } - ] + ], + "text": null } }, { @@ -111,7 +126,8 @@ "condition": { "parent_tags": ":is(li)", "child_tags": null, - "attrs": null + "attrs": null, + "text": null } }, { @@ -124,7 +140,8 @@ "name": "name", "value": "_GoBack" } - ] + ], + "text": null } }, { @@ -132,7 +149,8 @@ "condition": { "parent_tags": ":is(a)", "child_tags": ":is(a)", - "attrs": null + "attrs": null, + "text": null } }, { @@ -140,7 +158,8 @@ "condition": { "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)", "child_tags": null, - "attrs": null + "attrs": null, + "text": null } }, { @@ -148,5 +167,40 @@ "condition": null } ] + }, + { + "preset_name": "inserter", + "rules": [ + { + "tags": ["^p$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": null, + "text": "\\$\\$[\\s\\S]*?\\$\\$" + }, + "tag_to_insert": { + "name": "span", + "attrs": [ + { + "name": "class", + "value": "math-tex" + } + ] + } + } + ] + }, + { + "preset_name": "text_replacer", + "rules": [ + { + "tags": ["^p$"], + "condition": { + "text": "(\\\\nonumber\\\\\\\\\\\\noalign{\\\\pagebreak}[\\s\\S]*?)\\\\" + }, + "text_to_replace": "\\\\" + } + ] } ] diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 3cd324d..abb13e3 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -80,7 +80,8 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = "../../books/docx/AmericanGovernment3e-WEB.docx" + + docx_file_path = "../../books/docx/output.docx" logger_object = BookLogger( name="docx", book_id=docx_file_path.split("/")[-1]) locker = Event() @@ -88,7 +89,6 @@ if __name__ == "__main__": html_converter = Docx2LibreHtml(file_path=docx_file_path, logger=logger_object, libre_locker=locker) - html_preprocessor = HtmlPresetsProcessor( logger=logger_object, preset_path="../../presets/docx_presets.json") style_preprocessor = StyleReader()