Merge processing tags [Docx, Epub]

2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -5,6 +5,7 @@ from threading import Event

 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
+from src.html_preprocessor import HtmlPreprocessor
 from src.style_preprocessor import StylePreprocessor
 from src.docx_converter.docx2libre_html import Docx2LibreHTML
 from src.docx_converter.html_docx_processor import HTMLDocxProcessor
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
-            style_processor = StylePreprocessor()
-            parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
-                                       logger=self.logger_object, style_processor=style_processor)
-            bs_tags, footnotes, top_level_headers = parser.process_html(
+            html_preprocessor = HtmlPreprocessor(
+                logger=self.logger_object, preset_path="presets/docx_presets.json")
+            style_preprocessor = StylePreprocessor()
+            html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
+                                               logger=self.logger_object,
+                                               html_preprocessor=html_preprocessor,
+                                               style_preprocessor=style_preprocessor)
+            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception as exc:
            self.logger_object.log(
@@ -84,10 +89,12 @@ if __name__ == "__main__":
    html_converter = Docx2LibreHTML(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)

-    css_processor = StylePreprocessor()
-    parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
-                               style_processor=css_processor, preset_path="../../presets/docx_presets.json")
-    content, footnotes, top_level_headers = parser.process_html(
+    html_preprocessor = HtmlPreprocessor(
+        logger=logger_object, preset_path="../../presets/docx_presets.json")
+    style_preprocessor = StylePreprocessor()
+    html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
+                                       html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
+    content, footnotes, top_level_headers = html_processor.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)

    json_converter = LibreHTML2JSONConverter(