Add inline style processor [Docx]

2022-09-02 14:47:06 +03:00
parent b97c5d8371
commit dfdf6bc7e9
3 changed files with 50 additions and 27 deletions
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -5,19 +5,20 @@ from threading import Event

 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
+from src.style_preprocessor import StylePreprocessor
 from src.docx_converter.docx2libre_html import Docx2LibreHTML
-from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
+from src.docx_converter.html_docx_processor import HTMLDocxProcessor
 from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter


 class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver"""

-    def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker=None):
+    def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker: Event = None):
        super().__init__(book_id, access, main_logger)
        self.book_type = "docx"
        # critical section for occupying libreoffice by one thread
-        self.libre_locker: Event() = libre_locker
+        self.libre_locker = libre_locker

    def get_converted_book(self):
        """
@@ -47,8 +48,9 @@ class DocxBook(BookSolver):

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
-            parser = HTMLDocxPreprocessor(
-                html_converter.html_soup, self.logger_object)
+            style_processor = StylePreprocessor()
+            parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
+                                       logger=self.logger_object, style_processor=style_processor)
            bs_tags, footnotes, top_level_headers = parser.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception as exc:
@@ -73,7 +75,7 @@ class DocxBook(BookSolver):


 if __name__ == "__main__":
-    docx_file_path = "../../books/docx/music_inquiry.docx"
+    docx_file_path = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx"
    logger_object = BookLogger(
        name="docx", book_id=docx_file_path.split("/")[-1])
    locker = Event()
@@ -82,7 +84,9 @@ if __name__ == "__main__":
    html_converter = Docx2LibreHTML(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)

-    parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
+    css_processor = StylePreprocessor()
+    parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
+                               style_processor=css_processor, preset_path="../../presets/docx_presets.json")
    content, footnotes, top_level_headers = parser.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)