epub converter: prettify docx_solver.py

2021-09-06 18:18:29 +03:00
parent 60f2ceb0f4
commit 8e42c49e1b
1 changed file with 9 additions and 26 deletions
--- a/src/docx_solver.py
+++ b/src/docx_solver.py
@@ -18,7 +18,7 @@ class DocxBook(BookSolver):
        super().__init__(book_id, access, main_logger, logging_format)
        self.book_type = 'docx'
        self.html_path = html_path  # path to html file, file appears after libre-conversion
-        self.libra_locker: Event() = libra_locker
+        self.libra_locker: Event() = libra_locker  # critical section for occupying libreoffice by one thread
    def _libra_run(self, out_dir_path):
        command = ['libreoffice', '--headless',
@@ -124,30 +124,13 @@ class DocxBook(BookSolver):
        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)
    def test_conversion(self):
        """
        Run the docx pipeline end to end on a local test file and write the results.

        The input file is looked up at
        <two directories above this module>/docx/<book_id>.docx
        and stored in self.file_path before the conversion starts.

        1. Convert the docx to html and check the output directory
        2. Parse the html with HTMLPreprocessor
        3. Build the output dict with JSONConverter
        4. Write the dict via write_to_json and the parser's body_tag
           via write_html_from_list
        """
        self.logger_object.log('Beginning of the test.')

        # directory two levels above the folder that contains this module
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        docx_file = os.path.join(project_root, 'docx', f'{self.book_id}.docx')
        self.file_path = pathlib.Path(docx_file)
        self.logger_object.log(f'Test docx path: {self.file_path}')

        self.convert_doc_to_html()
        self.check_output_directory()

        parser = HTMLPreprocessor(self.read_html(), self.logger_object)
        bs_tags, footnotes, top_level_headers = parser.process_html(
            self.access, self.html_path, self.book_id
        )

        content_dict = JSONConverter(
            bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper
        ).convert_to_dict()

        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)
        self.logger_object.log('End of the test.')
    def get_converted_book(self):
        """
        1. Convert docx to html with libra office
        2. Parse and clean html, get list of tags, get footnotes
        3. Parse from line structure to nested structure with JSONConverter
        """
        self.convert_doc_to_html()
        self.check_output_directory()
@@ -155,12 +138,12 @@ class DocxBook(BookSolver):
        self.logger_object.log('Beginning of processing .html file.')
        parser = HTMLPreprocessor(html_soup, self.logger_object)
-        content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
+        bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
        self.logger_object.log('Beginning of processing json output.')
        self.status_wrapper.set_generating()
-        json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
+        json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
        content_dict = json_converter.convert_to_dict()
        return content_dict