diff --git a/src/docx_solver.py b/src/docx_solver.py index a41d567..1bd9475 100644 --- a/src/docx_solver.py +++ b/src/docx_solver.py @@ -18,7 +18,7 @@ class DocxBook(BookSolver): super().__init__(book_id, access, main_logger, logging_format) self.book_type = 'docx' self.html_path = html_path # path to html file, file appears after libre-conversion - self.libra_locker: Event() = libra_locker + self.libra_locker: Event() = libra_locker # guards the critical section so that only one thread uses LibreOffice at a time def _libra_run(self, out_dir_path): command = ['libreoffice', '--headless', @@ -124,30 +124,13 @@ class DocxBook(BookSolver): self.write_to_json(content_dict) self.write_html_from_list(parser.body_tag) - def test_conversion(self): - self.logger_object.log('Beginning of the test.') - - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - folder_path = os.path.join(folder_path, f'docx') - file_path = os.path.join(folder_path, f'{self.book_id}.docx') - self.file_path = pathlib.Path(file_path) - self.logger_object.log(f'Test docx path: {self.file_path}') - - self.convert_doc_to_html() - self.check_output_directory() - - html_soup = self.read_html() - parser = HTMLPreprocessor(html_soup, self.logger_object) - content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) - - json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper) - content_dict = json_converter.convert_to_dict() - - self.write_to_json(content_dict) - self.write_html_from_list(parser.body_tag) - self.logger_object.log('End of the test.') - def get_converted_book(self): + """ + 1. Convert docx to html with LibreOffice + 2. Parse and clean html, get list of tags, get footnotes + 3. 
Convert the flat line structure into a nested structure with JSONConverter + + """ self.convert_doc_to_html() self.check_output_directory() @@ -155,12 +138,12 @@ class DocxBook(BookSolver): self.logger_object.log('Beginning of processing .html file.') parser = HTMLPreprocessor(html_soup, self.logger_object) - content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) + bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) self.logger_object.log('Beginning of processing json output.') self.status_wrapper.set_generating() - json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper) + json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper) content_dict = json_converter.convert_to_dict() return content_dict