epub converter: prettify docx_solver.py

This commit is contained in:
shirshasa
2021-09-06 18:18:29 +03:00
parent 60f2ceb0f4
commit 8e42c49e1b

View File

@@ -18,7 +18,7 @@ class DocxBook(BookSolver):
super().__init__(book_id, access, main_logger, logging_format)
self.book_type = 'docx'
self.html_path = html_path # path to html file, file appears after libre-conversion
self.libra_locker: Event() = libra_locker
self.libra_locker: Event() = libra_locker # critical section for occupying libreoffice by one thread
def _libra_run(self, out_dir_path):
command = ['libreoffice', '--headless',
@@ -124,30 +124,13 @@ class DocxBook(BookSolver):
self.write_to_json(content_dict)
self.write_html_from_list(parser.body_tag)
def test_conversion(self):
    """
    Run the docx -> html -> json pipeline on a local test book.

    The input file is looked up as ``docx/<book_id>.docx`` inside the parent
    of the directory that contains this module; the resulting path is stored
    in ``self.file_path`` before the conversion starts.

    The converted content is passed to ``write_to_json`` and the parser's
    ``body_tag`` to ``write_html_from_list``. Nothing is returned.
    """
    self.logger_object.log('Beginning of the test.')

    # dirname() applied twice: first the module's directory, then its parent.
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    self.file_path = pathlib.Path(os.path.join(root_dir, 'docx', f'{self.book_id}.docx'))
    self.logger_object.log(f'Test docx path: {self.file_path}')

    self.convert_doc_to_html()
    self.check_output_directory()

    html_soup = self.read_html()
    parser = HTMLPreprocessor(html_soup, self.logger_object)
    bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)

    json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
    content_dict = json_converter.convert_to_dict()

    self.write_to_json(content_dict)
    self.write_html_from_list(parser.body_tag)
    self.logger_object.log('End of the test.')
def get_converted_book(self):
"""
1. Convert docx to html with libra office
2. Parse and clean html, get list of tags, get footnotes
3. Parse from line structure to nested structure with JSONConverter
"""
self.convert_doc_to_html()
self.check_output_directory()
@@ -155,12 +138,12 @@ class DocxBook(BookSolver):
self.logger_object.log('Beginning of processing .html file.')
parser = HTMLPreprocessor(html_soup, self.logger_object)
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
self.logger_object.log('Beginning of processing json output.')
self.status_wrapper.set_generating()
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
content_dict = json_converter.convert_to_dict()
return content_dict