forked from LiveCarta/BookConverter
epub converter: prettify docx_solver.py
This commit is contained in:
@@ -18,7 +18,7 @@ class DocxBook(BookSolver):
|
||||
super().__init__(book_id, access, main_logger, logging_format)
|
||||
self.book_type = 'docx'
|
||||
self.html_path = html_path # path to html file, file appears after libre-conversion
|
||||
self.libra_locker: Event() = libra_locker
|
||||
self.libra_locker: Event() = libra_locker # critical section for occupying libreoffice by one thread
|
||||
|
||||
def _libra_run(self, out_dir_path):
|
||||
command = ['libreoffice', '--headless',
|
||||
@@ -124,30 +124,13 @@ class DocxBook(BookSolver):
|
||||
self.write_to_json(content_dict)
|
||||
self.write_html_from_list(parser.body_tag)
|
||||
|
||||
def test_conversion(self):
|
||||
self.logger_object.log('Beginning of the test.')
|
||||
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
folder_path = os.path.join(folder_path, f'docx')
|
||||
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
|
||||
self.file_path = pathlib.Path(file_path)
|
||||
self.logger_object.log(f'Test docx path: {self.file_path}')
|
||||
|
||||
self.convert_doc_to_html()
|
||||
self.check_output_directory()
|
||||
|
||||
html_soup = self.read_html()
|
||||
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
||||
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||
|
||||
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
self.write_to_json(content_dict)
|
||||
self.write_html_from_list(parser.body_tag)
|
||||
self.logger_object.log('End of the test.')
|
||||
|
||||
def get_converted_book(self):
|
||||
"""
|
||||
1. Convert docx to html with LibreOffice
|
||||
2. Parse and clean html, get list of tags, get footnotes
|
||||
3. Parse from line structure to nested structure with JSONConverter
|
||||
|
||||
"""
|
||||
self.convert_doc_to_html()
|
||||
self.check_output_directory()
|
||||
|
||||
@@ -155,12 +138,12 @@ class DocxBook(BookSolver):
|
||||
self.logger_object.log('Beginning of processing .html file.')
|
||||
|
||||
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
||||
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||
bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||
|
||||
self.logger_object.log('Beginning of processing json output.')
|
||||
self.status_wrapper.set_generating()
|
||||
|
||||
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||
json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
return content_dict
|
||||
|
||||
|
||||
Reference in New Issue
Block a user