forked from LiveCarta/BookConverter
epub converter: prettify docx_solver.py
This commit is contained in:
@@ -18,7 +18,7 @@ class DocxBook(BookSolver):
|
|||||||
super().__init__(book_id, access, main_logger, logging_format)
|
super().__init__(book_id, access, main_logger, logging_format)
|
||||||
self.book_type = 'docx'
|
self.book_type = 'docx'
|
||||||
self.html_path = html_path # path to html file, file appears after libre-conversion
|
self.html_path = html_path # path to html file, file appears after libre-conversion
|
||||||
self.libra_locker: Event() = libra_locker
|
self.libra_locker: Event() = libra_locker # guards the critical section so that only one thread uses libreoffice at a time
|
||||||
|
|
||||||
def _libra_run(self, out_dir_path):
|
def _libra_run(self, out_dir_path):
|
||||||
command = ['libreoffice', '--headless',
|
command = ['libreoffice', '--headless',
|
||||||
@@ -124,30 +124,13 @@ class DocxBook(BookSolver):
|
|||||||
self.write_to_json(content_dict)
|
self.write_to_json(content_dict)
|
||||||
self.write_html_from_list(parser.body_tag)
|
self.write_html_from_list(parser.body_tag)
|
||||||
|
|
||||||
def test_conversion(self):
|
|
||||||
self.logger_object.log('Beginning of the test.')
|
|
||||||
|
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
||||||
folder_path = os.path.join(folder_path, f'docx')
|
|
||||||
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
|
|
||||||
self.file_path = pathlib.Path(file_path)
|
|
||||||
self.logger_object.log(f'Test docx path: {self.file_path}')
|
|
||||||
|
|
||||||
self.convert_doc_to_html()
|
|
||||||
self.check_output_directory()
|
|
||||||
|
|
||||||
html_soup = self.read_html()
|
|
||||||
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
|
||||||
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
|
||||||
|
|
||||||
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
|
||||||
content_dict = json_converter.convert_to_dict()
|
|
||||||
|
|
||||||
self.write_to_json(content_dict)
|
|
||||||
self.write_html_from_list(parser.body_tag)
|
|
||||||
self.logger_object.log('End of the test.')
|
|
||||||
|
|
||||||
def get_converted_book(self):
|
def get_converted_book(self):
|
||||||
|
"""
|
||||||
|
1. Convert docx to html with LibreOffice
|
||||||
|
2. Parse and clean html, get list of tags, get footnotes
|
||||||
|
3. Parse from line structure to nested structure with JSONConverter
|
||||||
|
|
||||||
|
"""
|
||||||
self.convert_doc_to_html()
|
self.convert_doc_to_html()
|
||||||
self.check_output_directory()
|
self.check_output_directory()
|
||||||
|
|
||||||
@@ -155,12 +138,12 @@ class DocxBook(BookSolver):
|
|||||||
self.logger_object.log('Beginning of processing .html file.')
|
self.logger_object.log('Beginning of processing .html file.')
|
||||||
|
|
||||||
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
||||||
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||||
|
|
||||||
self.logger_object.log('Beginning of processing json output.')
|
self.logger_object.log('Beginning of processing json output.')
|
||||||
self.status_wrapper.set_generating()
|
self.status_wrapper.set_generating()
|
||||||
|
|
||||||
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user