import json
import codecs
from threading import Event
from typing import Optional

from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.docx_converter.docx2libre_html import Docx2LibreHTML
from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter


class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver"""

    def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
        """
        Parameters
        ----------
        book_id
            forwarded to BookSolver.__init__
        access
            forwarded to BookSolver.__init__
        main_logger
            forwarded to BookSolver.__init__
        libre_locker
            threading.Event shared between threads (may be None),
            handed to Docx2LibreHTML during conversion

        """
        super().__init__(book_id, access, main_logger)
        self.book_type = 'docx'
        # critical section for occupying libreoffice by one thread
        self.libre_locker: Optional[Event] = libre_locker

    def get_converted_book(self):
        """
        Function
        Steps
        ----------
        1. Converts docx to html with LibreOffice
        2. Parses and cleans html, gets list of tags, gets footnotes
        3. Parses from line structure to nested structure with JSONConverter

        Returns
        ----------
        content_dict
            json for LiveCarta platform

        """
        # NOTE(review): file_path, logger_object and status_wrapper are not set
        # in this class - presumably provided by BookSolver; confirm.

        # 1. Converts docx to html with LibreOffice
        html_converter = Docx2LibreHTML(
            self.book_id,
            self.file_path,
            self.access,
            self.logger_object,
            self.status_wrapper,
            self.libre_locker,
        )
        # TODO presets

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        parser = HTMLDocxPreprocessor(
            html_converter.html_soup, self.logger_object)
        bs_tags, footnotes, top_level_headers = parser.process_html(
            self.access, html_converter.html_path, self.book_id)

        # 3. Parses from line structure to nested structure with JSONConverter
        json_converter = LibreHTML2JSONConverter(
            bs_tags,
            footnotes,
            top_level_headers,
            self.logger_object,
            self.status_wrapper,
        )
        content_dict = json_converter.convert_to_dict()
        return content_dict


if __name__ == "__main__":
    # Manual run: convert one local .docx file and dump the result as json.
    docx_file_path = '../../docx/music_inquiry.docx'
    logger_object = BookLogger(
        name='docx', book_id=docx_file_path.split('/')[-1])

    html_converter = Docx2LibreHTML(file_path=docx_file_path)

    parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
    # process_html takes (access, html_path, book_id) positionally, as in
    # DocxBook.get_converted_book; there is no access object in a local run,
    # so None is passed to keep html_path in its own slot.
    content, footnotes, top_level_headers = parser.process_html(
        None, html_converter.html_path)

    json_converter = LibreHTML2JSONConverter(
        content, footnotes, top_level_headers, logger_object)
    content_dict = json_converter.convert_to_dict()

    # str.replace swaps every 'docx' occurrence, so both the directory name
    # and the file extension become 'json'.
    with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
        json.dump(content_dict, f, ensure_ascii=False)