import json
import codecs
import logging
from threading import Event

from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.html_preprocessor import HtmlPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.docx_converter.docx2libre_html import Docx2LibreHtml
from src.docx_converter.html_docx_processor import HtmlDocxProcessor
from src.docx_converter.libre_html2json_converter import LibreHtml2JsonConverter


class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver"""

    def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker: Event = None):
        """
        Parameters
        ----------
        book_id
            identifier of the book, forwarded to BookSolver
        access
            forwarded to BookSolver and to the conversion helpers
        main_logger
            forwarded to BookSolver
        libre_locker
            event shared between threads; passed to Docx2LibreHtml

        """
        super().__init__(book_id, access, main_logger)
        self.book_type = "docx"
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

    def _report_conversion_error(self, message: str) -> None:
        """Log message as an error, mirror it to the main log and set the error status."""
        self.logger_object.log(message, logging.ERROR)
        self.logger_object.log_error_to_main_log()
        self.status_wrapper.set_error()

    def get_converted_book(self):
        """
        Function
        Steps
        ----------
        1. Converts docx to html with LibreOffice
        2. Parses and cleans html, gets list of tags, gets footnotes
        3. Parses from line structure to nested structure with JSONConverter

        Returns
        ----------
        content_dict
            json for LiveCarta platform

        Raises
        ----------
        Exception
            any exception raised by one of the three steps is logged,
            the book status is set to error and the exception is re-raised

        """
        # 1. Converts docx to html with LibreOffice
        try:
            html_converter = Docx2LibreHtml(self.book_id, self.book_path, self.access,
                                            self.logger_object, self.libre_locker)
        except Exception:
            self._report_conversion_error(
                "Error has occurred while converting .docx to .html.")
            # bare raise re-raises the active exception with its traceback
            raise

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
            html_preprocessor = HtmlPreprocessor(
                logger=self.logger_object, preset_path="presets/docx_presets.json")
            style_preprocessor = StylePreprocessor()
            html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
                                               logger=self.logger_object,
                                               html_preprocessor=html_preprocessor,
                                               style_preprocessor=style_preprocessor)
            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception:
            self._report_conversion_error(
                "Error has occurred while processing .html")
            raise

        # 3. Parses from line structure to nested structure with JSONConverter
        try:
            json_converter = LibreHtml2JsonConverter(bs_tags, footnotes, top_level_headers,
                                                     self.logger_object)
            content_dict = json_converter.convert_to_dict()
        except Exception:
            self._report_conversion_error(
                "Error has occurred while converting .html to .json")
            raise

        return content_dict


if __name__ == "__main__":
    # Manual run: converts one local .docx file and writes the result as json.
    docx_file_path = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx"
    logger_object = BookLogger(
        name="docx", book_id=docx_file_path.split("/")[-1])
    locker = Event()
    locker.set()

    html_converter = Docx2LibreHtml(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)

    html_preprocessor = HtmlPreprocessor(
        logger=logger_object, preset_path="../../presets/docx_presets.json")
    style_preprocessor = StylePreprocessor()
    html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
                                       logger=logger_object,
                                       html_preprocessor=html_preprocessor,
                                       style_preprocessor=style_preprocessor)
    content, footnotes, top_level_headers = html_processor.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)

    json_converter = LibreHtml2JsonConverter(
        content, footnotes, top_level_headers, logger_object)
    content_dict = json_converter.convert_to_dict()

    # NOTE(review): replace() swaps every "docx" in the path, so the output
    # lands in ../../books/json/ - presumably intended; confirm.
    with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f:
        json.dump(content_dict, f, ensure_ascii=False)