import codecs
import json
import logging
import os
import pathlib
import subprocess
from subprocess import PIPE
from threading import Event

from bs4 import BeautifulSoup

from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
from html_preprocessor import HTMLPreprocessor
from json_postprocessor import JSONConverter


class Book:
    """
        Drives the conversion of one book: .docx -> .html (via LibreOffice) -> .json.

        The instance keeps the paths of the intermediate files, a per-book logger
        and a status wrapper that is switched to "error" whenever a step fails.
    """

    def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None,
                 main_logger=None, libra_locker=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """
            :param book_id: identifier of the book; used in file names and log names.
            :param access: object used to talk to the server (``url``, ``get_doc``, ``send_book``).
            :param docx_path: path to docx file, appears after downloading from server.
            :param html_path: path to html file, file appears after libre-conversion.
            :param output_path: path to json file.
            :param main_logger: logger handed over to ``BookLogger``.
            :param libra_locker: ``threading.Event`` shared between workers; it is cleared
                                 while a LibreOffice conversion runs and set again afterwards.
            :param logging_format: format string handed over to ``BookLogger``.
        """
        self.book_id = book_id
        self.access = access
        self.docx_path = docx_path  # path to docx file, appears after downloading from server
        self.html_path = html_path  # path to html file, file appears after libre-conversion
        self.output_path = output_path  # path to json file
        self.libra_locker: Event = libra_locker

        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)

        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    @staticmethod
    def _project_root():
        """ Return the directory two levels above this source file. """
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def save_docx(self, content):
        """
            Save binary content of file to .docx.

            The file is written to ``<project root>/docx/<book_id>/<book_id>.docx`` and
            ``self.docx_path`` is updated to point at it.

            :param content: binary content of the file.
        """
        folder_path = os.path.join(self._project_root(), f'docx/{self.book_id}')
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        file_path = os.path.join(folder_path, f'{self.book_id}.docx')
        try:
            with open(file_path, 'wb+') as file:
                file.write(content)
            self.logger_object.log(f'File was saved to folder: {folder_path}.')
        except Exception:
            self.logger_object.log("Error in writing docx file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        self.docx_path = pathlib.Path(file_path)

    def get_docx(self):
        """
            Method for getting and saving book from queue.

            :raises FileNotFoundError: re-raised after being logged; any other exception
                                       propagates unchanged.
        """
        try:
            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_docx(content)
        except FileNotFoundError:
            self.logger_object.log("Can't get docx from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def _libra_run(self, out_dir_path):
        """
            Run headless LibreOffice to convert ``self.docx_path`` to html.

            The return code, stdout and stderr of the process are only logged (DEBUG);
            a non-zero return code does not raise here.

            :param out_dir_path: directory passed to LibreOffice as ``--outdir``.
        """
        command = ['libreoffice', '--headless',
                   '--convert-to', 'html', str(self.docx_path),
                   '--outdir', str(out_dir_path)]
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
        self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)

    def convert_doc_to_html(self):
        """
            Method for convert .docx document to .html file.

            The html is written to ``<project root>/html/<book_id>/<book_id>.html`` and
            ``self.html_path`` is updated to point at it.

            :raises FileNotFoundError: if the docx file is missing before the conversion
                                       or the html file is missing after it.
        """
        self.logger_object.log(f'File - {self.docx_path}.')
        print(f'{self.docx_path}')
        self.logger_object.log('Beginning of conversion from .docx to .html.')

        # Opening the file is used as the existence check for the input.
        try:
            with open(self.docx_path):
                pass
        except FileNotFoundError:
            self.logger_object.log('Invalid path to input data.', logging.ERROR)
            self.status_wrapper.set_error()
            raise

        out_dir_path = os.path.join(self._project_root(), f'html/{self.book_id}')
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            if self.libra_locker.is_set():
                self.libra_locker.clear()
                self.logger_object.log('Got flag...', logging.DEBUG)
                try:
                    self._libra_run(out_dir_path)
                finally:
                    # Always hand the flag back, otherwise waiting workers block forever.
                    self.libra_locker.set()
                self.logger_object.log('Cleared flag...', logging.DEBUG)
            else:
                # Keep waiting until this book has actually been converted; leaving the
                # loop merely because the flag became set would skip the conversion.
                while True:
                    self.logger_object.log('Waiting for libra...', logging.DEBUG)
                    if self.libra_locker.wait(50) and self.libra_locker.is_set():
                        self.libra_locker.clear()
                        self.logger_object.log('Got flag!', logging.DEBUG)
                        try:
                            self._libra_run(out_dir_path)
                        finally:
                            self.libra_locker.set()
                        break
        except Exception:
            self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.html_path = pathlib.Path(os.path.join(out_dir_path, f'{self.book_id}.html'))

        # Opening the file is used as the existence check for the conversion result.
        try:
            with open(self.html_path):
                pass
        except FileNotFoundError:
            self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.logger_object.log('End of conversion from .docx to .html.')
        self.logger_object.log(f'Input file path after conversion: {self.html_path}.')

    def check_output_directory(self):
        """
            Make sure the json output file and its parent directory exist.

            When no output path was given, ``<project root>/json/<book_id>.json`` is used.
            ``self.output_path`` is a ``pathlib.Path`` afterwards.
        """
        if self.output_path is None:
            self.output_path = os.path.join(self._project_root(), f'json/{self.book_id}.json')

        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def read_html(self):
        """
            Method for reading .html file into beautiful soup tag.

            :return: ``BeautifulSoup`` object built from ``self.html_path`` with the lxml parser.
            :raises FileNotFoundError: if the html file does not exist.
        """
        try:
            with open(self.html_path, 'r', encoding='utf8') as html_file:
                html_text = html_file.read()
            self.logger_object.log('HTML for book has been loaded.')
        except FileNotFoundError:
            self.logger_object.log('There is no html to process. '
                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        return BeautifulSoup(html_text, features='lxml')

    def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
        """
            Write the prettified ``body_tag`` to a file for manual inspection.

            :param body_tag: object providing ``prettify()`` (the parser's body tag).
            :param file_name: target path, relative to the project root.
        """
        file_path = pathlib.Path(os.path.join(self._project_root(), file_name))
        with open(file_path, 'w', encoding='utf-8') as f_out:
            f_out.write(body_tag.prettify())
        self.logger_object.log(f'Check final prettified html: {file_name}.')

    def write_to_json(self, content: dict):
        """
            Dump ``content`` as UTF-8 json to ``self.output_path``.

            Failures are logged and NOT re-raised: writing the local copy is best effort.

            :param content: json-serializable dictionary.
        """
        try:
            with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            self.logger_object.log(f'Error has occurred while writing json file. {exc}', logging.ERROR)

    def send_json_content(self, content: dict):
        """
            Send the converted book to the server through ``self.access.send_book``.

            :param content: dictionary with the converted book.
            :raises Exception: whatever ``send_book`` raised, after logging it and
                               setting the error status.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

    def _process_html(self, html_soup):
        """
            Run ``HTMLPreprocessor`` over the soup.

            :return: tuple ``(parser, content, footnotes, top_level_headers)``.
        """
        parser = HTMLPreprocessor(html_soup, self.logger_object)
        content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
        return parser, content, footnotes, top_level_headers

    def _build_content_dict(self, content, footnotes, top_level_headers):
        """ Turn the preprocessor output into the final dictionary via ``JSONConverter``. """
        json_converter = JSONConverter(content, footnotes, top_level_headers,
                                       self.logger_object, self.status_wrapper)
        return json_converter.convert_to_dict()

    def convert_from_html(self):
        """
            Convert an already existing html file (``self.html_path``) to json.

            Also writes the prettified processed html for inspection.
        """
        html_soup = self.read_html()
        parser, content, footnotes, top_level_headers = self._process_html(html_soup)
        content_dict = self._build_content_dict(content, footnotes, top_level_headers)

        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)

    def test_conversion(self):
        """
            Local test run: convert ``<project root>/docx/<book_id>.docx`` to html and json
            without contacting the server for the file or sending the result back.
        """
        self.logger_object.log('Beginning of the test.')

        file_path = os.path.join(self._project_root(), 'docx', f'{self.book_id}.docx')
        self.docx_path = pathlib.Path(file_path)
        self.logger_object.log(f'Test docx path: {self.docx_path}')

        self.convert_doc_to_html()
        self.check_output_directory()

        html_soup = self.read_html()
        parser, content, footnotes, top_level_headers = self._process_html(html_soup)
        content_dict = self._build_content_dict(content, footnotes, top_level_headers)

        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)
        self.logger_object.log('End of the test.')

    def conversion(self):
        """
            Full pipeline: download the docx, convert it to html, process the html,
            build the json, save it locally and send it to the server.

            :raises Exception: any failure is logged, the status is set to error and
                               the exception is re-raised.
        """
        try:
            self.logger_object.log('Beginning of conversion from .docx to .json.')
            self.get_docx()
            self.status_wrapper.set_processing()
            self.convert_doc_to_html()
            self.check_output_directory()

            html_soup = self.read_html()
            self.logger_object.log('Beginning of processing .html file.')
            _, content, footnotes, top_level_headers = self._process_html(html_soup)

            self.logger_object.log('Beginning of processing json output.')
            self.status_wrapper.set_generating()
            content_dict = self._build_content_dict(content, footnotes, top_level_headers)

            self.write_to_json(content_dict)
            self.send_json_content(content_dict)
            self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
        except Exception as exc:
            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
            self.logger_object.log_error_to_main_log(str(exc))
            self.status_wrapper.set_error()
            raise


if __name__ == "__main__":
    # Manual run: convert one pre-generated html chapter straight to json.
    folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    html_file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html'))
    out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json'))

    book = Book(html_path=html_file, output_path=out_path)
    book.convert_from_html()