diff --git a/src/book.py b/src/book.py
index d570ee3..e87bd9f 100644
--- a/src/book.py
+++ b/src/book.py
@@ -4,6 +4,9 @@ import logging
 import os
 import pathlib
 import re
+import subprocess
+from subprocess import PIPE
+from threading import Event, Lock
 from copy import copy
 from shutil import copyfile
 
@@ -33,11 +36,16 @@ class Book:
     SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
     HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
+    # Makes the test-and-clear of libra_locker atomic between Book instances of one process.
+    _LIBRA_GUARD = Lock()
 
-    def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
+    def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None, main_logger=None,
+                 libra_locker=None):
         self.book_id = book_id
         self.access = access
-        self.file_path = file_path
-        self.output_path = output_path
+        self.docx_path = docx_path  # path to docx file, appears after downloading from server
+        self.html_path = html_path  # path to html file, file appears after libre-conversion
+        self.output_path = output_path  # path to json file
+        self.libra_locker: Event = libra_locker  # shared flag, set means libreoffice is free; may be None
         self.main_logger = main_logger
         self.logger = None
 
@@ -53,8 +61,11 @@ class Book:
         assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
             "Length of headers doesn't match allowed levels."
 
-    def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
-                              logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
+    def configure_file_logger(self, name, attr_name='logger',
+                              filename='logs/book_log.log',
+                              filemode='w+',
+                              logging_level=logging.INFO,
+                              logging_format='%(asctime)s - %(message)s'):
         """
         Method for Logger configuration. Logger will write in file.
 
@@ -107,7 +118,8 @@ class Book:
         :param content: binary content of the file.
         """
         folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        folder_path = os.path.join(folder_path, 'docx')
+        folder_path = os.path.join(folder_path, f'docx/{self.book_id}')
+        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
         file_path = os.path.join(folder_path, f'{self.book_id}.docx')
 
         try:
@@ -119,7 +131,7 @@ class Book:
             self.log_error_to_main_log()
             raise exc
 
-        self.file_path = pathlib.Path(file_path)
+        self.docx_path = pathlib.Path(file_path)
 
     def get_docx(self):
         """
@@ -167,16 +179,44 @@ class Book:
             self.log_error_to_main_log()
             raise exc
 
+    def _libra_run(self, out_dir_path):
+        """
+        Method for running headless libreoffice on self.docx_path.
+
+        :param out_dir_path: directory where libreoffice puts the .html file.
+        """
+        command = ['libreoffice', '--headless',
+                   '--convert-to', 'html', f'{str(self.docx_path)}',
+                   '--outdir', f'{out_dir_path}']
+        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
+        self.log(f'STATUS book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
+        # subprocess.run does not raise on a non-zero exit code, so report stderr loudly in that case.
+        stderr_level = logging.ERROR if result.returncode else logging.DEBUG
+        self.log(f'ERROR book_{self.book_id}: {result.stderr}', stderr_level)
+
+    def _acquire_libra_flag(self):
+        """
+        Method for waiting until libra_locker is set and taking it over by clearing it.
+        """
+        while True:
+            # Test-and-clear must be atomic, otherwise two books may both see the flag set.
+            with self._LIBRA_GUARD:
+                if self.libra_locker.is_set():
+                    self.libra_locker.clear()
+                    return
+            self.log('Waiting for libra...', logging.DEBUG)
+            self.libra_locker.wait(50)
+
     def convert_doc_to_html(self):
         """
         Method for convert .docx document to .html file.
         """
-        self.log(f'File - {self.file_path}.')
-        print(f'{self.file_path}')
+        self.log(f'File - {self.docx_path}.')
+        print(f'{self.docx_path}')
         self.log('Beginning of conversion from .docx to .html.')
 
         try:
-            f = open(self.file_path)
+            f = open(self.docx_path)
             f.close()
         except FileNotFoundError as error:
             self.log('Invalid path to input data.', logging.ERROR)
@@ -185,21 +225,33 @@ class Book:
 
         folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
         out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
+        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
 
         try:
-            command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir {out_dir_path}'
-            os.system(command)
+            if self.libra_locker is None:
+                # No shared flag was supplied, so there is nothing to coordinate with.
+                self._libra_run(out_dir_path)
+            else:
+                self._acquire_libra_flag()
+                self.log('Got flag...', logging.DEBUG)
+                try:
+                    self._libra_run(out_dir_path)
+                finally:
+                    # Always hand the flag back, otherwise every other book waits forever.
+                    self.libra_locker.set()
+                    self.log('Released flag...', logging.DEBUG)
         except Exception as exc:
             self.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
             self.log_error_to_main_log()
             self.set_error_status()
             raise exc
 
-        out_dir_path = os.path.join(out_dir_path, f'{self.file_path.stem}.html')
-        self.file_path = pathlib.Path(out_dir_path)
+        # libreoffice names the result after the input file, not after the book id.
+        out_dir_path = os.path.join(out_dir_path, f'{pathlib.Path(self.docx_path).stem}.html')
+        self.html_path = pathlib.Path(out_dir_path)
 
         try:
-            f = open(self.file_path)
+            f = open(self.html_path)
             f.close()
         except FileNotFoundError as exc:
             self.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
@@ -208,12 +260,12 @@ class Book:
             raise exc
 
         self.log('End of conversion from .docx to .html.')
-        self.log(f'Input file path after conversion: {self.file_path}.')
+        self.log(f'Input file path after conversion: {self.html_path}.')
 
     def check_output_directory(self):
         if self.output_path is None:
             folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-            output_path = os.path.join(folder_path, f'json/{self.file_path.stem}.json')
+            output_path = os.path.join(folder_path, f'json/{self.book_id}.json')
             self.output_path = output_path
 
         self.output_path = pathlib.Path(self.output_path)
@@ -227,7 +279,7 @@ class Book:
         Method for reading .html file into beautiful soup tag.
         """
         try:
-            html_text = open(self.file_path, 'r', encoding='utf8').read()
+            html_text = open(self.html_path, 'r', encoding='utf8').read()
             self.log('HTML for book has been loaded.')
         except FileNotFoundError as exc:
             self.log('There is no html to process. Conversion went wrong or you specified wrong paths.', logging.ERROR)
@@ -549,12 +601,12 @@ class Book:
         if len(img_tags):
             if self.access is None:
                 folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-                new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
+                new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.book_id}/'))
                 new_path.mkdir(exist_ok=True)
 
             for img in img_tags:
                 img_name = img.attrs.get('src')
-                img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
+                img_path = pathlib.Path(f'{self.html_path.parent}/{img_name}')
 
                 if self.access is not None:
                     link = self.access.send_image(img_path, self.book_id)
@@ -955,8 +1007,18 @@ class Book:
         self.write_json()
 
     def test_conversion(self):
-        self.configure_file_logger(self.book_id, filemode='w+')
+        self.configure_file_logger(self.book_id,
+                                   filemode='w+',
+                                   logging_format='%(asctime)s - %(levelname)s - %(message)s',
+                                   logging_level=logging.INFO)
         self.log('Beginning of the test.')
+
+        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        folder_path = os.path.join(folder_path, 'docx')
+        file_path = os.path.join(folder_path, f'{self.book_id}.docx')
+        self.docx_path = pathlib.Path(file_path)
+        self.log(f'Test docx path: {self.docx_path}')
+
         self.convert_doc_to_html()
         self.check_output_directory()
         self.read_html()
@@ -982,11 +1034,9 @@ class Book:
 
 
 if __name__ == "__main__":
-    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
-    out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
+    folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    file = pathlib.Path(os.path.join(folder, 'html/82/82.html'))
+    out_path = pathlib.Path(os.path.join(folder, 'json/82.json'))
 
-    logging_format = '%(asctime)s - %(levelname)s - %(message)s'
-
-    book = Book(file_path=file_path, output_path=out_path)
-    book.convert_from_html(logging_format=logging_format)
+    book = Book(html_path=file, output_path=out_path)
+    book.convert_from_html(logging_format='%(asctime)s - %(levelname)s - %(message)s')