From a6a54abb0a595b0bd698e296a44d1fc94a3b7fe0 Mon Sep 17 00:00:00 2001
From: Kiryl
Date: Thu, 14 Jul 2022 19:13:59 +0300
Subject: [PATCH] Optimize docx2libre_html.py

---
 src/docx_converter/docx2libre_html.py | 102 ++++++++++++--------------
 1 file changed, 47 insertions(+), 55 deletions(-)

diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py
index 889aa25..fbb24fe 100644
--- a/src/docx_converter/docx2libre_html.py
+++ b/src/docx_converter/docx2libre_html.py
@@ -10,12 +10,12 @@ from src.util.helpers import BookLogger
 
 
 class Docx2LibreHTML:
-    def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None):
-        self.book_id = book_id
+    def __init__(self, book_id=0, file_path=None, access=None, logger=None, libre_locker=None):
+        self.book_id = book_id if book_id != 0 else pathlib.Path(
+            file_path).stem
         self.file_path = file_path
         self.access = access
         self.logger_object: BookLogger = logger
-        self.status_wrapper: status_wrapper = status_wrapper
         # critical section for occupying libreoffice by one thread
         self.libre_locker: Event() = libre_locker
 
@@ -24,15 +24,15 @@ class Docx2LibreHTML:
         self.html_soup = self.read_html(self.html_path)
 
     def _libre_run(self, out_dir_path):
-        command = ['libreoffice', '--headless',
-                   '--convert-to', 'html', f'{str(self.file_path)}',
-                   '--outdir', f'{out_dir_path}']
+        command = ["libreoffice", "--headless",
+                   "--convert-to", "html", f"{str(self.file_path)}",
+                   "--outdir", f"{out_dir_path}"]
         print(command)
         result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
-        self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:'
-                               f' {result.returncode}, {result.stdout}', logging.DEBUG)
-        self.logger_object.log(f'Any error while libre conversion for book_'
-                               f'{self.book_id}: {result.stderr}', logging.DEBUG)
+        self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
+                               f" {result.returncode}, {result.stdout}", logging.DEBUG)
+        self.logger_object.log(f"Any error while libre conversion for book_"
+                               f"{self.book_id}: {result.stderr}", logging.DEBUG)
 
     def convert_docx_to_html(self):
         """
@@ -48,82 +48,74 @@ class Docx2LibreHTML:
             path to html file, file appears after libre-conversion
 
         """
-        self.logger_object.log(f'File - {self.file_path}.')
-        print(f'{self.file_path}')
-        self.logger_object.log('Beginning of conversion from .docx to .html.')
+        def get_and_clear_flag(out_dir_path: str):
+            self.libre_locker.clear()
+            self.logger_object.log(f"Got flag!", logging.DEBUG)
+            self._libre_run(out_dir_path)
+            self.libre_locker.set()
+            self.logger_object.log("Cleared flag...", logging.DEBUG)
 
-        try:
-            f = open(self.file_path)
-            f.close()
-        except FileNotFoundError as error:
-            self.logger_object.log(
-                'Invalid path to input data.', logging.ERROR)
-            self.status_wrapper.set_error()
-            raise error
+        def check_file_exists(path, error_string: str):
+            try:
+                f = open(path)
+                f.close()
+            except FileNotFoundError as error:
+                self.logger_object.log(
+                    error_string, logging.ERROR)
+                self.logger_object.log_error_to_main_log()
+                raise error
+
+        self.logger_object.log(f"File - {self.file_path}.")
+        print(f"{self.file_path}")
+        self.logger_object.log("Beginning of conversion from .docx to .html.")
+
+        check_file_exists(
+            self.file_path, error_string="Invalid path to input data.")
 
         folder_path = os.path.dirname(
             os.path.dirname(os.path.abspath(__file__)))
-        out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
+        out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}")
         pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
 
-        is_book_converted = False
         try:
             if self.libre_locker.isSet():
-                self.libre_locker.clear()
-                self.logger_object.log('Got flag...', logging.DEBUG)
-                self._libre_run(out_dir_path)
-                self.libre_locker.set()
-                self.logger_object.log('Cleared flag...', logging.DEBUG)
-
+                get_and_clear_flag(out_dir_path)
             else:
-                while not self.libre_locker.isSet() and not is_book_converted:
+                while not self.libre_locker.isSet():
                     self.logger_object.log(
-                        'Waiting for libre...', logging.DEBUG)
+                        "Waiting for libre...", logging.DEBUG)
                     flag = self.libre_locker.wait(50)
                     if flag:
                         if self.libre_locker.isSet():
-                            self.libre_locker.clear()
-                            self.logger_object.log(f'Got flag!', logging.DEBUG)
-                            self._libre_run(out_dir_path)
-                            self.libre_locker.set()
+                            get_and_clear_flag(out_dir_path)
                             break
-
         except Exception as exc:
             self.logger_object.log(
                 "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
             self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
             raise exc
 
-        out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
+        out_dir_path = os.path.join(out_dir_path, f"{self.book_id}.html")
         html_path = pathlib.Path(out_dir_path)
 
-        try:
-            f = open(html_path)
-            f.close()
-        except FileNotFoundError as exc:
-            self.logger_object.log(
-                "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
-            raise exc
+        check_file_exists(
+            html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.")
 
-        self.logger_object.log('End of conversion from .docx to .html.')
+        self.logger_object.log("End of conversion from .docx to .html.")
         self.logger_object.log(
-            f'Input file path after conversion: {html_path}.')
+            f"Input file path after conversion: {html_path}.")
         return html_path
 
     def read_html(self, html_path):
         """Method for reading .html file into beautiful soup tag."""
         try:
-            html_text = open(html_path, 'r', encoding='utf8').read()
-            self.logger_object.log('HTML for book has been loaded.')
+            html_text = open(html_path, "r", encoding="utf8").read()
+            self.logger_object.log("HTML for book has been loaded.")
         except FileNotFoundError as exc:
-            self.logger_object.log('There is no html to process.'
-                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
+            self.logger_object.log("There is no html to process."
+                                   "Conversion went wrong or you specified wrong paths.", logging.ERROR)
             self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
             raise exc
 
-        html_soup = BeautifulSoup(html_text, features='lxml')
+        html_soup = BeautifulSoup(html_text, features="lxml")
         return html_soup