From 0d1ec03f575faae27f6c84d75e4c42b3cb3f8227 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 1 Jun 2022 16:24:19 +0300 Subject: [PATCH] Rewrite docx_solver.py --- src/docx_converter/docx2libre_html.py | 129 ++++++++++++++++++ src/docx_converter/docx_solver.py | 181 +++++++------------------- 2 files changed, 178 insertions(+), 132 deletions(-) create mode 100644 src/docx_converter/docx2libre_html.py diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py new file mode 100644 index 0000000..889aa25 --- /dev/null +++ b/src/docx_converter/docx2libre_html.py @@ -0,0 +1,129 @@ +import os +import logging +import pathlib +import subprocess +from subprocess import PIPE +from threading import Event +from bs4 import BeautifulSoup + +from src.util.helpers import BookLogger + + +class Docx2LibreHTML: + def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None): + self.book_id = book_id + self.file_path = file_path + self.access = access + self.logger_object: BookLogger = logger + self.status_wrapper: status_wrapper = status_wrapper + # critical section for occupying libreoffice by one thread + self.libre_locker: Event() = libre_locker + + # path to html file, file appears after libre-conversion + self.html_path = self.convert_docx_to_html() + self.html_soup = self.read_html(self.html_path) + + def _libre_run(self, out_dir_path): + command = ['libreoffice', '--headless', + '--convert-to', 'html', f'{str(self.file_path)}', + '--outdir', f'{out_dir_path}'] + print(command) + result = subprocess.run(command, stdout=PIPE, stderr=PIPE) + self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:' + f' {result.returncode}, {result.stdout}', logging.DEBUG) + self.logger_object.log(f'Any error while libre conversion for book_' + f'{self.book_id}: {result.stderr}', logging.DEBUG) + + def convert_docx_to_html(self): + """ + Function converts .docx document to .html file. + Steps + ---------- + 1. Converts .epub to .html + 2. Parses from line structure to nested structure + + Returns + ---------- + html_path: str + path to html file, file appears after libre-conversion + + """ + self.logger_object.log(f'File - {self.file_path}.') + print(f'{self.file_path}') + self.logger_object.log('Beginning of conversion from .docx to .html.') + + try: + f = open(self.file_path) + f.close() + except FileNotFoundError as error: + self.logger_object.log( + 'Invalid path to input data.', logging.ERROR) + self.status_wrapper.set_error() + raise error + + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}') + pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) + + is_book_converted = False + try: + if self.libre_locker.isSet(): + self.libre_locker.clear() + self.logger_object.log('Got flag...', logging.DEBUG) + self._libre_run(out_dir_path) + self.libre_locker.set() + self.logger_object.log('Cleared flag...', logging.DEBUG) + + else: + while not self.libre_locker.isSet() and not is_book_converted: + self.logger_object.log( + 'Waiting for libre...', logging.DEBUG) + flag = self.libre_locker.wait(50) + if flag: + if self.libre_locker.isSet(): + self.libre_locker.clear() + self.logger_object.log(f'Got flag!', logging.DEBUG) + self._libre_run(out_dir_path) + self.libre_locker.set() + break + + except Exception as exc: + self.logger_object.log( + "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR) + self.logger_object.log_error_to_main_log() + self.status_wrapper.set_error() + raise exc + + out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html') + html_path = pathlib.Path(out_dir_path) + + try: + f = open(html_path) + f.close() + except FileNotFoundError as exc: + self.logger_object.log( + "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR) + self.logger_object.log_error_to_main_log() + self.status_wrapper.set_error() + raise exc + + self.logger_object.log('End of conversion from .docx to .html.') + self.logger_object.log( + f'Input file path after conversion: {html_path}.') + return html_path + + def read_html(self, html_path): + """Method for reading .html file into beautiful soup tag.""" + try: + html_text = open(html_path, 'r', encoding='utf8').read() + self.logger_object.log('HTML for book has been loaded.') + except FileNotFoundError as exc: + self.logger_object.log('There is no html to process.' + 'Conversion went wrong or you specified wrong paths.', logging.ERROR) + self.logger_object.log_error_to_main_log() + self.status_wrapper.set_error() + raise exc + + html_soup = BeautifulSoup(html_text, features='lxml') + return html_soup diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 680a059..b4aa9b3 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -1,154 +1,71 @@ -import os -import logging -import pathlib -import subprocess -from subprocess import PIPE +import json +import codecs from threading import Event -from bs4 import BeautifulSoup -from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor -from src.docx_converter.libra_html2json_converter import LibraHTML2JSONConverter from src.book_solver import BookSolver +from src.util.helpers import BookLogger +from src.docx_converter.docx2libre_html import Docx2LibreHTML +from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor +from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter class DocxBook(BookSolver): """Class of .docx type book - child of BookSolver""" - def __init__(self, book_id=0, access=None, html_path=None, - main_logger=None, libra_locker=None): + def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None): super().__init__(book_id, access, main_logger) self.book_type = 'docx' - self.html_path = html_path # path to html file, file appears after libre-conversion - self.libra_locker: Event() = libra_locker # critical section for occupying libreoffice by one thread - - def _libra_run(self, out_dir_path): - command = ['libreoffice', '--headless', - '--convert-to', 'html', f'{str(self.file_path)}', - '--outdir', f'{out_dir_path}'] - print(command) - result = subprocess.run(command, stdout=PIPE, stderr=PIPE) - self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG) - self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG) - - def convert_doc_to_html(self): - """Method for convert .docx document to .html file.""" - self.logger_object.log(f'File - {self.file_path}.') - print(f'{self.file_path}') - self.logger_object.log('Beginning of conversion from .docx to .html.') - - try: - f = open(self.file_path) - f.close() - except FileNotFoundError as error: - self.logger_object.log('Invalid path to input data.', logging.ERROR) - self.status_wrapper.set_error() - raise error - - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}') - pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) - - is_book_converted = False - try: - if self.libra_locker.isSet(): - self.libra_locker.clear() - self.logger_object.log('Got flag...', logging.DEBUG) - self._libra_run(out_dir_path) - self.libra_locker.set() - self.logger_object.log('Cleared flag...', logging.DEBUG) - - else: - while not self.libra_locker.isSet() and not is_book_converted: - self.logger_object.log('Waiting for libra...', logging.DEBUG) - flag = self.libra_locker.wait(50) - if flag: - if self.libra_locker.isSet(): - self.libra_locker.clear() - self.logger_object.log(f'Got flag!', logging.DEBUG) - self._libra_run(out_dir_path) - self.libra_locker.set() - break - - except Exception as exc: - self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR) - self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() - raise exc - - out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html') - self.html_path = pathlib.Path(out_dir_path) - - try: - f = open(self.html_path) - f.close() - except FileNotFoundError as exc: - self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR) - self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() - raise exc - - self.logger_object.log('End of conversion from .docx to .html.') - self.logger_object.log(f'Input file path after conversion: {self.html_path}.') - - def read_html(self): - """Method for reading .html file into beautiful soup tag.""" - try: - html_text = open(self.html_path, 'r', encoding='utf8').read() - self.logger_object.log('HTML for book has been loaded.') - except FileNotFoundError as exc: - self.logger_object.log('There is no html to process.' - 'Conversion went wrong or you specified wrong paths.', logging.ERROR) - self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() - raise exc - - html_soup = BeautifulSoup(html_text, features='lxml') - return html_soup - - def write_html_from_list(self, body_tag, file_name='json/html_test.html'): - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - file_path = pathlib.Path(os.path.join(folder_path, file_name)) - - with open(file_path, 'w', encoding='utf-8') as f_out: - f_out.write(body_tag.prettify()) - self.logger_object.log(f'Check final prettified html: {file_name}.') - - def convert_from_html(self): - html_soup = self.read_html() - parser = HTMLDocxPreprocessor(html_soup, self.logger_object) - content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) - json_converter = LibraHTML2JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper) - content_dict = json_converter.convert_to_dict() - self.write_to_json(content_dict) - self.write_html_from_list(parser.body_tag) + # critical section for occupying libreoffice by one thread + self.libre_locker: Event() = libre_locker def get_converted_book(self): """ - 1. Convert docx to html with libra office - 2. Parse and clean html, get list of tags, get footnotes - 3. Parse from line structure to nested structure with JSONConverter + Function + Steps + ---------- + 1. Converts docx to html with LibreOffice + 2. Parses and cleans html, gets list of tags, gets footnotes + 3. Parses from line structure to nested structure with JSONConverter + + Returns + ---------- + content_dict + json for LiveCarta platform + """ - self.convert_doc_to_html() - self.check_output_directory() + # 1. Converts docx to html with LibreOffice + html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, + self.logger_object, self.status_wrapper, self.libre_locker) + # TODO presets - html_soup = self.read_html() - self.logger_object.log('Beginning of processing .html file.') + # 2. Parses and cleans html, gets list of tags, gets footnotes + parser = HTMLDocxPreprocessor( + html_converter.html_soup, self.logger_object) + bs_tags, footnotes, top_level_headers = parser.process_html( + self.access, html_converter.html_path, self.book_id) - parser = HTMLDocxPreprocessor(html_soup, self.logger_object) - bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) - - self.logger_object.log('Beginning of processing json output.') - self.status_wrapper.set_generating() - - json_converter = LibraHTML2JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper) + # 3. Parses from line structure to nested structure with JSONConverter + json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers, + self.logger_object, self.status_wrapper) content_dict = json_converter.convert_to_dict() + return content_dict if __name__ == "__main__": - folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html')) - out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json')) + docx_file_path = '../../docx/music_inquiry.docx' + logger_object = BookLogger( + name='docx', book_id=docx_file_path.split('/')[-1]) - book = DocxBook(html_path=file) - book.convert_from_html() \ No newline at end of file + html_converter = Docx2LibreHTML(file_path=docx_file_path) + + parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) + content, footnotes, top_level_headers = parser.process_html( + html_converter.html_path) + + json_converter = LibreHTML2JSONConverter( + content, footnotes, top_level_headers, logger_object) + content_dict = json_converter.convert_to_dict() + + with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f: + json.dump(content_dict, f, ensure_ascii=False)