From 92fe2bc019aa16ebeed2ced84fd07318d289e4e7 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 3 Sep 2021 22:35:34 +0300 Subject: [PATCH] epub converter: Book, EpubBook refactoring --- src/consumer.py | 8 +- src/{docx_converter.py => docx_solver.py} | 144 +++++----------------- src/epub_solver.py | 17 +++ src/{epub_converter.py => solver.py} | 60 ++++----- 4 files changed, 86 insertions(+), 143 deletions(-) rename src/{docx_converter.py => docx_solver.py} (56%) create mode 100644 src/epub_solver.py rename src/{epub_converter.py => solver.py} (72%) diff --git a/src/consumer.py b/src/consumer.py index 8c39e18..240b832 100644 --- a/src/consumer.py +++ b/src/consumer.py @@ -10,8 +10,8 @@ from threading import Event import pika from access import Access -from docx_converter import DocxBook -from epub_converter import EpubBook +from docx_solver import DocxBook +from epub_solver import EpubBook def configure_file_logger(name, filename='logs/converter_log.log', filemode='w+', @@ -52,7 +52,9 @@ def callback(ch, method, properties, body, logger, libra_locker): assert 'apiURL' in data, 'No apiURL field in received message.' assert data.get('fileExtension') in ['epub', 'docx'], 'Wrong book type received.' - book_params = {'access': Access(url=data['apiURL'])} + book_params = { + 'access': Access(url=data['apiURL']), + } if data.get('fileExtension') == 'docx': book_params.update({'libra_locker': libra_locker}) diff --git a/src/docx_converter.py b/src/docx_solver.py similarity index 56% rename from src/docx_converter.py rename to src/docx_solver.py index 6db9b8b..e3147f7 100644 --- a/src/docx_converter.py +++ b/src/docx_solver.py @@ -1,5 +1,3 @@ -import codecs -import json import logging import os import pathlib @@ -8,73 +6,23 @@ from subprocess import PIPE from threading import Event from bs4 import BeautifulSoup - -from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig from html_preprocessor import HTMLPreprocessor from json_postprocessor import JSONConverter +from src.solver import BookSolver -class DocxBook: +class DocxBook(BookSolver): - def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None, - main_logger=None, libra_locker=None, - logging_format='%(asctime)s - %(levelname)s - %(message)s'): - self.book_id = book_id - self.access = access - self.docx_path = docx_path # path to docx file, appears after downloading from server + def __init__(self, book_id=0, access=None, html_path=None, + main_logger=None, libra_locker=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'): + super().__init__(book_id, access, main_logger, logging_format) + self.book_type = 'docx' self.html_path = html_path # path to html file, file appears after libre-conversion - self.output_path = output_path # path to json file self.libra_locker: Event() = libra_locker - self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', - logging_format=logging_format, - book_id=book_id, - main_logger=main_logger) - self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id) - - assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \ - "Length of headers doesn't match allowed levels." - - def save_docx(self, content): - """ - Save binary content of file to .docx. - :param content: binary content of the file. - """ - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - folder_path = os.path.join(folder_path, f'docx/{self.book_id}') - pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) - - file_path = os.path.join(folder_path, f'{self.book_id}.docx') - try: - with open(file_path, 'wb+') as file: - file.write(content) - self.logger_object.log(f'File was saved to folder: {folder_path}.') - except Exception as exc: - self.logger_object.log("Error in writing docx file.", logging.ERROR) - self.logger_object.log_error_to_main_log() - raise exc - - self.docx_path = pathlib.Path(file_path) - - def get_docx(self): - """ - Method for getting and saving book from queue. - """ - try: - self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') - content = self.access.get_doc(self.book_id) - self.logger_object.log('File was received from server.') - self.save_docx(content) - except FileNotFoundError as f_err: - self.logger_object.log("Can't get docx from server.", logging.ERROR) - self.logger_object.log_error_to_main_log() - raise f_err - except Exception as exc: - raise exc - def _libra_run(self, out_dir_path): command = ['libreoffice', '--headless', - '--convert-to', 'html', f'{str(self.docx_path)}', + '--convert-to', 'html', f'{str(self.file_path)}', '--outdir', f'{out_dir_path}'] result = subprocess.run(command, stdout=PIPE, stderr=PIPE) self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG) @@ -84,12 +32,12 @@ class DocxBook: """ Method for convert .docx document to .html file. """ - self.logger_object.log(f'File - {self.docx_path}.') - print(f'{self.docx_path}') + self.logger_object.log(f'File - {self.file_path}.') + print(f'{self.file_path}') self.logger_object.log('Beginning of conversion from .docx to .html.') try: - f = open(self.docx_path) + f = open(self.file_path) f.close() except FileNotFoundError as error: self.logger_object.log('Invalid path to input data.', logging.ERROR) @@ -142,18 +90,6 @@ class DocxBook: self.logger_object.log('End of conversion from .docx to .html.') self.logger_object.log(f'Input file path after conversion: {self.html_path}.') - def check_output_directory(self): - if self.output_path is None: - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - output_path = os.path.join(folder_path, f'json/{self.book_id}.json') - self.output_path = output_path - - self.output_path = pathlib.Path(self.output_path) - self.logger_object.log(f'Output file path: {self.output_path}') - - pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True) - self.output_path.touch(exist_ok=True) - def read_html(self): """ Method for reading .html file into beautiful soup tag. @@ -179,24 +115,6 @@ class DocxBook: f_out.write(body_tag.prettify()) self.logger_object.log(f'Check final prettified html: {file_name}.') - def write_to_json(self, content: dict): - try: - with codecs.open(self.output_path, 'w', encoding='utf-8') as f: - json.dump(content, f, ensure_ascii=False) - self.logger_object.log(f'Data has been saved to .json file: {self.output_path}') - except Exception as exc: - self.logger_object.log('Error has occurred while writing json file.'+ str(exc), logging.ERROR) - - def send_json_content(self, content: dict): - try: - self.access.send_book(self.book_id, content) - self.logger_object.log(f'JSON data has been sent to server.') - except Exception as exc: - self.logger_object.log('Error has occurred while sending json content.', logging.ERROR) - self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() - raise exc - def convert_from_html(self): html_soup = self.read_html() parser = HTMLPreprocessor(html_soup, self.logger_object) @@ -212,8 +130,8 @@ class DocxBook: folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.join(folder_path, f'docx') file_path = os.path.join(folder_path, f'{self.book_id}.docx') - self.docx_path = pathlib.Path(file_path) - self.logger_object.log(f'Test docx path: {self.docx_path}') + self.file_path = pathlib.Path(file_path) + self.logger_object.log(f'Test docx path: {self.file_path}') self.convert_doc_to_html() self.check_output_directory() @@ -229,27 +147,31 @@ class DocxBook: self.write_html_from_list(parser.body_tag) self.logger_object.log('End of the test.') + def get_converted_book(self): + self.convert_doc_to_html() + self.check_output_directory() + + html_soup = self.read_html() + self.logger_object.log('Beginning of processing .html file.') + + parser = HTMLPreprocessor(html_soup, self.logger_object) + content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) + + self.logger_object.log('Beginning of processing json output.') + self.status_wrapper.set_generating() + + json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper) + content_dict = json_converter.convert_to_dict() + return content_dict + def conversion(self): try: self.logger_object.log('Beginning of conversion from .docx to .json.') - self.get_docx() + self.get_book_file() self.status_wrapper.set_processing() - self.convert_doc_to_html() - self.check_output_directory() - - html_soup = self.read_html() - self.logger_object.log('Beginning of processing .html file.') - - parser = HTMLPreprocessor(html_soup, self.logger_object) - content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id) - - self.logger_object.log('Beginning of processing json output.') - self.status_wrapper.set_generating() - - json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper) - content_dict = json_converter.convert_to_dict() + content_dict = self.get_converted_book() self.write_to_json(content_dict) - self.send_json_content(content_dict) + self.send_json_content_to_server(content_dict) self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') except Exception as exc: self.logger_object.log('Error has occurred while conversion.', logging.ERROR) @@ -263,5 +185,5 @@ if __name__ == "__main__": file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html')) out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json')) - book = DocxBook(html_path=file, output_path=out_path) + book = DocxBook(html_path=file) book.convert_from_html() diff --git a/src/epub_solver.py b/src/epub_solver.py new file mode 100644 index 0000000..08ffbcc --- /dev/null +++ b/src/epub_solver.py @@ -0,0 +1,17 @@ +from epub_postprocessor import EpubPostprocessor +from src.solver import BookSolver + + +class EpubBook(BookSolver): + + def __init__(self, book_id=0, access=None, main_logger=None, + logging_format='%(asctime)s - %(levelname)s - %(message)s'): + super().__init__(book_id, access, main_logger, logging_format) + self.book_type = 'epub' + + def get_converted_book(self): + json_converter = EpubPostprocessor(self.file_path, access=self.access, logger=self.logger_object) + content_dict = json_converter.convert_to_dict() + self.status_wrapper.set_generating() + return content_dict + diff --git a/src/epub_converter.py b/src/solver.py similarity index 72% rename from src/epub_converter.py rename to src/solver.py index 8f05584..c43f68a 100644 --- a/src/epub_converter.py +++ b/src/solver.py @@ -1,3 +1,5 @@ +""" This is Interface for solving a task of a book conversion""" + import codecs import json import logging @@ -5,17 +7,15 @@ import os import pathlib from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig -from epub_postprocessor import EpubPostprocessor -class EpubBook: +class BookSolver: - def __init__(self, book_id=0, access=None, - main_logger=None, - logging_format='%(asctime)s - %(levelname)s - %(message)s'): + def __init__(self, book_id=0, access=None, main_logger=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'): + self.book_type = None self.book_id = book_id self.access = access - self.epub_path = None + self.file_path = None # path to book file, appears after downloading from server self.output_path = None # path to json file self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', logging_format=logging_format, @@ -26,36 +26,36 @@ class EpubBook: assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." - def save_epub(self, content): + def save_book_file(self, content): """ - Save binary content of file to .docx. + Save binary content of file to .docx/.epub. :param content: binary content of the file. """ folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - folder_path = os.path.join(folder_path, f'epub/{self.book_id}') + folder_path = os.path.join(folder_path, f'{self.book_type}/{self.book_id}') pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) - file_path = os.path.join(folder_path, f'{self.book_id}.epub') + file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}') try: with open(file_path, 'wb+') as file: file.write(content) self.logger_object.log(f'File was saved to folder: {folder_path}.') except Exception as exc: - self.logger_object.log("Error in writing epub file.", logging.ERROR) + self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR) self.logger_object.log_error_to_main_log() raise exc - self.epub_path = pathlib.Path(file_path) + self.file_path = pathlib.Path(file_path) - def get_epub(self): + def get_book_file(self): """ - Method for getting and saving book from queue. + Method for getting and saving book from server. """ try: self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') content = self.access.get_doc(self.book_id) self.logger_object.log('File was received from server.') - self.save_epub(content) + self.save_book_file(content) except FileNotFoundError as f_err: self.logger_object.log("Can't get docx from server.", logging.ERROR) self.logger_object.log_error_to_main_log() @@ -84,7 +84,7 @@ class EpubBook: except Exception as exc: self.logger_object.log('Error has occurred while writing json file.'+ str(exc), logging.ERROR) - def send_json_content(self, content: dict): + def send_json_content_to_server(self, content: dict): try: self.access.send_book(self.book_id, content) self.logger_object.log(f'JSON data has been sent to server.') @@ -94,31 +94,32 @@ class EpubBook: self.status_wrapper.set_error() raise exc + def get_converted_book(self): + self.logger_object.log('Beginning of processing json output.') + self.status_wrapper.set_generating() + return {} + def test_conversion(self): self.logger_object.log('Beginning of the test.') folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - folder_path = os.path.join(folder_path, f'epub') - file_path = os.path.join(folder_path, f'{self.book_id}.epub') - self.epub_path = pathlib.Path(file_path) - self.logger_object.log(f'Test epub path: {self.epub_path}') - json_converter = EpubPostprocessor(self.epub_path) - content_dict = json_converter.convert_to_dict() + folder_path = os.path.join(folder_path, f'{self.book_type}') + file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}') + self.file_path = pathlib.Path(file_path) + self.logger_object.log(f'Test epub path: {self.file_path}') + content_dict = self.get_converted_book() self.write_to_json(content_dict) self.logger_object.log('End of the test.') def conversion(self): - self.logger_object.log('Beginning of conversion from .docx to .json.') - self.get_epub() + self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.') + self.get_book_file() self.status_wrapper.set_processing() - self.logger_object.log('Beginning of processing json output.') try: - json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object) - content_dict = json_converter.convert_to_dict() - self.status_wrapper.set_generating() + content_dict = self.get_converted_book() self.write_to_json(content_dict) - self.send_json_content(content_dict) + self.send_json_content_to_server(content_dict) self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') except Exception as exc: @@ -126,3 +127,4 @@ class EpubBook: self.logger_object.log_error_to_main_log(str(exc)) self.status_wrapper.set_error() raise exc +