# Provenance (recovered from the patch envelope this file was extracted from):
#   commit:  8f284651c40d00649ad35c3e489767b9893386c2
#   author:  shirshasa
#   date:    Wed, 21 Apr 2021 17:28:13 +0300
#   subject: epub converter: add epub book conversion pipeline
#   path:    src/epub_converter.py (new file)
"""Conversion pipeline that turns one epub book into a JSON document."""
import codecs
import json
import logging
import os
import pathlib

from config import BookLogger, BookApiWrapper, LawCartaConfig
from src.epub_postprocessor import EpubPostprocessor


class EpubBook:
    """
    Fetch a single epub book, convert it to a dict and store/send it as JSON.

    Attributes set by ``__init__``:
        book_id: identifier used in file names, log names and API calls.
        access: object used to talk to the server; this class calls
            ``access.url``, ``access.get_doc`` and ``access.send_book`` on it.
        epub_path: ``pathlib.Path`` of the local epub file, ``None`` until
            ``save_epub`` or ``test_conversion`` has run.
        output_path: path of the resulting json file, ``None`` until
            ``check_output_directory`` has run (unless set by the caller).
    """

    def __init__(self, book_id=0, access=None,
                 main_logger=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """
        :param book_id: identifier of the book to convert.
        :param access: server access object (may be None for local test runs).
        :param main_logger: passed through to ``BookLogger``.
        :param logging_format: passed through to ``BookLogger``.
        :raises AssertionError: if the configured number of supported levels
            differs from the number of supported headers.
        """
        self.book_id = book_id
        self.access = access
        self.epub_path = None
        self.output_path = None  # path to json file
        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)

        # Configuration sanity check (kept as an assert so the exception type
        # seen by existing callers does not change).
        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    @staticmethod
    def _project_root():
        """Return the directory one level above the folder containing this module."""
        return pathlib.Path(os.path.abspath(__file__)).parent.parent

    def save_epub(self, content):
        """
        Save binary content of the book to ``epub/<book_id>/<book_id>.epub``.

        On success ``self.epub_path`` points to the written file.

        :param content: binary content of the file.
        :raises Exception: whatever the write raised, after it was logged.
        """
        folder_path = self._project_root() / 'epub' / str(self.book_id)
        folder_path.mkdir(parents=True, exist_ok=True)

        file_path = folder_path / f'{self.book_id}.epub'
        try:
            with open(file_path, 'wb') as file:
                file.write(content)
        except Exception:
            self.logger_object.log("Error in writing epub file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        self.logger_object.log(f'File was saved to folder: {folder_path}.')
        self.epub_path = file_path

    def get_epub(self):
        """
        Download the book via ``self.access`` and store it with ``save_epub``.

        :raises FileNotFoundError: re-raised after being logged.
        :raises Exception: any other error propagates unchanged and unlogged.
        """
        try:
            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_epub(content)
        except FileNotFoundError:
            self.logger_object.log("Can't get epub from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def check_output_directory(self):
        """
        Make sure ``self.output_path`` is set and the file exists.

        Defaults to ``json/<book_id>.json`` under the project root when no
        output path was set; creates missing parent folders and an empty file.
        """
        if self.output_path is None:
            self.output_path = self._project_root() / 'json' / f'{self.book_id}.json'

        # A caller may have assigned a plain string; normalise to Path.
        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def write_to_json(self, content: dict):
        """
        Dump ``content`` as UTF-8 JSON into ``self.output_path``.

        :param content: converted book; must be serialisable by ``json.dump``.
        """
        self.check_output_directory()
        try:
            with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            # NOTE(review): the error is logged but not re-raised, so callers
            # carry on after a failed write - confirm this is intended.
            self.logger_object.log(f'Error has occurred while writing json file. {exc}', logging.ERROR)

    def send_json_content(self, content: dict):
        """
        Send the converted book to the server via ``self.access.send_book``.

        On failure the error is logged, the error status is set through the
        API wrapper, and the exception is re-raised.

        :param content: converted book.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.book_api_wrapper.set_error_status()
            raise

    def test_conversion(self):
        """
        Convert a local epub without contacting the server.

        Reads ``epub/<book_id>.epub`` directly under the project root (note:
        no per-book subfolder, unlike ``save_epub``) and only writes the json
        file; nothing is sent and no status is updated.
        """
        self.logger_object.log('Beginning of the test.')

        self.epub_path = self._project_root() / 'epub' / f'{self.book_id}.epub'
        self.logger_object.log(f'Test epub path: {self.epub_path}')
        json_converter = EpubPostprocessor(self.epub_path)
        content_dict = json_converter.convert_to_dict()
        self.write_to_json(content_dict)
        self.logger_object.log('End of the test.')

    def conversion(self):
        """
        Run the full pipeline: download, convert, write json, send to server.

        Status updates go through ``self.book_api_wrapper`` before processing
        (``set_process_status``) and after conversion (``set_generate_status``).
        """
        self.logger_object.log('Beginning of conversion from .epub to .json.')
        self.get_epub()
        self.book_api_wrapper.set_process_status()
        self.logger_object.log('Beginning of processing json output.')

        json_converter = EpubPostprocessor(self.epub_path, self.access)
        content_dict = json_converter.convert_to_dict()
        self.book_api_wrapper.set_generate_status()
        self.write_to_json(content_dict)
        self.send_json_content(content_dict)
        self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')