epub converter: add epub book conversion pipeline

This commit is contained in:
shirshasa
2021-04-21 17:28:13 +03:00
parent ea0814fb4c
commit 8f284651c4

121
src/epub_converter.py Normal file
View File

@@ -0,0 +1,121 @@
import codecs
import json
import logging
import os
import pathlib
from config import BookLogger, BookApiWrapper, LawCartaConfig
from src.epub_postprocessor import EpubPostprocessor
class EpubBook:
    """Pipeline that turns one epub book into a LawCarta-format JSON file.

    The object obtains the epub through ``access`` (or from a local test
    path), converts it with ``EpubPostprocessor`` and then writes the
    resulting dict to disk and/or sends it back through ``access``.
    """

    def __init__(self, book_id=0, access=None,
                 main_logger=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """Set up the per-book logger and API wrapper.

        :param book_id: identifier of the book; used in file names and logs.
        :param access: object used to talk to the server; this class calls
            ``access.url``, ``access.get_doc`` and ``access.send_book``.
        :param main_logger: passed through to ``BookLogger``.
        :param logging_format: format string passed through to ``BookLogger``.
        :raises AssertionError: if the configured number of supported levels
            does not match the number of supported headers.
        """
        self.book_id = book_id
        self.access = access
        self.epub_path = None
        self.output_path = None  # path to json file

        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)

        # Configuration sanity check, kept as an assert so the exception type
        # seen by callers does not change.
        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    @staticmethod
    def _project_root():
        """Return the directory one level above the one containing this module."""
        return pathlib.Path(os.path.abspath(__file__)).parent.parent

    def save_epub(self, content):
        """
        Save binary content of file to .epub.

        The file is written to ``<project root>/epub/<book_id>/<book_id>.epub``
        and ``self.epub_path`` is set to that location on success.

        :param content: binary content of the file.
        :raises Exception: any error raised while writing is logged and re-raised.
        """
        folder_path = self._project_root() / 'epub' / str(self.book_id)
        folder_path.mkdir(parents=True, exist_ok=True)
        file_path = folder_path / f'{self.book_id}.epub'

        try:
            with open(file_path, 'wb') as file:
                file.write(content)
            self.logger_object.log(f'File was saved to folder: {folder_path}.')
        except Exception:
            self.logger_object.log("Error in writing epub file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise  # bare raise keeps the original traceback intact

        self.epub_path = file_path

    def get_epub(self):
        """
        Method for getting and saving book from queue.

        Downloads the file via ``self.access.get_doc`` and stores it with
        :meth:`save_epub`.

        :raises FileNotFoundError: logged and re-raised.
        :raises Exception: any other error propagates unchanged.
        """
        try:
            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_epub(content)
        except FileNotFoundError:
            self.logger_object.log("Can't get epub from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def check_output_directory(self):
        """Make sure ``self.output_path`` is a ``pathlib.Path`` to an existing file.

        When no output path was set, it defaults to
        ``<project root>/json/<book_id>.json``. Missing parent directories and
        the file itself are created.
        """
        if self.output_path is None:
            self.output_path = self._project_root() / 'json' / f'{self.book_id}.json'

        # A caller may have assigned a plain string; normalise it.
        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def write_to_json(self, content: dict):
        """Dump ``content`` as UTF-8 JSON into ``self.output_path``.

        Errors while writing are logged and NOT re-raised, so a failed local
        dump does not stop the caller.

        :param content: JSON-serialisable dict with the converted book.
        """
        self.check_output_directory()

        try:
            with open(self.output_path, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            self.logger_object.log(f'Error has occurred while writing json file. {exc}', logging.ERROR)

    def send_json_content(self, content: dict):
        """Send the converted book via ``self.access.send_book``.

        On failure the error is logged, ``set_error_status`` is called on the
        API wrapper and the exception is re-raised.

        :param content: dict with the converted book.
        :raises Exception: whatever the send call (or logging) raised.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.book_api_wrapper.set_error_status()
            raise

    def test_conversion(self):
        """Convert a local epub to JSON without contacting the server.

        Reads ``<project root>/epub/<book_id>.epub`` and writes the result
        with :meth:`write_to_json`.
        """
        self.logger_object.log('Beginning of the test.')

        # NOTE(review): save_epub() stores files under epub/<book_id>/, while
        # this path has no per-book sub-folder -- presumably intentional for
        # hand-placed test files; confirm.
        self.epub_path = self._project_root() / 'epub' / f'{self.book_id}.epub'
        self.logger_object.log(f'Test epub path: {self.epub_path}')

        json_converter = EpubPostprocessor(self.epub_path)
        content_dict = json_converter.convert_to_dict()
        self.write_to_json(content_dict)

        self.logger_object.log('End of the test.')

    def conversion(self):
        """Run the full pipeline: download, convert, save locally and send.

        Calls ``set_process_status`` on the API wrapper after the download
        and ``set_generate_status`` after the conversion.
        """
        self.logger_object.log('Beginning of conversion from .epub to .json.')
        self.get_epub()
        self.book_api_wrapper.set_process_status()

        self.logger_object.log('Beginning of processing json output.')
        json_converter = EpubPostprocessor(self.epub_path, self.access)
        content_dict = json_converter.convert_to_dict()
        self.book_api_wrapper.set_generate_status()

        self.write_to_json(content_dict)
        self.send_json_content(content_dict)
        self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')