From 258f3518bfe7c73afeea4ba74f0fc0bc8bb82775 Mon Sep 17 00:00:00 2001 From: Jeniamakarchik Date: Wed, 5 Feb 2020 16:45:12 +0300 Subject: [PATCH] add functionality for api --- src/book.py | 169 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 65 deletions(-) diff --git a/src/book.py b/src/book.py index 89c3ac4..4c278f3 100644 --- a/src/book.py +++ b/src/book.py @@ -28,14 +28,12 @@ class Book: } SUPPORTED_HEADERS = ["h1", "h2", "h3"] - def __init__(self, file_path, output=None, recreate=False, train_mode=False, convert=False, model_location=None): - self.file_path = pathlib.Path(file_path) - self.output_path = output - self.recreate = recreate - self.train_mode = train_mode - self.convert = convert - self.model_location = model_location + def __init__(self, book_id, access=None): + self.book_id = book_id + self.access = access + self.file_path = None + self.output_path = None self.logger = None self.html_soup = None self.body_tag = None @@ -43,37 +41,8 @@ class Book: self.footnotes = list() self.images = list() self.content_dict = dict() - # self.model = HeaderDetector(self.model_location, self.file_path.name) - def parse_args(self): - """ - Method for parsing arguments from command line. - """ - parser = argparse.ArgumentParser(description='Converts .docx/.html documents to .json file with ' - 'LiveCarta book structure.') - parser.add_argument('-f', dest='file_path', type=str, required=True, help='Path to file to be processed.') - parser.add_argument('-o', dest='output', help='Path to output file.', default="") - parser.add_argument('--recreate', nargs='?', const=True, default=False, - help='If output file exist, will overwrite it.') - parser.add_argument('--convert', dest='convert', nargs='?', const=True, default=False, - help='Conversion from .docx to .html with "libreoffice".') - parser.add_argument('--train', dest='train_mode', nargs='?', const=True, default=False, - help='Train mode, takes labeled file (with highlighted paragraphs) and fitted model') - parser.add_argument('-m', dest='model_location', - help='Path to file with fitted model. If does not exist, will be created') - - args = parser.parse_args() - - folder_path = os.path.dirname(os.path.abspath(__file__)) - self.file_path = os.path.join(folder_path, "..", args.file_path) - self.output_path = args.output - self.recreate = args.recreate - self.train_mode = args.train_mode - self.convert = args.convert - self.model_location = args.model_location - # self.model = HeaderDetector(self.model_location, self.file_path.name) - - def configure_file_logger(self, name, attr_name='logger', filename='logs/converter_log.log', filemode='w', + def configure_file_logger(self, name, attr_name='logger', filename='logs/converter_log.log', filemode='w+', logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'): """ Method for Logger configuration. Logger will write in file. @@ -84,14 +53,16 @@ class Book: :param filemode: mode of opening log file. :param logging_level: logging level: 10 - debug, 20 - info, 30 - warning, 40 - error, 50 - critical. :param logging_format: format of record in log file. - :param date_format: format of the date that will be used in record. """ logger = logging.getLogger(name) - if self.file_path: - filename = f'logs/{self.file_path.stem}_log.log' + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if self.book_id: + filename = f'logs/{self.book_id}_log.log' - file_handler = logging.FileHandler(filename, mode=filemode) + file_path = os.path.join(folder_path, filename) + + file_handler = logging.FileHandler(file_path, mode=filemode) # file_format = logging.Formatter(fmt=logging_format, datefmt=date_format) file_format = logging.Formatter(fmt=logging_format) file_handler.setFormatter(file_format) @@ -101,6 +72,58 @@ class Book: setattr(self, attr_name, logger) + def log(self, message, logging_level=20): + """ + Method for logging. + + :param message: body of the message + :param logging_level: level of logging + """ + self.logger.log(msg=message, level=logging_level) + + def save_docx(self, content): + """ + Save binary content of file to .docx. + :param content: binary content of the file. + """ + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + file_path = os.path.join(folder_path, f'docx/{self.book_id}.docx') + with open(file_path, 'wb+') as file: + file.write(content) + + self.file_path = pathlib.Path(file_path) + + def get_docx(self): + """ + Method for getting and saving book from queue. + """ + try: + content = self.access.get_doc(self.book_id) + self.save_docx(content) + except FileNotFoundError as ferr: + self.log('File have not found') + raise ferr + except Exception as exc: + raise exc + + def set_process_status(self): + try: + self.access.update_status(self.book_id, self.access.PROCESS) + except Exception as exc: + raise exc + + def set_generate_status(self): + try: + self.access.update_status(self.book_id, self.access.GENERATE) + except Exception as exc: + raise exc + + def set_error_status(self): + try: + self.access.update_status(self.book_id, self.access.ERROR) + except Exception as exc: + raise exc + def convert_doc_to_html(self): """ Method for convert .docx document to .html file. @@ -114,27 +137,34 @@ class Book: f.close() except FileNotFoundError as error: self.logger.error('Invalid path to input data.') + self.set_error_status() raise error - command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir html' + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + out_dir_path = os.path.join(folder_path, f'html/{self.book_id}') + + command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir {out_dir_path}' os.system(command) - self.file_path = pathlib.Path(f'html/{self.file_path.stem}.html') + out_dir_path = os.path.join(out_dir_path, f'{self.file_path.stem}.html') + self.file_path = pathlib.Path(out_dir_path) try: f = open(self.file_path) f.close() - except FileNotFoundError as e: + except FileNotFoundError as exc: self.logger.error('Conversion has gone wrong.') - raise e + self.set_error_status() + raise exc self.log('End of conversion from .docx to .html.') self.log(f'Input file path after conversion: {self.file_path}.') def check_output_directory(self): - if not self.output_path: - filename = f'{self.file_path.stem}.json' - self.output_path = f'json/{filename}' + if self.output_path is None: + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + output_path = os.path.join(folder_path, f'json/{self.file_path.stem}.json') + self.output_path = output_path self.output_path = pathlib.Path(self.output_path) self.log(f'Output file path: {self.output_path}') @@ -148,9 +178,10 @@ class Book: """ try: html_text = open(self.file_path, 'r', encoding='utf8').read() - except FileNotFoundError as e: + except FileNotFoundError as exc: self.logger.error('There is no html to process. Conversion went wrong or you specified wrong paths.') - raise e + self.set_error_status() + raise exc self.html_soup = BeautifulSoup(html_text, features='lxml') self.body_tag = self.html_soup.body @@ -354,19 +385,21 @@ class Book: imgs = self.body_tag.find_all('img') if len(imgs): - new_path = pathlib.Path(f'json/img_{self.file_path.stem}/') - new_path.mkdir(exist_ok=True) + # new_path = pathlib.Path(f'json/img_{self.file_path.stem}/') + # new_path.mkdir(exist_ok=True) for img in imgs: - img_name = img.attrs.get("src") - img_path = pathlib.Path(f'html/{img_name}') + img_name = img.attrs.get('src') + img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}') - # img_size = os.path.getsize(img_path) # TODO: Implement loading to S3 and then getting link to it. + link = self.access.send_image(img_path, self.book_id) + img.attrs['src'] = link + + # img_size = os.path.getsize(img_path) # print(f'{img_name} successfully loaded. Image size: {img_size}.') - - new_img_path = new_path / img_name - copyfile(img_path, new_img_path) - img.attrs["src"] = str(new_img_path) + # new_img_path = new_path / img_name + # copyfile(img_path, new_img_path) + # img.attrs["src"] = str(new_img_path) self.images = imgs @@ -544,7 +577,7 @@ class Book: else: chapter_title = f'Untitled chapter {ch_num}' chapter = [] - while self.content[ind].name not in self.SUPPORTED_HEADERS: + while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS: chapter.append(self.format_html(str(self.content[ind]))) ind += 1 res = {chapter_title: ["".join(chapter)]} @@ -560,18 +593,24 @@ class Book: with codecs.open(self.output_path, 'w', encoding='utf-8') as f: json.dump(self.content_dict, f, ensure_ascii=False) - def log(self, message, logging_level=20): - self.logger.log(msg=message, level=logging_level) + def send_json_content(self): + try: + self.access.send_book(self.book_id, self.content_dict) + except Exception as exc: + raise exc - def conversion(self, logging_format, filemode='w'): + def conversion(self, logging_format, filemode='w+'): self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode) self.log('Beginning of conversion from .docx to .json.') - if self.convert: - self.convert_doc_to_html() + self.get_docx() + self.set_process_status() + self.convert_doc_to_html() self.check_output_directory() self.read_html() self.clean_trash() self.process_html() + self.set_generate_status() self.convert_to_json() self.write_json() + self.send_json_content() self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')