add functionality for api

This commit is contained in:
Jeniamakarchik
2020-02-05 16:45:12 +03:00
parent d13dab2d9e
commit 258f3518bf

View File

@@ -28,14 +28,12 @@ class Book:
}
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
def __init__(self, file_path, output=None, recreate=False, train_mode=False, convert=False, model_location=None):
self.file_path = pathlib.Path(file_path)
self.output_path = output
self.recreate = recreate
self.train_mode = train_mode
self.convert = convert
self.model_location = model_location
def __init__(self, book_id, access=None):
self.book_id = book_id
self.access = access
self.file_path = None
self.output_path = None
self.logger = None
self.html_soup = None
self.body_tag = None
@@ -43,37 +41,8 @@ class Book:
self.footnotes = list()
self.images = list()
self.content_dict = dict()
# self.model = HeaderDetector(self.model_location, self.file_path.name)
def parse_args(self):
    """
    Method for parsing arguments from command line.

    Reads ``sys.argv`` through ``argparse`` and stores the parsed values on the instance:
    ``file_path``, ``output_path``, ``recreate``, ``train_mode``, ``convert`` and ``model_location``.
    The input path is resolved relative to the parent of the folder that contains this module.
    """
    parser = argparse.ArgumentParser(description='Converts .docx/.html documents to .json file with '
                                                 'LiveCarta book structure.')
    parser.add_argument('-f', dest='file_path', type=str, required=True, help='Path to file to be processed.')
    parser.add_argument('-o', dest='output', help='Path to output file.', default="")
    parser.add_argument('--recreate', nargs='?', const=True, default=False,
                        help='If output file exist, will overwrite it.')
    parser.add_argument('--convert', dest='convert', nargs='?', const=True, default=False,
                        help='Conversion from .docx to .html with "libreoffice".')
    parser.add_argument('--train', dest='train_mode', nargs='?', const=True, default=False,
                        help='Train mode, takes labeled file (with highlighted paragraphs) and fitted model')
    parser.add_argument('-m', dest='model_location',
                        help='Path to file with fitted model. If does not exist, will be created')
    args = parser.parse_args()

    folder_path = os.path.dirname(os.path.abspath(__file__))
    # Store a pathlib.Path rather than a plain string: other methods of the class
    # read ``self.file_path.stem``, which a str does not provide.
    self.file_path = pathlib.Path(os.path.join(folder_path, "..", args.file_path))
    self.output_path = args.output
    self.recreate = args.recreate
    self.train_mode = args.train_mode
    self.convert = args.convert
    self.model_location = args.model_location
# self.model = HeaderDetector(self.model_location, self.file_path.name)
def configure_file_logger(self, name, attr_name='logger', filename='logs/converter_log.log', filemode='w',
def configure_file_logger(self, name, attr_name='logger', filename='logs/converter_log.log', filemode='w+',
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
"""
Method for Logger configuration. Logger will write in file.
@@ -84,14 +53,16 @@ class Book:
:param filemode: mode of opening log file.
:param logging_level: logging level: 10 - debug, 20 - info, 30 - warning, 40 - error, 50 - critical.
:param logging_format: format of record in log file.
:param date_format: format of the date that will be used in record.
"""
logger = logging.getLogger(name)
if self.file_path:
filename = f'logs/{self.file_path.stem}_log.log'
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if self.book_id:
filename = f'logs/{self.book_id}_log.log'
file_handler = logging.FileHandler(filename, mode=filemode)
file_path = os.path.join(folder_path, filename)
file_handler = logging.FileHandler(file_path, mode=filemode)
# file_format = logging.Formatter(fmt=logging_format, datefmt=date_format)
file_format = logging.Formatter(fmt=logging_format)
file_handler.setFormatter(file_format)
@@ -101,6 +72,58 @@ class Book:
setattr(self, attr_name, logger)
def log(self, message, logging_level=20):
    """
    Write a record to the logger stored in ``self.logger``.

    :param message: text of the record.
    :param logging_level: numeric level the record is emitted with.
    """
    record = {'msg': message, 'level': logging_level}
    self.logger.log(**record)
def save_docx(self, content):
    """
    Save binary content of file to .docx.

    The file is written to ``docx/<book_id>.docx`` under the parent of the folder that
    contains this module, and ``self.file_path`` is pointed at it afterwards.

    :param content: binary content of the file.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    file_path = os.path.join(folder_path, f'docx/{self.book_id}.docx')

    # Create the target folder on demand; otherwise open() raises FileNotFoundError
    # for a missing directory, which is indistinguishable from a missing book.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Plain write mode is enough: the file is never read back through this handle.
    with open(file_path, 'wb') as file:
        file.write(content)

    self.file_path = pathlib.Path(file_path)
def get_docx(self):
    """
    Fetch the book's document through ``self.access`` and save it with ``save_docx``.

    :raises FileNotFoundError: re-raised after the failure has been logged.
    Any other exception raised while fetching or saving propagates unchanged.
    """
    try:
        content = self.access.get_doc(self.book_id)
        self.save_docx(content)
    except FileNotFoundError:
        # Logged at ERROR level, like the other failure paths of the class
        # that report through ``self.logger.error``.
        self.log('File have not found', logging_level=logging.ERROR)
        raise
def set_process_status(self):
    """
    Report the ``self.access.PROCESS`` status for this book via ``self.access.update_status``.

    Exceptions raised by the call propagate to the caller unchanged.
    """
    # No try/except here: catching an exception only to re-raise it adds nothing.
    self.access.update_status(self.book_id, self.access.PROCESS)
def set_generate_status(self):
    """
    Report the ``self.access.GENERATE`` status for this book via ``self.access.update_status``.

    Exceptions raised by the call propagate to the caller unchanged.
    """
    # No try/except here: catching an exception only to re-raise it adds nothing.
    self.access.update_status(self.book_id, self.access.GENERATE)
def set_error_status(self):
    """
    Report the ``self.access.ERROR`` status for this book via ``self.access.update_status``.

    Exceptions raised by the call propagate to the caller unchanged.
    """
    # No try/except here: catching an exception only to re-raise it adds nothing.
    self.access.update_status(self.book_id, self.access.ERROR)
def convert_doc_to_html(self):
"""
Method for convert .docx document to .html file.
@@ -114,27 +137,34 @@ class Book:
f.close()
except FileNotFoundError as error:
self.logger.error('Invalid path to input data.')
self.set_error_status()
raise error
command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir html'
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir {out_dir_path}'
os.system(command)
self.file_path = pathlib.Path(f'html/{self.file_path.stem}.html')
out_dir_path = os.path.join(out_dir_path, f'{self.file_path.stem}.html')
self.file_path = pathlib.Path(out_dir_path)
try:
f = open(self.file_path)
f.close()
except FileNotFoundError as e:
except FileNotFoundError as exc:
self.logger.error('Conversion has gone wrong.')
raise e
self.set_error_status()
raise exc
self.log('End of conversion from .docx to .html.')
self.log(f'Input file path after conversion: {self.file_path}.')
def check_output_directory(self):
if not self.output_path:
filename = f'{self.file_path.stem}.json'
self.output_path = f'json/{filename}'
if self.output_path is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
output_path = os.path.join(folder_path, f'json/{self.file_path.stem}.json')
self.output_path = output_path
self.output_path = pathlib.Path(self.output_path)
self.log(f'Output file path: {self.output_path}')
@@ -148,9 +178,10 @@ class Book:
"""
try:
html_text = open(self.file_path, 'r', encoding='utf8').read()
except FileNotFoundError as e:
except FileNotFoundError as exc:
self.logger.error('There is no html to process. Conversion went wrong or you specified wrong paths.')
raise e
self.set_error_status()
raise exc
self.html_soup = BeautifulSoup(html_text, features='lxml')
self.body_tag = self.html_soup.body
@@ -354,19 +385,21 @@ class Book:
imgs = self.body_tag.find_all('img')
if len(imgs):
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
new_path.mkdir(exist_ok=True)
# new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
# new_path.mkdir(exist_ok=True)
for img in imgs:
img_name = img.attrs.get("src")
img_path = pathlib.Path(f'html/{img_name}')
img_name = img.attrs.get('src')
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
# img_size = os.path.getsize(img_path) # TODO: Implement loading to S3 and then getting link to it.
link = self.access.send_image(img_path, self.book_id)
img.attrs['src'] = link
# img_size = os.path.getsize(img_path)
# print(f'{img_name} successfully loaded. Image size: {img_size}.')
new_img_path = new_path / img_name
copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path)
# new_img_path = new_path / img_name
# copyfile(img_path, new_img_path)
# img.attrs["src"] = str(new_img_path)
self.images = imgs
@@ -544,7 +577,7 @@ class Book:
else:
chapter_title = f'Untitled chapter {ch_num}'
chapter = []
while self.content[ind].name not in self.SUPPORTED_HEADERS:
while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:
chapter.append(self.format_html(str(self.content[ind])))
ind += 1
res = {chapter_title: ["".join(chapter)]}
@@ -560,18 +593,24 @@ class Book:
with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
json.dump(self.content_dict, f, ensure_ascii=False)
def log(self, message, logging_level=20):
    """
    Pass ``message`` to ``self.logger`` at the given numeric level.

    :param message: text of the record.
    :param logging_level: numeric level the record is emitted with.
    """
    emit = self.logger.log
    emit(level=logging_level, msg=message)
def send_json_content(self):
    """
    Hand ``self.content_dict`` for this book to ``self.access.send_book``.

    Exceptions raised by the call propagate to the caller unchanged.
    """
    # No try/except here: catching an exception only to re-raise it adds nothing.
    self.access.send_book(self.book_id, self.content_dict)
def conversion(self, logging_format, filemode='w'):
def conversion(self, logging_format, filemode='w+'):
self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
self.log('Beginning of conversion from .docx to .json.')
if self.convert:
self.convert_doc_to_html()
self.get_docx()
self.set_process_status()
self.convert_doc_to_html()
self.check_output_directory()
self.read_html()
self.clean_trash()
self.process_html()
self.set_generate_status()
self.convert_to_json()
self.write_json()
self.send_json_content()
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')