Update book.py

- add LibreOffice ("libra") conversion locking
- add docx/book_id/ folder
- update test conversion func
- separate file_path into docx_path and html_path
This commit is contained in:
shirshasa
2020-06-19 13:15:36 +03:00
parent 8a32aeb58d
commit 340d68ae5d

View File

@@ -4,6 +4,9 @@ import logging
import os import os
import pathlib import pathlib
import re import re
import subprocess
from subprocess import PIPE
from threading import Event
from copy import copy from copy import copy
from shutil import copyfile from shutil import copyfile
@@ -33,11 +36,14 @@ class Book:
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"} SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None): def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None, main_logger=None,
libra_locker=None):
self.book_id = book_id self.book_id = book_id
self.access = access self.access = access
self.file_path = file_path self.docx_path = docx_path # path to docx file, appears after downloading from server
self.output_path = output_path self.html_path = html_path # path to html file, file appears after libre-conversion
self.output_path = output_path # path to json file
self.libra_locker: Event() = libra_locker
self.main_logger = main_logger self.main_logger = main_logger
self.logger = None self.logger = None
@@ -53,8 +59,11 @@ class Book:
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \ assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels." "Length of headers doesn't match allowed levels."
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+', def configure_file_logger(self, name, attr_name='logger',
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'): filename='logs/book_log.log',
filemode='w+',
logging_level=logging.INFO,
logging_format='%(asctime)s - %(message)s'):
""" """
Method for Logger configuration. Logger will write in file. Method for Logger configuration. Logger will write in file.
@@ -107,7 +116,8 @@ class Book:
:param content: binary content of the file. :param content: binary content of the file.
""" """
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, 'docx') folder_path = os.path.join(folder_path, f'docx/{self.book_id}')
pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
file_path = os.path.join(folder_path, f'{self.book_id}.docx') file_path = os.path.join(folder_path, f'{self.book_id}.docx')
try: try:
@@ -119,7 +129,7 @@ class Book:
self.log_error_to_main_log() self.log_error_to_main_log()
raise exc raise exc
self.file_path = pathlib.Path(file_path) self.docx_path = pathlib.Path(file_path)
def get_docx(self): def get_docx(self):
""" """
@@ -167,16 +177,24 @@ class Book:
self.log_error_to_main_log() self.log_error_to_main_log()
raise exc raise exc
def _libra_run(self, out_dir_path):
    """
    Run LibreOffice in headless mode to convert the .docx file to .html.

    The source document is taken from ``self.docx_path``.
    The process return code and its captured stdout/stderr are written
    to the book log at DEBUG level; the return code is not checked here.

    :param out_dir_path: directory passed to LibreOffice as ``--outdir``.
    """
    libre_args = [
        'libreoffice',
        '--headless',
        '--convert-to',
        'html',
        str(self.docx_path),
        '--outdir',
        str(out_dir_path),
    ]
    # Capture both streams so they end up in the log instead of the console.
    completed = subprocess.run(libre_args, stdout=PIPE, stderr=PIPE)

    book_tag = f'book_{self.book_id}'
    self.log(f'STATUS {book_tag}: {completed.returncode}, {completed.stdout}', logging.DEBUG)
    self.log(f'ERROR {book_tag}: {completed.stderr}', logging.DEBUG)
def convert_doc_to_html(self): def convert_doc_to_html(self):
""" """
Method for convert .docx document to .html file. Method for convert .docx document to .html file.
""" """
self.log(f'File - {self.file_path}.') self.log(f'File - {self.docx_path}.')
print(f'{self.file_path}') print(f'{self.docx_path}')
self.log('Beginning of conversion from .docx to .html.') self.log('Beginning of conversion from .docx to .html.')
try: try:
f = open(self.file_path) f = open(self.docx_path)
f.close() f.close()
except FileNotFoundError as error: except FileNotFoundError as error:
self.log('Invalid path to input data.', logging.ERROR) self.log('Invalid path to input data.', logging.ERROR)
@@ -185,21 +203,40 @@ class Book:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f'html/{self.book_id}') out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
is_book_converted = False
try: try:
command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir {out_dir_path}' if self.libra_locker.isSet():
os.system(command) self.libra_locker.clear()
self.log('Got flag...', logging.DEBUG)
self._libra_run(out_dir_path)
self.libra_locker.set()
self.log('Cleared flag...', logging.DEBUG)
else:
while not self.libra_locker.isSet() and not is_book_converted:
self.log('Waiting for libra...', logging.DEBUG)
flag = self.libra_locker.wait(50)
if flag:
if self.libra_locker.isSet():
self.libra_locker.clear()
self.log(f'Got flag!', logging.DEBUG)
self._libra_run(out_dir_path)
self.libra_locker.set()
break
except Exception as exc: except Exception as exc:
self.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR) self.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
self.log_error_to_main_log() self.log_error_to_main_log()
self.set_error_status() self.set_error_status()
raise exc raise exc
out_dir_path = os.path.join(out_dir_path, f'{self.file_path.stem}.html') out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
self.file_path = pathlib.Path(out_dir_path) self.html_path = pathlib.Path(out_dir_path)
try: try:
f = open(self.file_path) f = open(self.html_path)
f.close() f.close()
except FileNotFoundError as exc: except FileNotFoundError as exc:
self.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR) self.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
@@ -208,12 +245,12 @@ class Book:
raise exc raise exc
self.log('End of conversion from .docx to .html.') self.log('End of conversion from .docx to .html.')
self.log(f'Input file path after conversion: {self.file_path}.') self.log(f'Input file path after conversion: {self.html_path}.')
def check_output_directory(self): def check_output_directory(self):
if self.output_path is None: if self.output_path is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
output_path = os.path.join(folder_path, f'json/{self.file_path.stem}.json') output_path = os.path.join(folder_path, f'json/{self.book_id}.json')
self.output_path = output_path self.output_path = output_path
self.output_path = pathlib.Path(self.output_path) self.output_path = pathlib.Path(self.output_path)
@@ -227,7 +264,7 @@ class Book:
Method for reading .html file into beautiful soup tag. Method for reading .html file into beautiful soup tag.
""" """
try: try:
html_text = open(self.file_path, 'r', encoding='utf8').read() html_text = open(self.html_path, 'r', encoding='utf8').read()
self.log('HTML for book has been loaded.') self.log('HTML for book has been loaded.')
except FileNotFoundError as exc: except FileNotFoundError as exc:
self.log('There is no html to process. Conversion went wrong or you specified wrong paths.', logging.ERROR) self.log('There is no html to process. Conversion went wrong or you specified wrong paths.', logging.ERROR)
@@ -549,12 +586,12 @@ class Book:
if len(img_tags): if len(img_tags):
if self.access is None: if self.access is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/')) new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.book_id}/'))
new_path.mkdir(exist_ok=True) new_path.mkdir(exist_ok=True)
for img in img_tags: for img in img_tags:
img_name = img.attrs.get('src') img_name = img.attrs.get('src')
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}') img_path = pathlib.Path(f'{self.html_path.parent}/{img_name}')
if self.access is not None: if self.access is not None:
link = self.access.send_image(img_path, self.book_id) link = self.access.send_image(img_path, self.book_id)
@@ -955,8 +992,18 @@ class Book:
self.write_json() self.write_json()
def test_conversion(self): def test_conversion(self):
self.configure_file_logger(self.book_id, filemode='w+') self.configure_file_logger(self.book_id,
filemode='w+',
logging_format='%(asctime)s - %(levelname)s - %(message)s',
logging_level=logging.INFO)
self.log('Beginning of the test.') self.log('Beginning of the test.')
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'docx')
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
self.docx_path = pathlib.Path(file_path)
self.log(f'Test docx path: {self.docx_path}')
self.convert_doc_to_html() self.convert_doc_to_html()
self.check_output_directory() self.check_output_directory()
self.read_html() self.read_html()
@@ -982,11 +1029,9 @@ class Book:
if __name__ == "__main__": if __name__ == "__main__":
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html')) file = pathlib.Path(os.path.join(folder, 'html/82/82.html'))
out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json')) out_path = pathlib.Path(os.path.join(folder, 'json/82.json'))
logging_format = '%(asctime)s - %(levelname)s - %(message)s' book = Book(html_path=file, output_path=out_path)
book.convert_from_html(logging_format='%(asctime)s - %(levelname)s - %(message)s')
book = Book(file_path=file_path, output_path=out_path)
book.convert_from_html(logging_format=logging_format)