forked from LiveCarta/BookConverter
Update book.py
- add libra conversion locking - add docx/book_id/ folder - update test conversion func - separate file_path into docx_path and html_path
This commit is contained in:
101
src/book.py
101
src/book.py
@@ -4,6 +4,9 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
|
import subprocess
|
||||||
|
from subprocess import PIPE
|
||||||
|
from threading import Event
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
@@ -33,11 +36,14 @@ class Book:
|
|||||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
|
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
|
||||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||||
|
|
||||||
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None, main_logger=None,
|
||||||
|
libra_locker=None):
|
||||||
self.book_id = book_id
|
self.book_id = book_id
|
||||||
self.access = access
|
self.access = access
|
||||||
self.file_path = file_path
|
self.docx_path = docx_path # path to docx file, appears after downloading from server
|
||||||
self.output_path = output_path
|
self.html_path = html_path # path to html file, file appears after libre-conversion
|
||||||
|
self.output_path = output_path # path to json file
|
||||||
|
self.libra_locker: Event() = libra_locker
|
||||||
self.main_logger = main_logger
|
self.main_logger = main_logger
|
||||||
|
|
||||||
self.logger = None
|
self.logger = None
|
||||||
@@ -53,8 +59,11 @@ class Book:
|
|||||||
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
||||||
"Length of headers doesn't match allowed levels."
|
"Length of headers doesn't match allowed levels."
|
||||||
|
|
||||||
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
def configure_file_logger(self, name, attr_name='logger',
|
||||||
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
filename='logs/book_log.log',
|
||||||
|
filemode='w+',
|
||||||
|
logging_level=logging.INFO,
|
||||||
|
logging_format='%(asctime)s - %(message)s'):
|
||||||
"""
|
"""
|
||||||
Method for Logger configuration. Logger will write in file.
|
Method for Logger configuration. Logger will write in file.
|
||||||
|
|
||||||
@@ -107,7 +116,8 @@ class Book:
|
|||||||
:param content: binary content of the file.
|
:param content: binary content of the file.
|
||||||
"""
|
"""
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
folder_path = os.path.join(folder_path, 'docx')
|
folder_path = os.path.join(folder_path, f'docx/{self.book_id}')
|
||||||
|
pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
|
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
|
||||||
try:
|
try:
|
||||||
@@ -119,7 +129,7 @@ class Book:
|
|||||||
self.log_error_to_main_log()
|
self.log_error_to_main_log()
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
self.file_path = pathlib.Path(file_path)
|
self.docx_path = pathlib.Path(file_path)
|
||||||
|
|
||||||
def get_docx(self):
|
def get_docx(self):
|
||||||
"""
|
"""
|
||||||
@@ -167,16 +177,24 @@ class Book:
|
|||||||
self.log_error_to_main_log()
|
self.log_error_to_main_log()
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
|
def _libra_run(self, out_dir_path):
|
||||||
|
command = ['libreoffice', '--headless',
|
||||||
|
'--convert-to', 'html', f'{str(self.docx_path)}',
|
||||||
|
'--outdir', f'{out_dir_path}']
|
||||||
|
result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
|
||||||
|
self.log(f'STATUS book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
|
||||||
|
self.log(f'ERROR book_{self.book_id}: {result.stderr}', logging.DEBUG)
|
||||||
|
|
||||||
def convert_doc_to_html(self):
|
def convert_doc_to_html(self):
|
||||||
"""
|
"""
|
||||||
Method for convert .docx document to .html file.
|
Method for convert .docx document to .html file.
|
||||||
"""
|
"""
|
||||||
self.log(f'File - {self.file_path}.')
|
self.log(f'File - {self.docx_path}.')
|
||||||
print(f'{self.file_path}')
|
print(f'{self.docx_path}')
|
||||||
self.log('Beginning of conversion from .docx to .html.')
|
self.log('Beginning of conversion from .docx to .html.')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
f = open(self.file_path)
|
f = open(self.docx_path)
|
||||||
f.close()
|
f.close()
|
||||||
except FileNotFoundError as error:
|
except FileNotFoundError as error:
|
||||||
self.log('Invalid path to input data.', logging.ERROR)
|
self.log('Invalid path to input data.', logging.ERROR)
|
||||||
@@ -185,21 +203,40 @@ class Book:
|
|||||||
|
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
|
out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
|
||||||
|
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
is_book_converted = False
|
||||||
try:
|
try:
|
||||||
command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir {out_dir_path}'
|
if self.libra_locker.isSet():
|
||||||
os.system(command)
|
self.libra_locker.clear()
|
||||||
|
self.log('Got flag...', logging.DEBUG)
|
||||||
|
self._libra_run(out_dir_path)
|
||||||
|
self.libra_locker.set()
|
||||||
|
self.log('Cleared flag...', logging.DEBUG)
|
||||||
|
|
||||||
|
else:
|
||||||
|
while not self.libra_locker.isSet() and not is_book_converted:
|
||||||
|
self.log('Waiting for libra...', logging.DEBUG)
|
||||||
|
flag = self.libra_locker.wait(50)
|
||||||
|
if flag:
|
||||||
|
if self.libra_locker.isSet():
|
||||||
|
self.libra_locker.clear()
|
||||||
|
self.log(f'Got flag!', logging.DEBUG)
|
||||||
|
self._libra_run(out_dir_path)
|
||||||
|
self.libra_locker.set()
|
||||||
|
break
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
self.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
|
self.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
|
||||||
self.log_error_to_main_log()
|
self.log_error_to_main_log()
|
||||||
self.set_error_status()
|
self.set_error_status()
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
out_dir_path = os.path.join(out_dir_path, f'{self.file_path.stem}.html')
|
out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
|
||||||
self.file_path = pathlib.Path(out_dir_path)
|
self.html_path = pathlib.Path(out_dir_path)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
f = open(self.file_path)
|
f = open(self.html_path)
|
||||||
f.close()
|
f.close()
|
||||||
except FileNotFoundError as exc:
|
except FileNotFoundError as exc:
|
||||||
self.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
|
self.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
|
||||||
@@ -208,12 +245,12 @@ class Book:
|
|||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
self.log('End of conversion from .docx to .html.')
|
self.log('End of conversion from .docx to .html.')
|
||||||
self.log(f'Input file path after conversion: {self.file_path}.')
|
self.log(f'Input file path after conversion: {self.html_path}.')
|
||||||
|
|
||||||
def check_output_directory(self):
|
def check_output_directory(self):
|
||||||
if self.output_path is None:
|
if self.output_path is None:
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
output_path = os.path.join(folder_path, f'json/{self.file_path.stem}.json')
|
output_path = os.path.join(folder_path, f'json/{self.book_id}.json')
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
|
|
||||||
self.output_path = pathlib.Path(self.output_path)
|
self.output_path = pathlib.Path(self.output_path)
|
||||||
@@ -227,7 +264,7 @@ class Book:
|
|||||||
Method for reading .html file into beautiful soup tag.
|
Method for reading .html file into beautiful soup tag.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
html_text = open(self.file_path, 'r', encoding='utf8').read()
|
html_text = open(self.html_path, 'r', encoding='utf8').read()
|
||||||
self.log('HTML for book has been loaded.')
|
self.log('HTML for book has been loaded.')
|
||||||
except FileNotFoundError as exc:
|
except FileNotFoundError as exc:
|
||||||
self.log('There is no html to process. Conversion went wrong or you specified wrong paths.', logging.ERROR)
|
self.log('There is no html to process. Conversion went wrong or you specified wrong paths.', logging.ERROR)
|
||||||
@@ -549,12 +586,12 @@ class Book:
|
|||||||
if len(img_tags):
|
if len(img_tags):
|
||||||
if self.access is None:
|
if self.access is None:
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
|
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.book_id}/'))
|
||||||
new_path.mkdir(exist_ok=True)
|
new_path.mkdir(exist_ok=True)
|
||||||
|
|
||||||
for img in img_tags:
|
for img in img_tags:
|
||||||
img_name = img.attrs.get('src')
|
img_name = img.attrs.get('src')
|
||||||
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
img_path = pathlib.Path(f'{self.html_path.parent}/{img_name}')
|
||||||
|
|
||||||
if self.access is not None:
|
if self.access is not None:
|
||||||
link = self.access.send_image(img_path, self.book_id)
|
link = self.access.send_image(img_path, self.book_id)
|
||||||
@@ -955,8 +992,18 @@ class Book:
|
|||||||
self.write_json()
|
self.write_json()
|
||||||
|
|
||||||
def test_conversion(self):
|
def test_conversion(self):
|
||||||
self.configure_file_logger(self.book_id, filemode='w+')
|
self.configure_file_logger(self.book_id,
|
||||||
|
filemode='w+',
|
||||||
|
logging_format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
logging_level=logging.INFO)
|
||||||
self.log('Beginning of the test.')
|
self.log('Beginning of the test.')
|
||||||
|
|
||||||
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
folder_path = os.path.join(folder_path, f'docx')
|
||||||
|
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
|
||||||
|
self.docx_path = pathlib.Path(file_path)
|
||||||
|
self.log(f'Test docx path: {self.docx_path}')
|
||||||
|
|
||||||
self.convert_doc_to_html()
|
self.convert_doc_to_html()
|
||||||
self.check_output_directory()
|
self.check_output_directory()
|
||||||
self.read_html()
|
self.read_html()
|
||||||
@@ -982,11 +1029,9 @@ class Book:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
|
file = pathlib.Path(os.path.join(folder, 'html/82/82.html'))
|
||||||
out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
|
out_path = pathlib.Path(os.path.join(folder, 'json/82.json'))
|
||||||
|
|
||||||
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
|
book = Book(html_path=file, output_path=out_path)
|
||||||
|
book.convert_from_html(logging_format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
book = Book(file_path=file_path, output_path=out_path)
|
|
||||||
book.convert_from_html(logging_format=logging_format)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user