forked from LiveCarta/BookConverter
268 lines
12 KiB
Python
268 lines
12 KiB
Python
import codecs
|
|
import json
|
|
import logging
|
|
import os
|
|
import pathlib
|
|
import subprocess
|
|
from subprocess import PIPE
|
|
from threading import Event
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
|
|
from html_preprocessor import HTMLPreprocessor
|
|
from json_postprocessor import JSONConverter
|
|
|
|
|
|
class Book:
    """
    Conversion pipeline for a single book: .docx -> .html (LibreOffice) -> .json.

    The instance keeps the paths of the intermediate files, a per-book logger and a
    status wrapper that is switched to the error state whenever a step fails.
    """

    def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None,
                 main_logger=None, libra_locker=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """
        :param book_id: identifier of the book; used in file names, folders and log records.
        :param access: server access object; this class uses its ``url`` attribute and its
            ``get_doc(book_id)`` / ``send_book(book_id, content)`` methods.
        :param docx_path: path to the .docx file (set by ``save_docx`` after downloading).
        :param html_path: path to the .html file (set by ``convert_doc_to_html``).
        :param output_path: path to the resulting .json file.
        :param main_logger: passed through to ``BookLogger``.
        :param libra_locker: ``threading.Event`` shared between workers; a *set* flag means
            LibreOffice is free. ``None`` disables the coordination.
        :param logging_format: format string passed through to ``BookLogger``.
        """
        self.book_id = book_id
        self.access = access
        self.docx_path = docx_path  # path to docx file, appears after downloading from server
        self.html_path = html_path  # path to html file, file appears after libre-conversion
        self.output_path = output_path  # path to json file
        self.libra_locker: Event = libra_locker

        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)

        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    @staticmethod
    def _project_root():
        """Return the directory two levels above this file (parent of the module's folder)."""
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def save_docx(self, content):
        """
        Save binary content of file to .docx.

        The file is written to ``<project root>/docx/<book_id>/<book_id>.docx`` and
        ``self.docx_path`` is updated to point at it.

        :param content: binary content of the file.
        :raises Exception: any error raised while writing is logged and re-raised.
        """
        folder_path = os.path.join(self._project_root(), f'docx/{self.book_id}')
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        file_path = os.path.join(folder_path, f'{self.book_id}.docx')
        try:
            with open(file_path, 'wb') as file:
                file.write(content)
            self.logger_object.log(f'File was saved to folder: {folder_path}.')
        except Exception:
            self.logger_object.log("Error in writing docx file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        self.docx_path = pathlib.Path(file_path)

    def get_docx(self):
        """
        Method for getting and saving book from queue.

        :raises FileNotFoundError: logged to the main log and re-raised.
            Any other exception propagates unchanged.
        """
        try:
            self.logger_object.log(
                f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_docx(content)
        except FileNotFoundError:
            self.logger_object.log("Can't get docx from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def _libra_run(self, out_dir_path):
        """
        Run headless LibreOffice to convert ``self.docx_path`` to html.

        The return code, stdout and stderr are only logged (DEBUG); a non-zero return code
        does not raise here - the caller verifies that the html file exists.

        :param out_dir_path: directory passed to LibreOffice as ``--outdir``.
        """
        command = ['libreoffice', '--headless',
                   '--convert-to', 'html', f'{str(self.docx_path)}',
                   '--outdir', f'{out_dir_path}']
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(
            f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}',
            logging.DEBUG)
        self.logger_object.log(
            f'Any error while libra conversion for book_{self.book_id}: {result.stderr}',
            logging.DEBUG)

    def _run_libra_exclusively(self, out_dir_path):
        """Run ``_libra_run`` while holding the shared ``libra_locker`` flag, if there is one."""
        locker = self.libra_locker
        if locker is None:
            # No shared flag was supplied (stand-alone usage), so there is nothing to wait for.
            self._libra_run(out_dir_path)
            return

        # A set flag means LibreOffice is free. Re-check after every wait so a flag that
        # becomes set at any moment is always noticed and the conversion is never skipped.
        while not locker.is_set():
            self.logger_object.log('Waiting for libra...', logging.DEBUG)
            locker.wait(50)

        # NOTE(review): check-then-clear on an Event is not atomic; two workers may still
        # pass the check together. A threading.Lock would be - confirm with the code that
        # creates the shared flag before changing its type.
        locker.clear()
        self.logger_object.log('Got flag...', logging.DEBUG)
        try:
            self._libra_run(out_dir_path)
        finally:
            # Always hand the flag back, otherwise every other waiting worker blocks forever.
            locker.set()
            self.logger_object.log('Cleared flag...', logging.DEBUG)

    def convert_doc_to_html(self):
        """
        Method for convert .docx document to .html file.

        The html is produced in ``<project root>/html/<book_id>/`` and ``self.html_path``
        is set to ``<book_id>.html`` inside that folder.

        :raises FileNotFoundError: if the input .docx or the produced .html does not exist.
        :raises Exception: any error raised while running LibreOffice is logged and re-raised.
        """
        self.logger_object.log(f'File - {self.docx_path}.')
        self.logger_object.log('Beginning of conversion from .docx to .html.')

        try:
            # Opening the file is only a probe that the input exists and is readable.
            with open(self.docx_path):
                pass
        except FileNotFoundError:
            self.logger_object.log('Invalid path to input data.', logging.ERROR)
            self.status_wrapper.set_error()
            raise

        out_dir_path = os.path.join(self._project_root(), f'html/{self.book_id}')
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._run_libra_exclusively(out_dir_path)
        except Exception:
            self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.html_path = pathlib.Path(os.path.join(out_dir_path, f'{self.book_id}.html'))

        try:
            # LibreOffice failures are not raised by _libra_run, so check its output here.
            with open(self.html_path):
                pass
        except FileNotFoundError:
            self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.logger_object.log('End of conversion from .docx to .html.')
        self.logger_object.log(f'Input file path after conversion: {self.html_path}.')

    def check_output_directory(self):
        """
        Make sure the output .json file and its folder exist.

        When no output path was given, ``<project root>/json/<book_id>.json`` is used.
        ``self.output_path`` is always a ``pathlib.Path`` afterwards.
        """
        if self.output_path is None:
            self.output_path = os.path.join(self._project_root(), f'json/{self.book_id}.json')

        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def read_html(self):
        """
        Method for reading .html file into beautiful soup tag.

        :return: ``BeautifulSoup`` object built with the ``lxml`` parser.
        :raises FileNotFoundError: if ``self.html_path`` does not exist (logged, status set to error).
        """
        try:
            with open(self.html_path, 'r', encoding='utf8') as html_file:
                html_text = html_file.read()
            self.logger_object.log('HTML for book has been loaded.')
        except FileNotFoundError:
            self.logger_object.log('There is no html to process. '
                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        return BeautifulSoup(html_text, features='lxml')

    def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
        """
        Dump the prettified processed html for manual inspection.

        :param body_tag: object with a ``prettify()`` method returning the html text.
        :param file_name: target path relative to the project root.
        """
        file_path = pathlib.Path(os.path.join(self._project_root(), file_name))
        # The target folder is not guaranteed to exist (e.g. when an explicit output path is used).
        file_path.parent.mkdir(parents=True, exist_ok=True)

        with open(file_path, 'w', encoding='utf-8') as f_out:
            f_out.write(body_tag.prettify())
        self.logger_object.log(f'Check final prettified html: {file_name}.')

    def write_to_json(self, content: dict):
        """
        Write the converted book to ``self.output_path`` as UTF-8 json.

        Errors are logged and not re-raised: the surrounding pipeline continues
        (for example, it still sends the content to the server).

        :param content: json-serializable dictionary.
        """
        try:
            with open(self.output_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            self.logger_object.log(f'Error has occurred while writing json file. {exc}', logging.ERROR)

    def send_json_content(self, content: dict):
        """
        Send the converted book to the server through ``self.access.send_book``.

        :param content: dictionary with the converted book.
        :raises Exception: any error is logged, the status is set to error and it is re-raised.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

    def convert_from_html(self):
        """
        Convert the already existing html (``self.html_path``) to json.

        Writes the json to ``self.output_path`` and dumps the processed html for inspection.
        """
        html_soup = self.read_html()
        parser = HTMLPreprocessor(html_soup, self.logger_object)
        content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
        json_converter = JSONConverter(content, footnotes, top_level_headers,
                                       self.logger_object, self.status_wrapper)
        content_dict = json_converter.convert_to_dict()
        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)

    def test_conversion(self):
        """
        Local test run: convert ``<project root>/docx/<book_id>.docx`` to html and json.

        Nothing is downloaded from or sent to the server.
        """
        self.logger_object.log('Beginning of the test.')

        self.docx_path = pathlib.Path(os.path.join(self._project_root(), 'docx', f'{self.book_id}.docx'))
        self.logger_object.log(f'Test docx path: {self.docx_path}')

        self.convert_doc_to_html()
        self.check_output_directory()

        # The html -> json part is exactly the stand-alone html conversion.
        self.convert_from_html()
        self.logger_object.log('End of the test.')

    def conversion(self):
        """
        Full pipeline: download .docx, convert to .html, process, write and send the json.

        :raises Exception: any failure is logged, the status is set to error and it is re-raised.
        """
        try:
            self.logger_object.log('Beginning of conversion from .docx to .json.')
            self.get_docx()
            self.status_wrapper.set_processing()
            self.convert_doc_to_html()
            self.check_output_directory()

            html_soup = self.read_html()
            self.logger_object.log('Beginning of processing .html file.')

            parser = HTMLPreprocessor(html_soup, self.logger_object)
            content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)

            self.logger_object.log('Beginning of processing json output.')
            self.status_wrapper.set_generating()

            json_converter = JSONConverter(content, footnotes, top_level_headers,
                                           self.logger_object, self.status_wrapper)
            content_dict = json_converter.convert_to_dict()
            self.write_to_json(content_dict)
            self.send_json_content(content_dict)
            self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
        except Exception as exc:
            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
            self.logger_object.log_error_to_main_log(str(exc))
            self.status_wrapper.set_error()
            raise
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run: convert an already prepared html chapter straight to json,
    # skipping the download and the LibreOffice step.
    project_root = pathlib.Path(os.path.abspath(__file__)).parent.parent
    chapter_html = project_root / 'html/ch13/Ch_13_edit.html'
    chapter_json = project_root / 'json/ch13.json'

    Book(html_path=chapter_html, output_path=chapter_json).convert_from_html()
|