This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/docx_converter.py
2021-08-17 19:10:55 +03:00

268 lines
12 KiB
Python

import codecs
import json
import logging
import os
import pathlib
import subprocess
from subprocess import PIPE
from threading import Event
from bs4 import BeautifulSoup
from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
from html_preprocessor import HTMLPreprocessor
from json_postprocessor import JSONConverter
class Book:
    """
    Converter of a single .docx book into the LawCarta JSON format.

    Steps performed by :meth:`conversion`: download the .docx through ``access``,
    convert it to .html with LibreOffice, preprocess the HTML, build the JSON
    dictionary, save it to ``output_path`` and send it back through ``access``.
    """

    def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None,
                 main_logger=None, libra_locker=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """
        :param book_id: identifier of the book; used in file names and log names.
        :param access: server access object; this class calls ``url``, ``get_doc`` and ``send_book`` on it.
        :param docx_path: path to docx file, appears after downloading from server.
        :param html_path: path to html file, file appears after libre-conversion.
        :param output_path: path to json file.
        :param main_logger: passed through to BookLogger.
        :param libra_locker: threading.Event shared between books; "set" means LibreOffice is free.
                             May be None, then the conversion runs without any locking.
        :param logging_format: format string passed through to BookLogger.
        """
        self.book_id = book_id
        self.access = access
        self.docx_path = docx_path  # path to docx file, appears after downloading from server
        self.html_path = html_path  # path to html file, file appears after libre-conversion
        self.output_path = output_path  # path to json file
        self.libra_locker: Event = libra_locker

        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)

        # Sanity check of the project configuration (not of user input).
        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    @staticmethod
    def _project_root():
        """Return the directory one level above the directory of this source file."""
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def save_docx(self, content):
        """
        Save binary content of file to .docx.

        The file is written to ``<project root>/docx/<book_id>/<book_id>.docx``
        and ``self.docx_path`` is updated to point to it.

        :param content: binary content of the file.
        :raises Exception: any error raised while writing is logged and re-raised.
        """
        folder_path = os.path.join(self._project_root(), f'docx/{self.book_id}')
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        file_path = os.path.join(folder_path, f'{self.book_id}.docx')
        try:
            with open(file_path, 'wb') as file:
                file.write(content)
            self.logger_object.log(f'File was saved to folder: {folder_path}.')
        except Exception:
            self.logger_object.log("Error in writing docx file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        self.docx_path = pathlib.Path(file_path)

    def get_docx(self):
        """
        Method for getting and saving book from queue.

        :raises FileNotFoundError: logged and re-raised when the file can't be received.
        """
        try:
            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_docx(content)
        except FileNotFoundError:
            self.logger_object.log("Can't get docx from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise
        # Any other exception propagates to the caller unchanged.

    def _libra_run(self, out_dir_path):
        """
        Run headless LibreOffice to convert ``self.docx_path`` to html.

        The return code, stdout and stderr of the process are only logged
        (DEBUG level); a failed conversion is detected later by the caller
        through the missing output file.

        :param out_dir_path: directory passed to LibreOffice as ``--outdir``.
        """
        command = ['libreoffice', '--headless',
                   '--convert-to', 'html', f'{str(self.docx_path)}',
                   '--outdir', f'{out_dir_path}']
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
        self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)

    def _run_libra_exclusively(self, out_dir_path):
        """
        Run the LibreOffice conversion while holding the shared ``libra_locker`` flag.

        The Event is used as a "LibreOffice is free" flag: it is cleared before
        the conversion starts and is always set again afterwards, even when the
        conversion raises, so other waiting books are never blocked forever.

        :param out_dir_path: directory passed to LibreOffice as ``--outdir``.
        """
        locker = self.libra_locker
        if locker is None:
            # No shared flag was supplied (e.g. standalone run): nothing to coordinate with.
            self._libra_run(out_dir_path)
            return

        while True:
            # NOTE(review): is_set() followed by clear() is not atomic, so two threads can
            # still pass this check together; a real Lock would be needed to rule that out.
            if locker.is_set():
                locker.clear()
                self.logger_object.log('Got flag...', logging.DEBUG)
                try:
                    self._libra_run(out_dir_path)
                finally:
                    locker.set()
                    self.logger_object.log('Cleared flag...', logging.DEBUG)
                return

            self.logger_object.log('Waiting for libra...', logging.DEBUG)
            # Wake up at least every 50 seconds to re-check the flag.
            locker.wait(50)

    def convert_doc_to_html(self):
        """
        Method for convert .docx document to .html file.

        On success ``self.html_path`` points to
        ``<project root>/html/<book_id>/<book_id>.html``.

        :raises FileNotFoundError: when the .docx is missing or the .html was not produced.
        :raises Exception: any error raised while running LibreOffice is logged and re-raised.
        """
        self.logger_object.log(f'File - {self.docx_path}.')
        self.logger_object.log('Beginning of conversion from .docx to .html.')

        # Probe the input file by opening it, so a missing file fails early.
        try:
            with open(self.docx_path):
                pass
        except FileNotFoundError:
            self.logger_object.log('Invalid path to input data.', logging.ERROR)
            self.status_wrapper.set_error()
            raise

        out_dir_path = os.path.join(self._project_root(), f'html/{self.book_id}')
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._run_libra_exclusively(out_dir_path)
        except Exception:
            self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.html_path = pathlib.Path(os.path.join(out_dir_path, f'{self.book_id}.html'))

        # LibreOffice errors are only logged, so verify that the output really exists.
        try:
            with open(self.html_path):
                pass
        except FileNotFoundError:
            self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.logger_object.log('End of conversion from .docx to .html.')
        self.logger_object.log(f'Input file path after conversion: {self.html_path}.')

    def check_output_directory(self):
        """
        Make sure the output .json file and its parent directory exist.

        When ``self.output_path`` is None it defaults to
        ``<project root>/json/<book_id>.json``. Afterwards ``self.output_path``
        is always a ``pathlib.Path``.
        """
        if self.output_path is None:
            self.output_path = os.path.join(self._project_root(), f'json/{self.book_id}.json')

        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def read_html(self):
        """
        Method for reading .html file into beautiful soup tag.

        :return: BeautifulSoup object built from ``self.html_path`` with the lxml parser.
        :raises FileNotFoundError: logged and re-raised when the .html file is missing.
        """
        try:
            with open(self.html_path, 'r', encoding='utf8') as html_file:
                html_text = html_file.read()
            self.logger_object.log('HTML for book has been loaded.')
        except FileNotFoundError:
            self.logger_object.log('There is no html to process. '
                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        return BeautifulSoup(html_text, features='lxml')

    def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
        """
        Write prettified html of ``body_tag`` to a file, for manual inspection.

        :param body_tag: object with a ``prettify()`` method returning a string.
        :param file_name: output path relative to the project root.
        """
        file_path = pathlib.Path(os.path.join(self._project_root(), file_name))
        with open(file_path, 'w', encoding='utf-8') as f_out:
            f_out.write(body_tag.prettify())
        self.logger_object.log(f'Check final prettified html: {file_name}.')

    def write_to_json(self, content: dict):
        """
        Save ``content`` as UTF-8 JSON to ``self.output_path``.

        Errors are logged and NOT re-raised, so a failed local save doesn't
        stop the caller from going on (e.g. sending the content to the server).

        :param content: dictionary to serialize.
        """
        try:
            with open(self.output_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            self.logger_object.log('Error has occurred while writing json file. ' + str(exc), logging.ERROR)

    def send_json_content(self, content: dict):
        """
        Send the converted book to the server through ``access.send_book``.

        :param content: dictionary with the converted book.
        :raises Exception: any error raised while sending is logged and re-raised.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

    def _preprocess_html(self, html_soup):
        """Run HTMLPreprocessor on the soup; return (parser, content, footnotes, top_level_headers)."""
        parser = HTMLPreprocessor(html_soup, self.logger_object)
        content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
        return parser, content, footnotes, top_level_headers

    def _build_content_dict(self, content, footnotes, top_level_headers):
        """Run JSONConverter on the preprocessed parts and return the resulting dictionary."""
        json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
        return json_converter.convert_to_dict()

    def convert_from_html(self):
        """
        Convert an already existing .html file (``self.html_path``) to .json.

        Also writes the prettified processed html for manual inspection.
        """
        parser, content, footnotes, top_level_headers = self._preprocess_html(self.read_html())
        content_dict = self._build_content_dict(content, footnotes, top_level_headers)

        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)

    def test_conversion(self):
        """
        Local conversion without the server: .docx -> .html -> .json.

        The input is expected at ``<project root>/docx/<book_id>.docx``.
        """
        self.logger_object.log('Beginning of the test.')

        folder_path = os.path.join(self._project_root(), 'docx')
        self.docx_path = pathlib.Path(os.path.join(folder_path, f'{self.book_id}.docx'))
        self.logger_object.log(f'Test docx path: {self.docx_path}')

        self.convert_doc_to_html()
        self.check_output_directory()

        parser, content, footnotes, top_level_headers = self._preprocess_html(self.read_html())
        content_dict = self._build_content_dict(content, footnotes, top_level_headers)

        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)

        self.logger_object.log('End of the test.')

    def conversion(self):
        """
        Full conversion of the book: download, convert, save and send the result.

        :raises Exception: any error is logged, the status is set to error and the
                           exception is re-raised.
        """
        try:
            self.logger_object.log('Beginning of conversion from .docx to .json.')
            self.get_docx()
            self.status_wrapper.set_processing()
            self.convert_doc_to_html()
            self.check_output_directory()

            html_soup = self.read_html()
            self.logger_object.log('Beginning of processing .html file.')
            _, content, footnotes, top_level_headers = self._preprocess_html(html_soup)

            self.logger_object.log('Beginning of processing json output.')
            self.status_wrapper.set_generating()
            content_dict = self._build_content_dict(content, footnotes, top_level_headers)

            self.write_to_json(content_dict)
            self.send_json_content(content_dict)
            self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
        except Exception as exc:
            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
            self.logger_object.log_error_to_main_log(str(exc))
            self.status_wrapper.set_error()
            raise
if __name__ == "__main__":
    # Manual run: convert an already prepared chapter html (no download,
    # no LibreOffice step) into json next to the project sources.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    chapter_html = pathlib.Path(project_root, 'html/ch13/Ch_13_edit.html')
    chapter_json = pathlib.Path(project_root, 'json/ch13.json')

    Book(html_path=chapter_html, output_path=chapter_json).convert_from_html()