epub converter: Book, EpubBook refactoring

This commit is contained in:
shirshasa
2021-09-03 22:35:34 +03:00
parent c12be5b482
commit 92fe2bc019
4 changed files with 86 additions and 143 deletions

View File

@@ -10,8 +10,8 @@ from threading import Event
import pika
from access import Access
from docx_converter import DocxBook
from epub_converter import EpubBook
from docx_solver import DocxBook
from epub_solver import EpubBook
def configure_file_logger(name, filename='logs/converter_log.log', filemode='w+',
@@ -52,7 +52,9 @@ def callback(ch, method, properties, body, logger, libra_locker):
assert 'apiURL' in data, 'No apiURL field in received message.'
assert data.get('fileExtension') in ['epub', 'docx'], 'Wrong book type received.'
book_params = {'access': Access(url=data['apiURL'])}
book_params = {
'access': Access(url=data['apiURL']),
}
if data.get('fileExtension') == 'docx':
book_params.update({'libra_locker': libra_locker})

View File

@@ -1,5 +1,3 @@
import codecs
import json
import logging
import os
import pathlib
@@ -8,73 +6,23 @@ from subprocess import PIPE
from threading import Event
from bs4 import BeautifulSoup
from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
from html_preprocessor import HTMLPreprocessor
from json_postprocessor import JSONConverter
from src.solver import BookSolver
class DocxBook:
class DocxBook(BookSolver):
def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None,
main_logger=None, libra_locker=None,
logging_format='%(asctime)s - %(levelname)s - %(message)s'):
self.book_id = book_id
self.access = access
self.docx_path = docx_path # path to docx file, appears after downloading from server
def __init__(self, book_id=0, access=None, html_path=None,
main_logger=None, libra_locker=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'):
super().__init__(book_id, access, main_logger, logging_format)
self.book_type = 'docx'
self.html_path = html_path # path to html file, file appears after libre-conversion
self.output_path = output_path # path to json file
self.libra_locker: Event() = libra_locker
self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
logging_format=logging_format,
book_id=book_id,
main_logger=main_logger)
self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels."
def save_docx(self, content):
    """
    Store the binary content of a book as a .docx file.

    The file is written to ``docx/<book_id>/<book_id>.docx`` inside the
    parent of this module's directory; the folder is created when missing.
    After a successful write ``self.docx_path`` points to the saved file.

    :param content: binary content of the file.
    :raises Exception: any error raised while writing is logged to the
        book log and the main log, then re-raised.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    target_dir = os.path.join(base_dir, f'docx/{self.book_id}')
    pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
    target_file = os.path.join(target_dir, f'{self.book_id}.docx')

    try:
        with open(target_file, 'wb+') as out:
            out.write(content)
        self.logger_object.log(f'File was saved to folder: {target_dir}.')
    except Exception as exc:
        self.logger_object.log("Error in writing docx file.", logging.ERROR)
        self.logger_object.log_error_to_main_log()
        raise exc

    # Remember where the book lives so later steps can convert it.
    self.docx_path = pathlib.Path(target_file)
def get_docx(self):
    """
    Download the book file from the server and save it as .docx.

    The content is requested with ``self.access.get_doc(self.book_id)`` and
    then passed to ``self.save_docx``.

    :raises FileNotFoundError: logged to the book log and the main log,
        then re-raised.
    :raises Exception: every other error propagates to the caller unchanged.
    """
    try:
        self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
        content = self.access.get_doc(self.book_id)
        self.logger_object.log('File was received from server.')
        self.save_docx(content)
    except FileNotFoundError:
        self.logger_object.log("Can't get docx from server.", logging.ERROR)
        self.logger_object.log_error_to_main_log()
        # Bare raise re-raises the active exception with its traceback intact.
        raise
def _libra_run(self, out_dir_path):
command = ['libreoffice', '--headless',
'--convert-to', 'html', f'{str(self.docx_path)}',
'--convert-to', 'html', f'{str(self.file_path)}',
'--outdir', f'{out_dir_path}']
result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
@@ -84,12 +32,12 @@ class DocxBook:
"""
Method for convert .docx document to .html file.
"""
self.logger_object.log(f'File - {self.docx_path}.')
print(f'{self.docx_path}')
self.logger_object.log(f'File - {self.file_path}.')
print(f'{self.file_path}')
self.logger_object.log('Beginning of conversion from .docx to .html.')
try:
f = open(self.docx_path)
f = open(self.file_path)
f.close()
except FileNotFoundError as error:
self.logger_object.log('Invalid path to input data.', logging.ERROR)
@@ -142,18 +90,6 @@ class DocxBook:
self.logger_object.log('End of conversion from .docx to .html.')
self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
def check_output_directory(self):
    """
    Make sure the output .json file and its parent folder exist.

    When ``self.output_path`` is ``None`` it defaults to
    ``json/<book_id>.json`` inside the parent of this module's directory.
    Afterwards ``self.output_path`` is always a ``pathlib.Path``.
    """
    if self.output_path is None:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.output_path = os.path.join(base_dir, f'json/{self.book_id}.json')

    self.output_path = pathlib.Path(self.output_path)
    self.logger_object.log(f'Output file path: {self.output_path}')

    # Create missing folders first, then an empty file if none exists yet.
    self.output_path.parent.mkdir(parents=True, exist_ok=True)
    self.output_path.touch(exist_ok=True)
def read_html(self):
"""
Method for reading .html file into beautiful soup tag.
@@ -179,24 +115,6 @@ class DocxBook:
f_out.write(body_tag.prettify())
self.logger_object.log(f'Check final prettified html: {file_name}.')
def write_to_json(self, content: dict):
    """
    Dump ``content`` as UTF-8 JSON into ``self.output_path``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    Errors are not propagated: they are logged with ERROR level and the
    method returns normally.

    :param content: dictionary to serialize.
    """
    try:
        with codecs.open(self.output_path, 'w', encoding='utf-8') as json_file:
            json.dump(content, json_file, ensure_ascii=False)
        self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
    except Exception as error:
        failure_message = 'Error has occurred while writing json file.' + str(error)
        self.logger_object.log(failure_message, logging.ERROR)
def send_json_content(self, content: dict):
    """
    Send the converted book to the server via ``self.access.send_book``.

    :param content: dictionary with the converted book.
    :raises Exception: on any failure the error is logged to the book log
        and the main log, the book status is set to error, and the
        exception is re-raised.
    """
    try:
        self.access.send_book(self.book_id, content)
        self.logger_object.log('JSON data has been sent to server.')
    except Exception as exc:
        self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
        self.logger_object.log_error_to_main_log()
        self.status_wrapper.set_error()
        raise exc
def convert_from_html(self):
html_soup = self.read_html()
parser = HTMLPreprocessor(html_soup, self.logger_object)
@@ -212,8 +130,8 @@ class DocxBook:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'docx')
file_path = os.path.join(folder_path, f'{self.book_id}.docx')
self.docx_path = pathlib.Path(file_path)
self.logger_object.log(f'Test docx path: {self.docx_path}')
self.file_path = pathlib.Path(file_path)
self.logger_object.log(f'Test docx path: {self.file_path}')
self.convert_doc_to_html()
self.check_output_directory()
@@ -229,27 +147,31 @@ class DocxBook:
self.write_html_from_list(parser.body_tag)
self.logger_object.log('End of the test.')
def get_converted_book(self):
    """
    Convert the downloaded .docx book into a dictionary.

    Steps, in order: convert .docx to .html, prepare the output path,
    read the .html, process it with ``HTMLPreprocessor``, switch the book
    status to generating and build the result with ``JSONConverter``.

    :return: the value returned by ``JSONConverter.convert_to_dict()``.
    """
    self.convert_doc_to_html()
    self.check_output_directory()

    soup = self.read_html()
    self.logger_object.log('Beginning of processing .html file.')
    preprocessor = HTMLPreprocessor(soup, self.logger_object)
    content, footnotes, top_level_headers = preprocessor.process_html(
        self.access, self.html_path, self.book_id)

    self.logger_object.log('Beginning of processing json output.')
    self.status_wrapper.set_generating()
    converter = JSONConverter(content, footnotes, top_level_headers,
                              self.logger_object, self.status_wrapper)
    return converter.convert_to_dict()
def conversion(self):
try:
self.logger_object.log('Beginning of conversion from .docx to .json.')
self.get_docx()
self.get_book_file()
self.status_wrapper.set_processing()
self.convert_doc_to_html()
self.check_output_directory()
html_soup = self.read_html()
self.logger_object.log('Beginning of processing .html file.')
parser = HTMLPreprocessor(html_soup, self.logger_object)
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
self.logger_object.log('Beginning of processing json output.')
self.status_wrapper.set_generating()
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
content_dict = json_converter.convert_to_dict()
content_dict = self.get_converted_book()
self.write_to_json(content_dict)
self.send_json_content(content_dict)
self.send_json_content_to_server(content_dict)
self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
except Exception as exc:
self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
@@ -263,5 +185,5 @@ if __name__ == "__main__":
file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html'))
out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json'))
book = DocxBook(html_path=file, output_path=out_path)
book = DocxBook(html_path=file)
book.convert_from_html()

17
src/epub_solver.py Normal file
View File

@@ -0,0 +1,17 @@
from epub_postprocessor import EpubPostprocessor
from src.solver import BookSolver
class EpubBook(BookSolver):
    """Book solver for .epub files; conversion is delegated to ``EpubPostprocessor``."""

    def __init__(self, book_id=0, access=None, main_logger=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """
        Initialise the base solver and mark the book type as ``'epub'``.

        :param book_id: identifier of the book, forwarded to ``BookSolver``.
        :param access: server access object, forwarded to ``BookSolver``.
        :param main_logger: main logger, forwarded to ``BookSolver``.
        :param logging_format: log record format, forwarded to ``BookSolver``.
        """
        super().__init__(book_id, access, main_logger, logging_format)
        self.book_type = 'epub'

    def get_converted_book(self):
        """
        Convert the .epub file at ``self.file_path`` into a dictionary.

        :return: the value returned by ``EpubPostprocessor.convert_to_dict()``.
        """
        postprocessor = EpubPostprocessor(self.file_path, access=self.access, logger=self.logger_object)
        book_dict = postprocessor.convert_to_dict()
        # The status is switched only after the dictionary has been built.
        self.status_wrapper.set_generating()
        return book_dict

View File

@@ -1,3 +1,5 @@
""" This is Interface for solving a task of a book conversion"""
import codecs
import json
import logging
@@ -5,17 +7,15 @@ import os
import pathlib
from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
from epub_postprocessor import EpubPostprocessor
class EpubBook:
class BookSolver:
def __init__(self, book_id=0, access=None,
main_logger=None,
logging_format='%(asctime)s - %(levelname)s - %(message)s'):
def __init__(self, book_id=0, access=None, main_logger=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'):
self.book_type = None
self.book_id = book_id
self.access = access
self.epub_path = None
self.file_path = None # path to book file, appears after downloading from server
self.output_path = None # path to json file
self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
logging_format=logging_format,
@@ -26,36 +26,36 @@ class EpubBook:
assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels."
def save_epub(self, content):
def save_book_file(self, content):
"""
Save binary content of file to .docx.
Save binary content of file to .docx/.epub.
:param content: binary content of the file.
"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'epub/{self.book_id}')
folder_path = os.path.join(folder_path, f'{self.book_type}/{self.book_id}')
pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
file_path = os.path.join(folder_path, f'{self.book_id}.epub')
file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
try:
with open(file_path, 'wb+') as file:
file.write(content)
self.logger_object.log(f'File was saved to folder: {folder_path}.')
except Exception as exc:
self.logger_object.log("Error in writing epub file.", logging.ERROR)
self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR)
self.logger_object.log_error_to_main_log()
raise exc
self.epub_path = pathlib.Path(file_path)
self.file_path = pathlib.Path(file_path)
def get_epub(self):
def get_book_file(self):
"""
Method for getting and saving book from queue.
Method for getting and saving book from server.
"""
try:
self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
content = self.access.get_doc(self.book_id)
self.logger_object.log('File was received from server.')
self.save_epub(content)
self.save_book_file(content)
except FileNotFoundError as f_err:
self.logger_object.log("Can't get docx from server.", logging.ERROR)
self.logger_object.log_error_to_main_log()
@@ -84,7 +84,7 @@ class EpubBook:
except Exception as exc:
self.logger_object.log('Error has occurred while writing json file.'+ str(exc), logging.ERROR)
def send_json_content(self, content: dict):
def send_json_content_to_server(self, content: dict):
try:
self.access.send_book(self.book_id, content)
self.logger_object.log(f'JSON data has been sent to server.')
@@ -94,31 +94,32 @@ class EpubBook:
self.status_wrapper.set_error()
raise exc
def get_converted_book(self):
    """
    Base conversion step: log the start of json processing, switch the
    book status to generating and return an empty result.

    :return: an empty dictionary.
    """
    self.logger_object.log('Beginning of processing json output.')
    self.status_wrapper.set_generating()
    empty_result = {}
    return empty_result
def test_conversion(self):
self.logger_object.log('Beginning of the test.')
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'epub')
file_path = os.path.join(folder_path, f'{self.book_id}.epub')
self.epub_path = pathlib.Path(file_path)
self.logger_object.log(f'Test epub path: {self.epub_path}')
json_converter = EpubPostprocessor(self.epub_path)
content_dict = json_converter.convert_to_dict()
folder_path = os.path.join(folder_path, f'{self.book_type}')
file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
self.file_path = pathlib.Path(file_path)
self.logger_object.log(f'Test epub path: {self.file_path}')
content_dict = self.get_converted_book()
self.write_to_json(content_dict)
self.logger_object.log('End of the test.')
def conversion(self):
self.logger_object.log('Beginning of conversion from .docx to .json.')
self.get_epub()
self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
self.get_book_file()
self.status_wrapper.set_processing()
self.logger_object.log('Beginning of processing json output.')
try:
json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object)
content_dict = json_converter.convert_to_dict()
self.status_wrapper.set_generating()
content_dict = self.get_converted_book()
self.write_to_json(content_dict)
self.send_json_content(content_dict)
self.send_json_content_to_server(content_dict)
self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
except Exception as exc:
@@ -126,3 +127,4 @@ class EpubBook:
self.logger_object.log_error_to_main_log(str(exc))
self.status_wrapper.set_error()
raise exc