This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/docx_solver.py
2021-09-06 18:18:29 +03:00

158 lines
6.7 KiB
Python

import logging
import os
import pathlib
import subprocess
from subprocess import PIPE
from threading import Event
from bs4 import BeautifulSoup
from html_preprocessor import HTMLPreprocessor
from json_postprocessor import JSONConverter
from src.solver import BookSolver
class DocxBook(BookSolver):
    """Book solver for .docx input.

    The document is first converted to HTML by a headless LibreOffice run,
    then the HTML is cleaned by ``HTMLPreprocessor`` and turned into a nested
    dictionary by ``JSONConverter``.

    Attributes such as ``file_path``, ``logger_object``, ``status_wrapper``,
    ``access`` and ``book_id`` are used here but not defined in this class;
    presumably they are provided by ``BookSolver`` (verify against the base).
    """

    def __init__(self, book_id=0, access=None, html_path=None,
                 main_logger=None, libra_locker=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """
        :param book_id: identifier of the book; also used for output naming.
        :param access: forwarded to the base class and to the HTML parser.
        :param html_path: path to an already existing .html file, if any.
        :param main_logger: forwarded to the base class.
        :param libra_locker: shared ``threading.Event`` used as a
            "LibreOffice is free" flag; may be ``None`` when no coordination
            between threads is needed.
        :param logging_format: forwarded to the base class.
        """
        super().__init__(book_id, access, main_logger, logging_format)
        self.book_type = 'docx'
        # Path to the html file; the file appears after the libre-conversion.
        self.html_path = html_path
        # Flag guarding LibreOffice: set == free, cleared == busy (may be None).
        self.libra_locker: Event = libra_locker

    def _libra_run(self, out_dir_path):
        """Run headless LibreOffice to convert ``self.file_path`` to HTML.

        The return code, stdout and stderr of the process are only logged at
        DEBUG level; a non-zero return code does not raise here.

        :param out_dir_path: directory passed to LibreOffice as ``--outdir``.
        """
        command = ['libreoffice', '--headless',
                   '--convert-to', 'html', str(self.file_path),
                   '--outdir', str(out_dir_path)]
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(
            f'Result of libra conversion for book_{self.book_id}: '
            f'{result.returncode}, {result.stdout}', logging.DEBUG)
        self.logger_object.log(
            f'Any error while libra conversion for book_{self.book_id}: '
            f'{result.stderr}', logging.DEBUG)

    def _libra_run_exclusively(self, out_dir_path):
        """Run the conversion while holding the ``libra_locker`` flag.

        Waits (in 50-second slices) until the flag is set, clears it to mark
        LibreOffice as busy, runs the conversion and always sets the flag
        again, even when the conversion raises.

        NOTE(review): ``is_set()`` followed by ``clear()`` is not atomic, so
        two threads can still both observe the flag as set. Closing that
        window would need a ``threading.Lock`` supplied by the caller.
        """
        if self.libra_locker is None:
            # No shared flag was supplied: nothing to coordinate with.
            self._libra_run(out_dir_path)
            return

        waited = False
        # Loop until the flag is actually observed as set, so the conversion
        # can never be skipped because the flag changed between two checks.
        while not self.libra_locker.is_set():
            self.logger_object.log('Waiting for libra...', logging.DEBUG)
            self.libra_locker.wait(50)
            waited = True

        self.libra_locker.clear()
        self.logger_object.log('Got flag!' if waited else 'Got flag...', logging.DEBUG)
        try:
            self._libra_run(out_dir_path)
        finally:
            # Give the flag back even on failure so other threads do not hang.
            self.libra_locker.set()
        if not waited:
            self.logger_object.log('Cleared flag...', logging.DEBUG)

    def convert_doc_to_html(self):
        """
        Method for convert .docx document to .html file.

        The output goes to ``<project root>/html/<book_id>/``, where the
        project root is the parent of the directory holding this module.
        On success ``self.html_path`` points at ``<book_id>.html`` there.

        :raises FileNotFoundError: if the input file or the resulting html
            file does not exist.
        """
        self.logger_object.log(f'File - {self.file_path}.')
        # NOTE(review): looks like leftover debug output; kept for compatibility.
        print(f'{self.file_path}')
        self.logger_object.log('Beginning of conversion from .docx to .html.')

        # Opening the file is only an existence check for the input.
        try:
            with open(self.file_path):
                pass
        except FileNotFoundError:
            self.logger_object.log('Invalid path to input data.', logging.ERROR)
            self.status_wrapper.set_error()
            raise

        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._libra_run_exclusively(out_dir_path)
        except Exception:
            self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        # Assumes LibreOffice names the output '<book_id>.html', i.e. that the
        # input file stem equals the book id - TODO confirm against the caller.
        self.html_path = pathlib.Path(os.path.join(out_dir_path, f'{self.book_id}.html'))

        # Opening the file is only an existence check for the conversion result.
        try:
            with open(self.html_path):
                pass
        except FileNotFoundError:
            self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        self.logger_object.log('End of conversion from .docx to .html.')
        self.logger_object.log(f'Input file path after conversion: {self.html_path}.')

    def read_html(self):
        """
        Method for reading .html file into beautiful soup tag.

        :return: ``BeautifulSoup`` object parsed with the ``lxml`` parser.
        :raises FileNotFoundError: if ``self.html_path`` does not exist.
        """
        try:
            # The context manager closes the handle as soon as the text is read.
            with open(self.html_path, 'r', encoding='utf8') as html_file:
                html_text = html_file.read()
        except FileNotFoundError:
            self.logger_object.log('There is no html to process. '
                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise
        self.logger_object.log('HTML for book has been loaded.')

        html_soup = BeautifulSoup(html_text, features='lxml')
        return html_soup

    def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
        """Write the prettified ``body_tag`` to a file for manual inspection.

        :param body_tag: object with a ``prettify()`` method (a soup tag).
        :param file_name: target path relative to the project root.
        """
        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        file_path = pathlib.Path(os.path.join(folder_path, file_name))
        with open(file_path, 'w', encoding='utf-8') as f_out:
            f_out.write(body_tag.prettify())
        self.logger_object.log(f'Check final prettified html: {file_name}.')

    def convert_from_html(self):
        """Process an already existing html file at ``self.html_path``.

        Skips the LibreOffice step: reads the html, runs the preprocessor and
        the JSON converter, writes the result with ``write_to_json`` and dumps
        the processed body via ``write_html_from_list``.
        """
        html_soup = self.read_html()
        parser = HTMLPreprocessor(html_soup, self.logger_object)
        content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
        json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
        content_dict = json_converter.convert_to_dict()
        self.write_to_json(content_dict)
        self.write_html_from_list(parser.body_tag)

    def get_converted_book(self):
        """
        1. Convert docx to html with libra office
        2. Parse and clean html, get list of tags, get footnotes
        3. Parse from line structure to nested structure with JSONConverter

        :return: the dictionary produced by ``JSONConverter.convert_to_dict``.
        """
        self.convert_doc_to_html()
        self.check_output_directory()

        html_soup = self.read_html()
        self.logger_object.log('Beginning of processing .html file.')
        parser = HTMLPreprocessor(html_soup, self.logger_object)
        bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)

        self.logger_object.log('Beginning of processing json output.')
        self.status_wrapper.set_generating()
        json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
        content_dict = json_converter.convert_to_dict()
        return content_dict
if __name__ == "__main__":
    # Manual run: process a pre-converted html chapter without LibreOffice.
    # The project root is the parent of the directory containing this module.
    project_root = pathlib.Path(os.path.abspath(__file__)).parents[1]
    chapter_html = project_root / 'html/ch13/Ch_13_edit.html'
    # Computed as in the original script; not used by the run below.
    out_path = project_root / 'json/ch13.json'

    DocxBook(html_path=chapter_html).convert_from_html()