forked from LiveCarta/BookConverter
Rewrite docx_solver.py
This commit is contained in:
129
src/docx_converter/docx2libre_html.py
Normal file
129
src/docx_converter/docx2libre_html.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import os
|
||||
import logging
|
||||
import pathlib
|
||||
import subprocess
|
||||
from subprocess import PIPE
|
||||
from threading import Event
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
|
||||
|
||||
class Docx2LibreHTML:
    """Convert a .docx file to .html with headless LibreOffice and load the result.

    Constructing an instance performs the whole job: the conversion runs
    immediately and the outcome is exposed through two attributes:

    html_path: pathlib.Path
        path to the html file produced by LibreOffice
    html_soup: BeautifulSoup
        the html file parsed with the 'lxml' parser
    """

    def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None):
        self.book_id = book_id
        self.file_path = file_path
        # stored for callers; not read anywhere inside this class
        self.access = access
        self.logger_object: BookLogger = logger
        self.status_wrapper = status_wrapper
        # critical section for occupying libreoffice by one thread:
        # the event is "set" while LibreOffice is free and "cleared" while busy
        self.libre_locker: Event = libre_locker

        # path to html file, file appears after libre-conversion
        self.html_path = self.convert_docx_to_html()
        self.html_soup = self.read_html(self.html_path)

    def _libre_run(self, out_dir_path):
        """Run LibreOffice once and log its return code, stdout and stderr.

        A non-zero return code is only logged, not raised; the caller detects
        a failed conversion by the missing output file.
        """
        command = ['libreoffice', '--headless',
                   '--convert-to', 'html', str(self.file_path),
                   '--outdir', str(out_dir_path)]
        self.logger_object.log(f'Libre command: {command}', logging.DEBUG)
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:'
                               f' {result.returncode}, {result.stdout}', logging.DEBUG)
        self.logger_object.log(f'Any error while libre conversion for book_'
                               f'{self.book_id}: {result.stderr}', logging.DEBUG)

    def _run_libre_exclusively(self, out_dir_path):
        """Wait until LibreOffice is free, run the conversion, then free it again.

        The flag is always set back in a ``finally`` block, so a failing
        conversion can no longer leave every other thread waiting forever.

        NOTE(review): threading.Event has no atomic test-and-clear, so two
        threads can still pass the check at the same moment. A threading.Lock
        would close that gap but changes what callers must pass in.
        """
        had_to_wait = False
        # Keep waiting until the flag is observed as set; never fall through
        # without running the conversion.
        while not self.libre_locker.is_set():
            had_to_wait = True
            self.logger_object.log('Waiting for libre...', logging.DEBUG)
            self.libre_locker.wait(50)

        self.libre_locker.clear()
        self.logger_object.log(
            'Got flag!' if had_to_wait else 'Got flag...', logging.DEBUG)
        try:
            self._libre_run(out_dir_path)
        finally:
            self.libre_locker.set()
        self.logger_object.log('Cleared flag...', logging.DEBUG)

    def _check_file_exists(self, path, error_message, report_to_main_log=False):
        """Re-raise FileNotFoundError for a missing file after logging and flagging the error."""
        try:
            with open(path):
                pass
        except FileNotFoundError:
            self.logger_object.log(error_message, logging.ERROR)
            if report_to_main_log:
                self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

    def convert_docx_to_html(self):
        """
        Function converts .docx document to .html file.
        Steps
        ----------
        1. Checks that the input .docx file exists
        2. Creates the output directory '../html/<book_id>' relative to the
           parent package directory of this module
        3. Runs LibreOffice while holding the shared libre flag
        4. Checks that '<book_id>.html' appeared in the output directory

        Returns
        ----------
        html_path: pathlib.Path
            path to html file, file appears after libre-conversion

        Raises
        ----------
        FileNotFoundError
            if the input file or the converted html file does not exist
        Exception
            anything raised while running LibreOffice is logged and re-raised

        """
        self.logger_object.log(f'File - {self.file_path}.')
        self.logger_object.log('Beginning of conversion from .docx to .html.')

        self._check_file_exists(self.file_path, 'Invalid path to input data.')

        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._run_libre_exclusively(out_dir_path)
        except Exception:
            self.logger_object.log(
                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        # presumably the input file is named '<book_id>.docx', so LibreOffice
        # writes '<book_id>.html' - TODO confirm against the caller
        html_path = pathlib.Path(
            os.path.join(out_dir_path, f'{self.book_id}.html'))

        self._check_file_exists(
            html_path, "Conversion has gone wrong. HTML file doesn't exist.",
            report_to_main_log=True)

        self.logger_object.log('End of conversion from .docx to .html.')
        self.logger_object.log(
            f'Input file path after conversion: {html_path}.')
        return html_path

    def read_html(self, html_path):
        """Method for reading .html file into beautiful soup tag.

        Raises FileNotFoundError (after logging and flagging the error) when
        html_path does not exist.
        """
        try:
            # context manager closes the handle even if reading fails
            with open(html_path, 'r', encoding='utf8') as html_file:
                html_text = html_file.read()
            self.logger_object.log('HTML for book has been loaded.')
        except FileNotFoundError:
            self.logger_object.log('There is no html to process.'
                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

        html_soup = BeautifulSoup(html_text, features='lxml')
        return html_soup
|
||||
@@ -1,154 +1,71 @@
|
||||
import os
|
||||
import logging
|
||||
import pathlib
|
||||
import subprocess
|
||||
from subprocess import PIPE
|
||||
import json
|
||||
import codecs
|
||||
from threading import Event
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
|
||||
from src.docx_converter.libra_html2json_converter import LibraHTML2JSONConverter
|
||||
from src.book_solver import BookSolver
|
||||
from src.util.helpers import BookLogger
|
||||
from src.docx_converter.docx2libre_html import Docx2LibreHTML
|
||||
from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
|
||||
from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter
|
||||
|
||||
|
||||
class DocxBook(BookSolver):
|
||||
"""Class of .docx type book - child of BookSolver"""
|
||||
|
||||
def __init__(self, book_id=0, access=None, html_path=None,
|
||||
main_logger=None, libra_locker=None):
|
||||
def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
|
||||
super().__init__(book_id, access, main_logger)
|
||||
self.book_type = 'docx'
|
||||
self.html_path = html_path # path to html file, file appears after libre-conversion
|
||||
self.libra_locker: Event() = libra_locker # critical section for occupying libreoffice by one thread
|
||||
|
||||
def _libra_run(self, out_dir_path):
|
||||
command = ['libreoffice', '--headless',
|
||||
'--convert-to', 'html', f'{str(self.file_path)}',
|
||||
'--outdir', f'{out_dir_path}']
|
||||
print(command)
|
||||
result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
|
||||
self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
|
||||
self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)
|
||||
|
||||
def convert_doc_to_html(self):
|
||||
"""Method for convert .docx document to .html file."""
|
||||
self.logger_object.log(f'File - {self.file_path}.')
|
||||
print(f'{self.file_path}')
|
||||
self.logger_object.log('Beginning of conversion from .docx to .html.')
|
||||
|
||||
try:
|
||||
f = open(self.file_path)
|
||||
f.close()
|
||||
except FileNotFoundError as error:
|
||||
self.logger_object.log('Invalid path to input data.', logging.ERROR)
|
||||
self.status_wrapper.set_error()
|
||||
raise error
|
||||
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
|
||||
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
is_book_converted = False
|
||||
try:
|
||||
if self.libra_locker.isSet():
|
||||
self.libra_locker.clear()
|
||||
self.logger_object.log('Got flag...', logging.DEBUG)
|
||||
self._libra_run(out_dir_path)
|
||||
self.libra_locker.set()
|
||||
self.logger_object.log('Cleared flag...', logging.DEBUG)
|
||||
|
||||
else:
|
||||
while not self.libra_locker.isSet() and not is_book_converted:
|
||||
self.logger_object.log('Waiting for libra...', logging.DEBUG)
|
||||
flag = self.libra_locker.wait(50)
|
||||
if flag:
|
||||
if self.libra_locker.isSet():
|
||||
self.libra_locker.clear()
|
||||
self.logger_object.log(f'Got flag!', logging.DEBUG)
|
||||
self._libra_run(out_dir_path)
|
||||
self.libra_locker.set()
|
||||
break
|
||||
|
||||
except Exception as exc:
|
||||
self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
|
||||
self.logger_object.log_error_to_main_log()
|
||||
self.status_wrapper.set_error()
|
||||
raise exc
|
||||
|
||||
out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
|
||||
self.html_path = pathlib.Path(out_dir_path)
|
||||
|
||||
try:
|
||||
f = open(self.html_path)
|
||||
f.close()
|
||||
except FileNotFoundError as exc:
|
||||
self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
|
||||
self.logger_object.log_error_to_main_log()
|
||||
self.status_wrapper.set_error()
|
||||
raise exc
|
||||
|
||||
self.logger_object.log('End of conversion from .docx to .html.')
|
||||
self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
|
||||
|
||||
def read_html(self):
|
||||
"""Method for reading .html file into beautiful soup tag."""
|
||||
try:
|
||||
html_text = open(self.html_path, 'r', encoding='utf8').read()
|
||||
self.logger_object.log('HTML for book has been loaded.')
|
||||
except FileNotFoundError as exc:
|
||||
self.logger_object.log('There is no html to process.'
|
||||
'Conversion went wrong or you specified wrong paths.', logging.ERROR)
|
||||
self.logger_object.log_error_to_main_log()
|
||||
self.status_wrapper.set_error()
|
||||
raise exc
|
||||
|
||||
html_soup = BeautifulSoup(html_text, features='lxml')
|
||||
return html_soup
|
||||
|
||||
def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
file_path = pathlib.Path(os.path.join(folder_path, file_name))
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f_out:
|
||||
f_out.write(body_tag.prettify())
|
||||
self.logger_object.log(f'Check final prettified html: {file_name}.')
|
||||
|
||||
def convert_from_html(self):
|
||||
html_soup = self.read_html()
|
||||
parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
|
||||
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||
json_converter = LibraHTML2JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
self.write_to_json(content_dict)
|
||||
self.write_html_from_list(parser.body_tag)
|
||||
# critical section for occupying libreoffice by one thread
|
||||
self.libre_locker: Event() = libre_locker
|
||||
|
||||
def get_converted_book(self):
|
||||
"""
|
||||
1. Convert docx to html with libra office
|
||||
2. Parse and clean html, get list of tags, get footnotes
|
||||
3. Parse from line structure to nested structure with JSONConverter
|
||||
Function
|
||||
Steps
|
||||
----------
|
||||
1. Converts docx to html with LibreOffice
|
||||
2. Parses and cleans html, gets list of tags, gets footnotes
|
||||
3. Parses from line structure to nested structure with JSONConverter
|
||||
|
||||
Returns
|
||||
----------
|
||||
content_dict
|
||||
json for LiveCarta platform
|
||||
|
||||
"""
|
||||
self.convert_doc_to_html()
|
||||
self.check_output_directory()
|
||||
# 1. Converts docx to html with LibreOffice
|
||||
html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
|
||||
self.logger_object, self.status_wrapper, self.libre_locker)
|
||||
# TODO presets
|
||||
|
||||
html_soup = self.read_html()
|
||||
self.logger_object.log('Beginning of processing .html file.')
|
||||
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
||||
parser = HTMLDocxPreprocessor(
|
||||
html_converter.html_soup, self.logger_object)
|
||||
bs_tags, footnotes, top_level_headers = parser.process_html(
|
||||
self.access, html_converter.html_path, self.book_id)
|
||||
|
||||
parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
|
||||
bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||
|
||||
self.logger_object.log('Beginning of processing json output.')
|
||||
self.status_wrapper.set_generating()
|
||||
|
||||
json_converter = LibraHTML2JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||
# 3. Parses from line structure to nested structure with JSONConverter
|
||||
json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
|
||||
self.logger_object, self.status_wrapper)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
return content_dict
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html'))
|
||||
out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json'))
|
||||
docx_file_path = '../../docx/music_inquiry.docx'
|
||||
logger_object = BookLogger(
|
||||
name='docx', book_id=docx_file_path.split('/')[-1])
|
||||
|
||||
book = DocxBook(html_path=file)
|
||||
book.convert_from_html()
|
||||
html_converter = Docx2LibreHTML(file_path=docx_file_path)
|
||||
|
||||
parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
|
||||
content, footnotes, top_level_headers = parser.process_html(
|
||||
html_converter.html_path)
|
||||
|
||||
json_converter = LibreHTML2JSONConverter(
|
||||
content, footnotes, top_level_headers, logger_object)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
|
||||
json.dump(content_dict, f, ensure_ascii=False)
|
||||
|
||||
Reference in New Issue
Block a user