diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py
new file mode 100644
index 0000000..889aa25
--- /dev/null
+++ b/src/docx_converter/docx2libre_html.py
@@ -0,0 +1,153 @@
+import os
+import logging
+import pathlib
+import subprocess
+from subprocess import PIPE
+from threading import Event
+from bs4 import BeautifulSoup
+
+from src.util.helpers import BookLogger
+
+
+class Docx2LibreHTML:
+    """Converts a .docx file to .html with headless LibreOffice and loads the result.
+
+    The whole pipeline runs inside ``__init__``: after construction ``html_path``
+    holds the path of the converted file and ``html_soup`` the parsed document.
+    """
+
+    def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None):
+        self.book_id = book_id
+        self.file_path = file_path
+        self.access = access
+        self.logger_object: BookLogger = logger
+        # object whose set_error() is called when a step of the conversion fails
+        self.status_wrapper = status_wrapper
+        # critical section for occupying libreoffice by one thread
+        self.libre_locker: Event = libre_locker
+
+        # path to html file, file appears after libre-conversion
+        self.html_path = self.convert_docx_to_html()
+        self.html_soup = self.read_html(self.html_path)
+
+    def _libre_run(self, out_dir_path):
+        """Runs headless LibreOffice converting self.file_path to html inside out_dir_path.
+
+        Return code, stdout and stderr of the process are only logged (DEBUG level),
+        the return code itself is not checked here.
+        """
+        command = ['libreoffice', '--headless',
+                   '--convert-to', 'html', str(self.file_path),
+                   '--outdir', str(out_dir_path)]
+        print(command)
+        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
+        self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:'
+                               f' {result.returncode}, {result.stdout}', logging.DEBUG)
+        self.logger_object.log(f'Any error while libre conversion for book_'
+                               f'{self.book_id}: {result.stderr}', logging.DEBUG)
+
+    def _run_libre_exclusively(self, out_dir_path):
+        """Waits until libre_locker is set, then runs LibreOffice with the flag cleared."""
+        while not self.libre_locker.is_set():
+            self.logger_object.log(
+                'Waiting for libre...', logging.DEBUG)
+            # returns as soon as the flag is set, or after 50 seconds to re-check
+            self.libre_locker.wait(50)
+
+        # NOTE(review): an Event is not a mutex - the check above and clear() below
+        # are not one atomic step, so two threads may still pass together.
+        self.libre_locker.clear()
+        self.logger_object.log('Got flag...', logging.DEBUG)
+        try:
+            self._libre_run(out_dir_path)
+        finally:
+            # give the flag back even if the run failed, otherwise every
+            # other thread would wait for libreoffice forever
+            self.libre_locker.set()
+        self.logger_object.log('Cleared flag...', logging.DEBUG)
+
+    def convert_docx_to_html(self):
+        """
+        Function converts .docx document to .html file.
+        Steps
+        ----------
+        1. Checks that the input .docx file can be opened
+        2. Creates output directory ../html/<book_id>
+        3. Converts .docx to .html with LibreOffice, one thread at a time
+        4. Checks that the converted .html file exists
+
+        Returns
+        ----------
+        html_path: pathlib.Path
+            path to html file, file appears after libre-conversion
+
+        Raises
+        ----------
+        FileNotFoundError
+            input file or converted .html file doesn't exist
+        Exception
+            any error raised while running LibreOffice, re-raised after logging
+
+        """
+        self.logger_object.log(f'File - {self.file_path}.')
+        print(f'{self.file_path}')
+        self.logger_object.log('Beginning of conversion from .docx to .html.')
+
+        try:
+            with open(self.file_path):
+                pass
+        except FileNotFoundError as error:
+            self.logger_object.log(
+                'Invalid path to input data.', logging.ERROR)
+            self.status_wrapper.set_error()
+            raise error
+
+        folder_path = os.path.dirname(
+            os.path.dirname(os.path.abspath(__file__)))
+        out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
+        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
+
+        try:
+            self._run_libre_exclusively(out_dir_path)
+        except Exception as exc:
+            self.logger_object.log(
+                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise exc
+
+        # the converted file is expected to be named after the book id
+        html_path = pathlib.Path(
+            os.path.join(out_dir_path, f'{self.book_id}.html'))
+
+        try:
+            with open(html_path):
+                pass
+        except FileNotFoundError as exc:
+            self.logger_object.log(
+                "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise exc
+
+        self.logger_object.log('End of conversion from .docx to .html.')
+        self.logger_object.log(
+            f'Input file path after conversion: {html_path}.')
+        return html_path
+
+    def read_html(self, html_path):
+        """Method for reading .html file into beautiful soup tag."""
+        try:
+            # context manager closes the file handle right after reading
+            with open(html_path, 'r', encoding='utf8') as html_file:
+                html_text = html_file.read()
+            self.logger_object.log('HTML for book has been loaded.')
+        except FileNotFoundError as exc:
+            self.logger_object.log('There is no html to process.'
+                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise exc
+
+        html_soup = BeautifulSoup(html_text, features='lxml')
+        return html_soup
diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py
index 680a059..b4aa9b3 100644
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -1,154 +1,71 @@
-import os
-import logging
-import pathlib
-import subprocess
-from subprocess import PIPE
+import json
+import codecs
from threading import Event
-from bs4 import BeautifulSoup
-from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
-from src.docx_converter.libra_html2json_converter import LibraHTML2JSONConverter
from src.book_solver import BookSolver
+from src.util.helpers import BookLogger
+from src.docx_converter.docx2libre_html import Docx2LibreHTML
+from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
+from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter
class DocxBook(BookSolver):
"""Class of .docx type book - child of BookSolver"""
- def __init__(self, book_id=0, access=None, html_path=None,
- main_logger=None, libra_locker=None):
+    def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
super().__init__(book_id, access, main_logger)
self.book_type = 'docx'
- self.html_path = html_path # path to html file, file appears after libre-conversion
- self.libra_locker: Event() = libra_locker # critical section for occupying libreoffice by one thread
-
- def _libra_run(self, out_dir_path):
- command = ['libreoffice', '--headless',
- '--convert-to', 'html', f'{str(self.file_path)}',
- '--outdir', f'{out_dir_path}']
- print(command)
- result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
- self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
- self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)
-
- def convert_doc_to_html(self):
- """Method for convert .docx document to .html file."""
- self.logger_object.log(f'File - {self.file_path}.')
- print(f'{self.file_path}')
- self.logger_object.log('Beginning of conversion from .docx to .html.')
-
- try:
- f = open(self.file_path)
- f.close()
- except FileNotFoundError as error:
- self.logger_object.log('Invalid path to input data.', logging.ERROR)
- self.status_wrapper.set_error()
- raise error
-
- folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
- pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
-
- is_book_converted = False
- try:
- if self.libra_locker.isSet():
- self.libra_locker.clear()
- self.logger_object.log('Got flag...', logging.DEBUG)
- self._libra_run(out_dir_path)
- self.libra_locker.set()
- self.logger_object.log('Cleared flag...', logging.DEBUG)
-
- else:
- while not self.libra_locker.isSet() and not is_book_converted:
- self.logger_object.log('Waiting for libra...', logging.DEBUG)
- flag = self.libra_locker.wait(50)
- if flag:
- if self.libra_locker.isSet():
- self.libra_locker.clear()
- self.logger_object.log(f'Got flag!', logging.DEBUG)
- self._libra_run(out_dir_path)
- self.libra_locker.set()
- break
-
- except Exception as exc:
- self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
- self.logger_object.log_error_to_main_log()
- self.status_wrapper.set_error()
- raise exc
-
- out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
- self.html_path = pathlib.Path(out_dir_path)
-
- try:
- f = open(self.html_path)
- f.close()
- except FileNotFoundError as exc:
- self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
- self.logger_object.log_error_to_main_log()
- self.status_wrapper.set_error()
- raise exc
-
- self.logger_object.log('End of conversion from .docx to .html.')
- self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
-
- def read_html(self):
- """Method for reading .html file into beautiful soup tag."""
- try:
- html_text = open(self.html_path, 'r', encoding='utf8').read()
- self.logger_object.log('HTML for book has been loaded.')
- except FileNotFoundError as exc:
- self.logger_object.log('There is no html to process.'
- 'Conversion went wrong or you specified wrong paths.', logging.ERROR)
- self.logger_object.log_error_to_main_log()
- self.status_wrapper.set_error()
- raise exc
-
- html_soup = BeautifulSoup(html_text, features='lxml')
- return html_soup
-
- def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
- folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- file_path = pathlib.Path(os.path.join(folder_path, file_name))
-
- with open(file_path, 'w', encoding='utf-8') as f_out:
- f_out.write(body_tag.prettify())
- self.logger_object.log(f'Check final prettified html: {file_name}.')
-
- def convert_from_html(self):
- html_soup = self.read_html()
- parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
- content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
- json_converter = LibraHTML2JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
- content_dict = json_converter.convert_to_dict()
- self.write_to_json(content_dict)
- self.write_html_from_list(parser.body_tag)
+        # critical section for occupying libreoffice by one thread
+        self.libre_locker: Event = libre_locker
def get_converted_book(self):
"""
- 1. Convert docx to html with libra office
- 2. Parse and clean html, get list of tags, get footnotes
- 3. Parse from line structure to nested structure with JSONConverter
+        Function converts .docx book to json for LiveCarta platform.
+        Steps
+        ----------
+        1. Converts docx to html with LibreOffice
+        2. Parses and cleans html, gets list of tags, gets footnotes
+        3. Parses from line structure to nested structure with JSONConverter
+
+        Returns
+        ----------
+        content_dict
+            json for LiveCarta platform
+
"""
- self.convert_doc_to_html()
- self.check_output_directory()
+        # 1. Converts docx to html with LibreOffice
+        html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
+                                        self.logger_object, self.status_wrapper, self.libre_locker)
+        # TODO presets
- html_soup = self.read_html()
- self.logger_object.log('Beginning of processing .html file.')
+        # 2. Parses and cleans html, gets list of tags, gets footnotes
+        parser = HTMLDocxPreprocessor(
+            html_converter.html_soup, self.logger_object)
+        bs_tags, footnotes, top_level_headers = parser.process_html(
+            self.access, html_converter.html_path, self.book_id)
- parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
- bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
-
- self.logger_object.log('Beginning of processing json output.')
- self.status_wrapper.set_generating()
-
- json_converter = LibraHTML2JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
+        # 3. Parses from line structure to nested structure with JSONConverter
+        json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
+                                                 self.logger_object, self.status_wrapper)
content_dict = json_converter.convert_to_dict()
+
return content_dict
if __name__ == "__main__":
- folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html'))
- out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json'))
+    # manual run: converts one local .docx file and dumps the result as json
+    docx_file_path = '../../docx/music_inquiry.docx'
+    docx_file_name = docx_file_path.split('/')[-1]
+    logger_object = BookLogger(name='docx', book_id=docx_file_name)
- book = DocxBook(html_path=file)
- book.convert_from_html()
\ No newline at end of file
+    # Docx2LibreHTML looks for '<book_id>.html' in the output directory, so the
+    # book id has to be the file name without extension
+    book_id = docx_file_name.rsplit('.', 1)[0]
+    # the converter starts LibreOffice only once this flag is set
+    libre_locker = Event()
+    libre_locker.set()
+
+    html_converter = Docx2LibreHTML(book_id=book_id, file_path=docx_file_path,
+                                    logger=logger_object, libre_locker=libre_locker)
+
+    parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
+    # NOTE(review): process_html is defined elsewhere; arguments follow the order
+    # used in DocxBook.get_converted_book (access, html path, book id) - confirm
+    content, footnotes, top_level_headers = parser.process_html(
+        None, html_converter.html_path, book_id)
+
+    json_converter = LibreHTML2JSONConverter(
+        content, footnotes, top_level_headers, logger_object)
+    content_dict = json_converter.convert_to_dict()
+
+    with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
+        json.dump(content_dict, f, ensure_ascii=False)