epub converter: Book, EpubBook refactoring

2021-09-03 22:35:34 +03:00
parent c12be5b482
commit 92fe2bc019
4 changed files with 86 additions and 143 deletions
--- a/src/docx_solver.py
+++ b/src/docx_solver.py
@@ -0,0 +1,189 @@
+import logging
+import os
+import pathlib
+import subprocess
+from subprocess import PIPE
+from threading import Event
+
+from bs4 import BeautifulSoup
+from html_preprocessor import HTMLPreprocessor
+from json_postprocessor import JSONConverter
+from src.solver import BookSolver
+
+
+class DocxBook(BookSolver):
+
+    def __init__(self, book_id=0, access=None, html_path=None,
+                 main_logger=None, libra_locker=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'):
+        """
+        Method for initializing a solver for .docx books.
+
+        :param book_id: id of the book; forwarded to BookSolver and used here to build file names.
+        :param access: forwarded to BookSolver; later passed to HTMLPreprocessor.process_html.
+        :param html_path: path to an already converted .html file, or None until convert_doc_to_html sets it.
+        :param main_logger: forwarded to BookSolver.
+        :param libra_locker: threading.Event shared between solvers; convert_doc_to_html treats
+            a set event as "LibreOffice is free" and clears it while a conversion is running.
+        :param logging_format: forwarded to BookSolver.
+        """
+        super().__init__(book_id, access, main_logger, logging_format)
+        self.book_type = 'docx'
+        self.html_path = html_path  # path to html file, file appears after libre-conversion
+        # Annotate with the Event class itself: `Event()` is a call expression, not a type.
+        self.libra_locker: Event = libra_locker
+
+    def _libra_run(self, out_dir_path):
+        """
+        Method for running headless LibreOffice that converts self.file_path to .html inside out_dir_path.
+
+        The exit code and the captured stdout/stderr are only written to the log at DEBUG level;
+        a non-zero exit code does not raise here.
+        """
+        libra_args = [
+            'libreoffice', '--headless',
+            '--convert-to', 'html', str(self.file_path),
+            '--outdir', str(out_dir_path),
+        ]
+        result = subprocess.run(libra_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        self.logger_object.log(
+            f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}',
+            logging.DEBUG)
+        self.logger_object.log(
+            f'Any error while libra conversion for book_{self.book_id}: {result.stderr}',
+            logging.DEBUG)
+
+    def convert_doc_to_html(self):
+        """
+        Method for convert .docx document to .html file.
+
+        Checks that self.file_path can be opened, runs LibreOffice with <project>/html/<book_id>/
+        as output directory and stores <project>/html/<book_id>/<book_id>.html in self.html_path.
+
+        :raises FileNotFoundError: if the input file or the expected .html file does not exist.
+        :raises Exception: anything raised while waiting for or running LibreOffice is logged,
+            the status is set to error and the exception is re-raised unchanged.
+        """
+        self.logger_object.log(f'File - {self.file_path}.')
+        self.logger_object.log('Beginning of conversion from .docx to .html.')
+
+        try:
+            # Opening is only an existence probe; the context manager closes the handle on every path.
+            with open(self.file_path):
+                pass
+        except FileNotFoundError:
+            self.logger_object.log('Invalid path to input data.', logging.ERROR)
+            self.status_wrapper.set_error()
+            raise
+
+        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        out_dir_path = os.path.join(folder_path, f'html/{self.book_id}')
+        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
+
+        try:
+            self._libra_run_exclusively(out_dir_path)
+        except Exception:
+            self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise
+
+        # NOTE(review): assumes LibreOffice names its output <book_id>.html,
+        # i.e. that the input file is called <book_id>.docx - confirm against get_book_file.
+        self.html_path = pathlib.Path(os.path.join(out_dir_path, f'{self.book_id}.html'))
+
+        try:
+            with open(self.html_path):
+                pass
+        except FileNotFoundError:
+            self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise
+
+        self.logger_object.log('End of conversion from .docx to .html.')
+        self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
+
+    def _libra_run_exclusively(self, out_dir_path):
+        """
+        Method for running _libra_run while holding the shared libra_locker flag.
+
+        A set flag means LibreOffice is free: the flag is cleared for the duration of the run
+        and always set again afterwards, even when the run raises, so waiting solvers are not blocked forever.
+        """
+        locker = self.libra_locker
+        if locker is None:
+            # No shared flag was supplied (constructor default), so there is nobody to coordinate with.
+            self._libra_run(out_dir_path)
+            return
+
+        # NOTE(review): is_set() followed by clear() is not atomic, so an Event cannot give strict
+        # mutual exclusion; a threading.Lock would, but that changes what callers must pass in.
+        waited = False
+        while not locker.is_set():
+            waited = True
+            self.logger_object.log('Waiting for libra...', logging.DEBUG)
+            locker.wait(50)
+        locker.clear()
+        self.logger_object.log('Got flag!' if waited else 'Got flag...', logging.DEBUG)
+
+        try:
+            self._libra_run(out_dir_path)
+        finally:
+            locker.set()
+            self.logger_object.log('Cleared flag...', logging.DEBUG)
+
+    def read_html(self):
+        """
+        Method for reading .html file into beautiful soup tag.
+
+        :return: BeautifulSoup object built from the text of self.html_path with the lxml parser.
+        :raises FileNotFoundError: if self.html_path does not exist; the error is logged
+            and the status is set to error before re-raising.
+        """
+        try:
+            # The context manager closes the handle as soon as the text is read.
+            with open(self.html_path, 'r', encoding='utf8') as html_file:
+                html_text = html_file.read()
+            self.logger_object.log('HTML for book has been loaded.')
+        except FileNotFoundError:
+            # Adjacent literals are concatenated, so the first one has to end with a space.
+            self.logger_object.log('There is no html to process. '
+                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise
+
+        return BeautifulSoup(html_text, features='lxml')
+
+    def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
+        """
+        Method for dumping the prettified body_tag to a file so the processed html can be inspected.
+
+        :param body_tag: object with a prettify() method whose result is written out.
+        :param file_name: target path, joined onto the directory two levels above this file.
+        """
+        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        target = pathlib.Path(project_root).joinpath(file_name)
+
+        with target.open('w', encoding='utf-8') as html_out:
+            html_out.write(body_tag.prettify())
+            self.logger_object.log(f'Check final prettified html: {file_name}.')
+
+    def convert_from_html(self):
+        """
+        Method for converting the .html file at self.html_path to json without the LibreOffice step.
+
+        The result of JSONConverter.convert_to_dict() goes to write_to_json and the processed
+        body tag is additionally written out through write_html_from_list.
+        """
+        soup = self.read_html()
+        preprocessor = HTMLPreprocessor(soup, self.logger_object)
+        processed = preprocessor.process_html(self.access, self.html_path, self.book_id)
+        content, footnotes, top_level_headers = processed
+
+        converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
+        self.write_to_json(converter.convert_to_dict())
+        self.write_html_from_list(preprocessor.body_tag)
+
+    def test_conversion(self):
+        """
+        Method for a local end-to-end run on <project>/docx/<book_id>.docx.
+
+        Unlike conversion(), it does not call get_book_file or send_json_content_to_server:
+        the input path is built from book_id and the result is only written to disk.
+        """
+        self.logger_object.log('Beginning of the test.')
+
+        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        self.file_path = pathlib.Path(os.path.join(folder_path, 'docx', f'{self.book_id}.docx'))
+        self.logger_object.log(f'Test docx path: {self.file_path}')
+
+        self.convert_doc_to_html()
+        self.check_output_directory()
+
+        # Same html -> json -> debug-html steps as convert_from_html, so reuse it instead of keeping a copy.
+        self.convert_from_html()
+        self.logger_object.log('End of the test.')
+
+    def get_converted_book(self):
+        """
+        Method for the whole .docx -> .html -> json-ready content pipeline.
+
+        :return: result of JSONConverter.convert_to_dict() for the converted book.
+        """
+        self.convert_doc_to_html()
+        self.check_output_directory()
+
+        soup = self.read_html()
+        self.logger_object.log('Beginning of processing .html file.')
+
+        preprocessor = HTMLPreprocessor(soup, self.logger_object)
+        processed = preprocessor.process_html(self.access, self.html_path, self.book_id)
+        content, footnotes, top_level_headers = processed
+
+        self.logger_object.log('Beginning of processing json output.')
+        self.status_wrapper.set_generating()
+
+        converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
+        return converter.convert_to_dict()
+
+    def conversion(self):
+        """
+        Method for the complete job: get the book file, convert it, write the json and send it to the server.
+
+        :raises Exception: any failure is logged (also to the main log), the status is set to error
+            and the same exception is raised again.
+        """
+        try:
+            self.logger_object.log('Beginning of conversion from .docx to .json.')
+            self.get_book_file()
+            self.status_wrapper.set_processing()
+
+            book_content = self.get_converted_book()
+            self.write_to_json(book_content)
+            self.send_json_content_to_server(book_content)
+
+            self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
+        except Exception as error:
+            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
+            self.logger_object.log_error_to_main_log(str(error))
+            self.status_wrapper.set_error()
+            raise error
+
+
+if __name__ == "__main__":
+    # Manual run: push one already prepared html chapter through the html -> json part only.
+    src_dir = os.path.dirname(os.path.abspath(__file__))
+    folder = os.path.dirname(src_dir)
+    file = pathlib.Path(folder, 'html/ch13/Ch_13_edit.html')
+    out_path = pathlib.Path(folder, 'json/ch13.json')  # not read anywhere below
+
+    book = DocxBook(html_path=file)
+    book.convert_from_html()