Rewrite docx_solver.py

2022-06-01 16:24:19 +03:00
parent c0ef0b6d6e
commit 0d1ec03f57
2 changed files with 178 additions and 132 deletions
--- a/src/docx_converter/docx2libre_html.py
+++ b/src/docx_converter/docx2libre_html.py
@@ -0,0 +1,129 @@
+import os
+import logging
+import pathlib
+import subprocess
+from subprocess import PIPE
+from threading import Event
+from bs4 import BeautifulSoup
+
+from src.util.helpers import BookLogger
+
+
+class Docx2LibreHTML:
+    def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None):
+        self.book_id = book_id
+        self.file_path = file_path
+        self.access = access
+        self.logger_object: BookLogger = logger
+        self.status_wrapper: status_wrapper = status_wrapper
+        # critical section for occupying libreoffice by one thread
+        self.libre_locker: Event() = libre_locker
+
+        # path to html file, file appears after libre-conversion
+        self.html_path = self.convert_docx_to_html()
+        self.html_soup = self.read_html(self.html_path)
+
+    def _libre_run(self, out_dir_path):
+        command = ['libreoffice', '--headless',
+                   '--convert-to', 'html', f'{str(self.file_path)}',
+                   '--outdir', f'{out_dir_path}']
+        print(command)
+        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
+        self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:'
+                               f' {result.returncode}, {result.stdout}', logging.DEBUG)
+        self.logger_object.log(f'Any error while libre conversion for book_'
+                               f'{self.book_id}: {result.stderr}', logging.DEBUG)
+
+    def convert_docx_to_html(self):
+        """
+        Function converts .docx document to .html file.
+        Steps
+        ----------
+        1. Converts .docx to .html with LibreOffice (one thread at a time)
+        2. Checks that the converted .html file exists
+
+        Returns
+        ----------
+        html_path: pathlib.Path
+            path to html file, file appears after libre-conversion
+
+        """
+        self.logger_object.log(f'File - {self.file_path}.')
+        print(f'{self.file_path}')
+        self.logger_object.log('Beginning of conversion from .docx to .html.')
+
+        try:
+            f = open(self.file_path)
+            f.close()
+        except FileNotFoundError as error:
+            self.logger_object.log(
+                'Invalid path to input data.', logging.ERROR)
+            self.status_wrapper.set_error()
+            raise error
+
+        folder_path = os.path.dirname(
+            os.path.dirname(os.path.abspath(__file__)))
+        out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
+        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
+
+        is_book_converted = False
+        try:
+            if self.libre_locker.isSet():
+                self.libre_locker.clear()
+                self.logger_object.log('Got flag...', logging.DEBUG)
+                self._libre_run(out_dir_path)
+                self.libre_locker.set()
+                self.logger_object.log('Cleared flag...', logging.DEBUG)
+
+            else:
+                while not self.libre_locker.isSet() and not is_book_converted:
+                    self.logger_object.log(
+                        'Waiting for libre...', logging.DEBUG)
+                    flag = self.libre_locker.wait(50)
+                    if flag:
+                        if self.libre_locker.isSet():
+                            self.libre_locker.clear()
+                            self.logger_object.log(f'Got flag!', logging.DEBUG)
+                            self._libre_run(out_dir_path)
+                            self.libre_locker.set()
+                            break
+
+        except Exception as exc:
+            self.logger_object.log(
+                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise exc
+
+        out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
+        html_path = pathlib.Path(out_dir_path)
+
+        try:
+            f = open(html_path)
+            f.close()
+        except FileNotFoundError as exc:
+            self.logger_object.log(
+                "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise exc
+
+        self.logger_object.log('End of conversion from .docx to .html.')
+        self.logger_object.log(
+            f'Input file path after conversion: {html_path}.')
+        return html_path
+
+    def read_html(self, html_path):
+        """Method for reading .html file into beautiful soup tag."""
+        try:
+            html_text = open(html_path, 'r', encoding='utf8').read()
+            self.logger_object.log('HTML for book has been loaded.')
+        except FileNotFoundError as exc:
+            self.logger_object.log('There is no html to process.'
+                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise exc
+
+        html_soup = BeautifulSoup(html_text, features='lxml')
+        return html_soup
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -1,154 +1,71 @@
-import os
-import logging
-import pathlib
-import subprocess
-from subprocess import PIPE
+import json
+import codecs
 from threading import Event
-from bs4 import BeautifulSoup

-from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
-from src.docx_converter.libra_html2json_converter import LibraHTML2JSONConverter
 from src.book_solver import BookSolver
+from src.util.helpers import BookLogger
+from src.docx_converter.docx2libre_html import Docx2LibreHTML
+from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
+from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter


 class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver"""

-    def __init__(self, book_id=0, access=None, html_path=None,
-                 main_logger=None, libra_locker=None):
+    def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
        super().__init__(book_id, access, main_logger)
        self.book_type = 'docx'
-        self.html_path = html_path  # path to html file, file appears after libre-conversion
-        self.libra_locker: Event() = libra_locker  # critical section for occupying libreoffice by one thread
-
-    def _libra_run(self, out_dir_path):
-        command = ['libreoffice', '--headless',
-                   '--convert-to', 'html', f'{str(self.file_path)}',
-                   '--outdir', f'{out_dir_path}']
-        print(command)
-        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
-        self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
-        self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)
-
-    def convert_doc_to_html(self):
-        """Method for convert .docx document to .html file."""
-        self.logger_object.log(f'File - {self.file_path}.')
-        print(f'{self.file_path}')
-        self.logger_object.log('Beginning of conversion from .docx to .html.')
-
-        try:
-            f = open(self.file_path)
-            f.close()
-        except FileNotFoundError as error:
-            self.logger_object.log('Invalid path to input data.', logging.ERROR)
-            self.status_wrapper.set_error()
-            raise error
-
-        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
-        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
-
-        is_book_converted = False
-        try:
-            if self.libra_locker.isSet():
-                self.libra_locker.clear()
-                self.logger_object.log('Got flag...', logging.DEBUG)
-                self._libra_run(out_dir_path)
-                self.libra_locker.set()
-                self.logger_object.log('Cleared flag...', logging.DEBUG)
-
-            else:
-                while not self.libra_locker.isSet() and not is_book_converted:
-                    self.logger_object.log('Waiting for libra...', logging.DEBUG)
-                    flag = self.libra_locker.wait(50)
-                    if flag:
-                        if self.libra_locker.isSet():
-                            self.libra_locker.clear()
-                            self.logger_object.log(f'Got flag!', logging.DEBUG)
-                            self._libra_run(out_dir_path)
-                            self.libra_locker.set()
-                            break
-
-        except Exception as exc:
-            self.logger_object.log("Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
-            raise exc
-
-        out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html')
-        self.html_path = pathlib.Path(out_dir_path)
-
-        try:
-            f = open(self.html_path)
-            f.close()
-        except FileNotFoundError as exc:
-            self.logger_object.log("Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
-            raise exc
-
-        self.logger_object.log('End of conversion from .docx to .html.')
-        self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
-
-    def read_html(self):
-        """Method for reading .html file into beautiful soup tag."""
-        try:
-            html_text = open(self.html_path, 'r', encoding='utf8').read()
-            self.logger_object.log('HTML for book has been loaded.')
-        except FileNotFoundError as exc:
-            self.logger_object.log('There is no html to process.'
-                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
-            raise exc
-
-        html_soup = BeautifulSoup(html_text, features='lxml')
-        return html_soup
-
-    def write_html_from_list(self, body_tag, file_name='json/html_test.html'):
-        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        file_path = pathlib.Path(os.path.join(folder_path, file_name))
-
-        with open(file_path, 'w', encoding='utf-8') as f_out:
-            f_out.write(body_tag.prettify())
-            self.logger_object.log(f'Check final prettified html: {file_name}.')
-
-    def convert_from_html(self):
-        html_soup = self.read_html()
-        parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
-        content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
-        json_converter = LibraHTML2JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
-        content_dict = json_converter.convert_to_dict()
-        self.write_to_json(content_dict)
-        self.write_html_from_list(parser.body_tag)
+        # critical section for occupying libreoffice by one thread
+        self.libre_locker: Event() = libre_locker

    def get_converted_book(self):
        """
-        1. Convert docx to html with libra office
-        2. Parse and clean html, get list of tags, get footnotes
-        3. Parse from line structure to nested structure with JSONConverter
+        Function converts .docx document to content dict.
+        Steps
+        ----------
+        1. Converts docx to html with LibreOffice
+        2. Parses and cleans html, gets list of tags, gets footnotes
+        3. Parses from line structure to nested structure with JSONConverter
+
+        Returns
+        ----------
+        content_dict
+            json for LiveCarta platform
+
        """
-        self.convert_doc_to_html()
-        self.check_output_directory()
+        # 1. Converts docx to html with LibreOffice
+        html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
+                                        self.logger_object, self.status_wrapper, self.libre_locker)
+        # TODO presets

-        html_soup = self.read_html()
-        self.logger_object.log('Beginning of processing .html file.')
+        # 2. Parses and cleans html, gets list of tags, gets footnotes
+        parser = HTMLDocxPreprocessor(
+            html_converter.html_soup, self.logger_object)
+        bs_tags, footnotes, top_level_headers = parser.process_html(
+            self.access, html_converter.html_path, self.book_id)

-        parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
-        bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
-
-        self.logger_object.log('Beginning of processing json output.')
-        self.status_wrapper.set_generating()
-
-        json_converter = LibraHTML2JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
+        # 3. Parses from line structure to nested structure with JSONConverter
+        json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
+                                                 self.logger_object, self.status_wrapper)
        content_dict = json_converter.convert_to_dict()
+
        return content_dict


 if __name__ == "__main__":
-    folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html'))
-    out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json'))
+    docx_file_path = '../../docx/music_inquiry.docx'
+    logger_object = BookLogger(
+        name='docx', book_id=docx_file_path.split('/')[-1])

-    book = DocxBook(html_path=file)
-    book.convert_from_html()
+    html_converter = Docx2LibreHTML(file_path=docx_file_path)
+
+    parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
+    content, footnotes, top_level_headers = parser.process_html(
+        html_converter.html_path)
+
+    json_converter = LibreHTML2JSONConverter(
+        content, footnotes, top_level_headers, logger_object)
+    content_dict = json_converter.convert_to_dict()
+
+    with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
+        json.dump(content_dict, f, ensure_ascii=False)