Rewrite docx_solver.py

2022-06-01 16:24:19 +03:00
parent c0ef0b6d6e
commit 0d1ec03f57
2 changed files with 178 additions and 132 deletions
--- a/src/docx_converter/docx2libre_html.py
+++ b/src/docx_converter/docx2libre_html.py
@@ -0,0 +1,129 @@
+import os
+import logging
+import pathlib
+import subprocess
+from subprocess import PIPE
+from threading import Event
+from bs4 import BeautifulSoup
+
+from src.util.helpers import BookLogger
+
+
+class Docx2LibreHTML:
+    """Convert a .docx book to .html with headless LibreOffice and load the result.
+
+    Construction runs the conversion immediately: ``html_path`` is the produced
+    file and ``html_soup`` is its parsed BeautifulSoup tree.
+    """
+
+    def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None):
+        self.book_id = book_id
+        self.file_path = file_path
+        self.access = access
+        self.logger_object: BookLogger = logger
+        self.status_wrapper = status_wrapper
+        # shared Event used as a flag: set = libreoffice is free, cleared = occupied
+        self.libre_locker: Event = libre_locker
+
+        # path to html file, file appears after libre-conversion
+        self.html_path = self.convert_docx_to_html()
+        self.html_soup = self.read_html(self.html_path)
+
+    def _libre_run(self, out_dir_path):
+        """Run headless libreoffice on file_path; output goes to out_dir_path."""
+        command = ['libreoffice', '--headless',
+                   '--convert-to', 'html', str(self.file_path),
+                   '--outdir', str(out_dir_path)]
+        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
+        self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:'
+                               f' {result.returncode}, {result.stdout}', logging.DEBUG)
+        self.logger_object.log(f'Any error while libre conversion for book_'
+                               f'{self.book_id}: {result.stderr}', logging.DEBUG)
+
+    def _libre_run_exclusively(self, out_dir_path):
+        """Wait for the shared flag, take it, convert, and always give it back."""
+        # leave the loop only with the flag set, so the run below is never skipped
+        while not self.libre_locker.is_set():
+            self.logger_object.log('Waiting for libre...', logging.DEBUG)
+            self.libre_locker.wait(50)
+        # NOTE: is_set() + clear() is not atomic; a threading.Lock would be stricter
+        self.libre_locker.clear()
+        self.logger_object.log('Got flag...', logging.DEBUG)
+        try:
+            self._libre_run(out_dir_path)
+        finally:
+            # release even on failure, otherwise waiting threads block forever
+            self.libre_locker.set()
+            self.logger_object.log('Cleared flag...', logging.DEBUG)
+
+    def convert_docx_to_html(self):
+        """
+        Function converts .docx document to .html file.
+
+        Returns
+        ----------
+        html_path: pathlib.Path
+            path to html file, file appears after libre-conversion
+
+        Raises
+        ----------
+        FileNotFoundError
+            if the input file or the converted html file does not exist
+        """
+        self.logger_object.log(f'File - {self.file_path}.')
+        self.logger_object.log('Beginning of conversion from .docx to .html.')
+
+        try:
+            with open(self.file_path):
+                pass
+        except FileNotFoundError:
+            self.logger_object.log('Invalid path to input data.', logging.ERROR)
+            self.status_wrapper.set_error()
+            raise
+
+        folder_path = os.path.dirname(
+            os.path.dirname(os.path.abspath(__file__)))
+        out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}')
+        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
+
+        try:
+            self._libre_run_exclusively(out_dir_path)
+        except Exception:
+            self.logger_object.log(
+                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise
+
+        # NOTE(review): presumably the input file is named '<book_id>.docx' - confirm
+        html_path = pathlib.Path(out_dir_path, f'{self.book_id}.html')
+
+        try:
+            with open(html_path):
+                pass
+        except FileNotFoundError:
+            self.logger_object.log(
+                "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise
+
+        self.logger_object.log('End of conversion from .docx to .html.')
+        self.logger_object.log(f'Input file path after conversion: {html_path}.')
+        return html_path
+
+    def read_html(self, html_path):
+        """Read the .html file at html_path and return it as a BeautifulSoup tree."""
+        try:
+            # context manager closes the file handle as soon as the text is read
+            with open(html_path, 'r', encoding='utf8') as html_file:
+                html_text = html_file.read()
+            self.logger_object.log('HTML for book has been loaded.')
+        except FileNotFoundError:
+            self.logger_object.log('There is no html to process.'
+                                   'Conversion went wrong or you specified wrong paths.', logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            self.status_wrapper.set_error()
+            raise
+
+        return BeautifulSoup(html_text, features='lxml')