# BookConverter/src/docx_converter/docx2libre_html.py

import os
import logging
import pathlib
import subprocess
from subprocess import PIPE
from typing import Union
from threading import Event
from bs4 import BeautifulSoup

from src.util.helpers import BookLogger


class Docx2LibreHtml:
    """Convert a .docx book to .html with headless LibreOffice and load the result.

    The conversion is started from the constructor. Afterwards ``html_path``
    points at the produced file and ``html_soup`` holds its parsed content.
    """

    # Seconds to block on the LibreOffice flag before logging and re-checking.
    LIBRE_WAIT_TIMEOUT = 50

    # The project/third-party annotations below are quoted so they are not
    # evaluated when the class body is executed.
    def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
                 access=None, logger: "BookLogger" = None, libre_locker: Event = None):
        """
        Parameters
        ----------
        book_id: int
            identifier of the book; when 0 the stem of ``file_path`` is used
        file_path: Union[pathlib.PosixPath, str]
            path to the input .docx file
        access:
            stored on the instance as is; not used inside this class
        logger: BookLogger
            object used for all log output of the conversion
        libre_locker: Event
            flag shared between threads: set means LibreOffice is free.
            ``None`` runs the conversion without any coordination.

        """
        self.book_id = book_id if book_id != 0 else pathlib.Path(
            file_path).stem
        self.file_path = file_path
        self.access = access
        self.logger_object = logger
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

        # path to html file, file appears after libre-conversion
        self.html_path = self.convert_docx_to_html()
        self.html_soup = self.read_html(self.html_path)

    def _libre_run(self, out_dir_path: str):
        """Run headless LibreOffice on the input file and log its exit code and output."""
        command = ["libreoffice", "--headless",
                   "--convert-to", "html", f"{str(self.file_path)}",
                   "--outdir", f"{out_dir_path}"]
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
                               f" {result.returncode}, {result.stdout}", logging.DEBUG)
        self.logger_object.log(f"Any error while libre conversion for book_"
                               f"{self.book_id}: {result.stderr}", logging.DEBUG)

    def _check_file_exists(self, path: Union[pathlib.Path, str], error_string: str):
        """Log ``error_string`` and re-raise FileNotFoundError when ``path`` can't be opened."""
        try:
            with open(path):
                pass
        except FileNotFoundError:
            self.logger_object.log(error_string, logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def _run_libre_exclusively(self, out_dir_path: str):
        """Run the LibreOffice conversion while holding the shared flag.

        The flag is always set again afterwards, even when the conversion
        raises; otherwise every other waiting thread would block forever.
        """
        if self.libre_locker is None:
            # no flag was supplied - nothing to coordinate with
            self._libre_run(out_dir_path)
            return

        # Wait until the flag is set. Looping on the flag itself (not on the
        # result of wait()) guarantees the conversion is never skipped when
        # another thread grabs the flag between wait() and the re-check.
        while not self.libre_locker.is_set():
            self.logger_object.log("Waiting for libre...", logging.DEBUG)
            self.libre_locker.wait(self.LIBRE_WAIT_TIMEOUT)

        # NOTE(review): is_set() followed by clear() is not atomic, so two
        # threads can still pass at the same moment; a real mutual exclusion
        # would need a threading.Lock supplied by the caller.
        self.libre_locker.clear()
        self.logger_object.log("Got flag!", logging.DEBUG)
        try:
            self._libre_run(out_dir_path)
        finally:
            self.libre_locker.set()
            self.logger_object.log("Cleared flag...", logging.DEBUG)

    def convert_docx_to_html(self) -> pathlib.Path:
        """
        Function converts .docx document to .html file.
        Steps
        ----------
        1. Checks that the input file can be opened
        2. Creates the output folder ../books/html/<book_id>
        3. Runs libreoffice once the shared flag is free
        4. Checks that the .html file has appeared

        Returns
        ----------
        html_path: pathlib.Path
            path to html file, file appears after libre-conversion

        Raises
        ----------
        FileNotFoundError
            when the input file or the converted .html file is missing
        Exception
            anything raised while running libreoffice is logged and re-raised

        """
        self.logger_object.log(f"File - {self.file_path}.")
        self.logger_object.log("Beginning of conversion from .docx to .html.")

        self._check_file_exists(
            self.file_path, error_string="Invalid path to input data.")

        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        out_dir_path = os.path.join(
            folder_path, f"../books/html/{self.book_id}")
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._run_libre_exclusively(out_dir_path)
        except Exception:
            self.logger_object.log(
                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        # libreoffice names the output after the input file, not after the
        # book id, so the expected name is built from the input file's stem
        html_name = f"{pathlib.Path(self.file_path).stem}.html"
        html_path = pathlib.Path(os.path.join(out_dir_path, html_name))

        self._check_file_exists(
            html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.")

        self.logger_object.log("End of conversion from .docx to .html.")
        self.logger_object.log(
            f"Input file path after conversion: {html_path}.")
        return html_path

    def read_html(self, html_path: pathlib.Path) -> "BeautifulSoup":
        """Method for reading .html file into beautiful soup tag.

        Raises FileNotFoundError (after logging it) when ``html_path`` is missing.
        """
        try:
            # context manager closes the handle even if reading fails
            with open(html_path, "r", encoding="utf8") as html_file:
                html_text = html_file.read()
            self.logger_object.log("Html for book has been loaded.")
        except FileNotFoundError:
            self.logger_object.log("There is no html to process. "
                                   "Conversion went wrong or you specified wrong paths.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        html_soup = BeautifulSoup(html_text, features="lxml")
        return html_soup