import os
import logging
import pathlib
import subprocess
from subprocess import PIPE
from typing import Union
from threading import Event

from bs4 import BeautifulSoup

from src.util.helpers import BookLogger


class Docx2LibreHtml:
    """Convert a .docx file to .html with headless LibreOffice and parse it.

    Construction performs the whole pipeline: the input file is converted
    by a ``libreoffice`` subprocess and the resulting html is read into a
    BeautifulSoup object.

    Attributes
    ----------
    book_id
        Given book id, or the stem of ``file_path`` when ``book_id`` is 0.
    file_path
        Path to the input document.
    access
        Stored as passed; not used inside this class.
    logger_object
        Logger used for all progress and error messages.
    libre_locker
        Event shared between threads; it is cleared while this instance
        runs LibreOffice and set again afterwards.
    html_path
        Path to the html file produced by the conversion.
    html_soup
        Parsed content of ``html_path``.
    """

    def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
                 access=None, logger: BookLogger = None, libre_locker: Event = None):
        self.book_id = book_id if book_id != 0 else pathlib.Path(file_path).stem
        self.file_path = file_path
        self.access = access
        self.logger_object: BookLogger = logger
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

        # path to html file, file appears after libre-conversion
        self.html_path = self.convert_docx_to_html()
        self.html_soup = self.read_html(self.html_path)

    def _libre_run(self, out_dir_path: str):
        """Run headless LibreOffice to convert ``self.file_path`` to html.

        The return code, stdout and stderr of the subprocess are written to
        the log at DEBUG level; the return code is not checked here.

        Parameters
        ----------
        out_dir_path: str
            directory passed to LibreOffice as ``--outdir``
        """
        command = ["libreoffice", "--headless",
                   "--convert-to", "html", str(self.file_path),
                   "--outdir", str(out_dir_path)]
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
                               f" {result.returncode}, {result.stdout}", logging.DEBUG)
        self.logger_object.log(f"Any error while libre conversion for book_"
                               f"{self.book_id}: {result.stderr}", logging.DEBUG)

    def convert_docx_to_html(self) -> pathlib.Path:
        """
        Function converts .docx document to .html file.

        Steps
        ----------
        1. Checks that the input file can be opened
        2. Creates the output directory ``../books/html/<book_id>``
        3. Waits until the LibreOffice event is set, then converts
           .docx to .html
        4. Checks that the expected html file exists

        Returns
        ----------
        html_path: pathlib.Path
            path to html file, file appears after libre-conversion

        Raises
        ----------
        FileNotFoundError
            if the input file or the converted html file cannot be found
        Exception
            any error raised while waiting for or running LibreOffice is
            logged and re-raised unchanged
        """
        def get_and_clear_flag(html_file_path: str):
            # NOTE(review): is_set() in the caller followed by clear() here
            # is not atomic, so two threads may both pass the check.
            self.libre_locker.clear()
            self.logger_object.log("Got flag!", logging.DEBUG)
            try:
                self._libre_run(html_file_path)
            finally:
                # Always hand the flag back; otherwise a failed conversion
                # would leave every other waiting thread blocked.
                self.libre_locker.set()
            self.logger_object.log("Cleared flag...", logging.DEBUG)

        def check_file_exists(path: pathlib.Path, error_string: str):
            try:
                with open(path):
                    pass
            except FileNotFoundError:
                self.logger_object.log(error_string, logging.ERROR)
                self.logger_object.log_error_to_main_log()
                raise

        self.logger_object.log(f"File - {self.file_path}.")
        self.logger_object.log("Beginning of conversion from .docx to .html.")

        check_file_exists(
            self.file_path, error_string="Invalid path to input data.")

        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        out_dir_path = os.path.join(
            folder_path, f"../books/html/{self.book_id}")
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            # Wait until the flag is set, then convert. Converting after the
            # loop (instead of inside it) guarantees the conversion runs no
            # matter on which check the flag was first seen as set.
            while not self.libre_locker.is_set():
                self.logger_object.log("Waiting for libre...", logging.DEBUG)
                self.libre_locker.wait(50)
            get_and_clear_flag(out_dir_path)
        except Exception:
            self.logger_object.log(
                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        html_path = pathlib.Path(
            os.path.join(out_dir_path, f"{self.book_id}.html"))
        check_file_exists(
            html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.")

        self.logger_object.log("End of conversion from .docx to .html.")
        self.logger_object.log(
            f"Input file path after conversion: {html_path}.")
        return html_path

    def read_html(self, html_path: pathlib.Path) -> BeautifulSoup:
        """Method for reading .html file into beautiful soup tag.

        Parameters
        ----------
        html_path: pathlib.Path
            path to the html file, read as utf8

        Returns
        ----------
        html_soup: BeautifulSoup
            file content parsed with the "lxml" parser

        Raises
        ----------
        FileNotFoundError
            if ``html_path`` does not exist; the error is logged first
        """
        try:
            # Context manager closes the handle even if read() fails.
            with open(html_path, "r", encoding="utf8") as html_file:
                html_text = html_file.read()
        except FileNotFoundError:
            self.logger_object.log("There is no html to process. "
                                   "Conversion went wrong or you specified wrong paths.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise
        else:
            self.logger_object.log("Html for book has been loaded.")

        html_soup = BeautifulSoup(html_text, features="lxml")
        return html_soup