This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/docx_converter/docx2libre_html.py
2022-09-09 15:13:02 +03:00

124 lines
4.8 KiB
Python

import os
import logging
import pathlib
import subprocess
from subprocess import PIPE
from typing import Union
from threading import Event
from bs4 import BeautifulSoup
from src.util.helpers import BookLogger
class Docx2LibreHtml:
    """Convert a .docx book to HTML with headless LibreOffice and parse the result.

    The whole pipeline runs from the constructor: the input document is
    converted with the ``libreoffice`` command line tool and the produced
    HTML file is loaded into a ``BeautifulSoup`` tree.

    Attributes
    ----------
    book_id
        Identifier used for the output folder name and in log messages;
        falls back to the input file stem when ``0`` is passed.
    file_path
        Path to the input document.
    access
        Stored as passed in; not used by this class.
    logger_object
        Logger providing ``log(message, level)`` and ``log_error_to_main_log()``.
    libre_locker
        Event shared between threads; "set" means LibreOffice is free.
    html_path
        Path to the HTML file produced by LibreOffice.
    html_soup
        Parsed content of ``html_path``.
    """

    # Seconds a thread blocks on the shared event before it logs again that
    # it is still waiting for LibreOffice.
    _LIBRE_WAIT_TIMEOUT = 50

    def __init__(self, book_id: int = 0, file_path: Union[pathlib.Path, str] = None,
                 access=None, logger: "BookLogger" = None, libre_locker: Event = None):
        """Store the settings, run the conversion and load the resulting HTML.

        Parameters
        ----------
        book_id
            Book identifier; ``0`` means "use the stem of ``file_path``".
        file_path
            Path to the .docx file to convert.
        access
            Stored on the instance unchanged.
        logger
            Logger used for every message of this converter.
        libre_locker
            Event guarding LibreOffice usage; when ``None`` the conversion
            runs without any cross-thread coordination.

        Raises
        ----------
        FileNotFoundError
            If the input file or the converted HTML file does not exist.
        """
        self.book_id = book_id if book_id != 0 else pathlib.Path(
            file_path).stem
        self.file_path = file_path
        self.access = access
        self.logger_object: "BookLogger" = logger
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

        # path to html file, file appears after libre-conversion
        self.html_path = self.convert_docx_to_html()
        self.html_soup = self.read_html(self.html_path)

    def _libre_run(self, out_dir_path: str):
        """Run headless LibreOffice to convert ``self.file_path`` into ``out_dir_path``.

        The return code is only logged, not checked; callers detect a failed
        conversion by the absence of the output file.
        """
        command = ["libreoffice", "--headless",
                   "--convert-to", "html", str(self.file_path),
                   "--outdir", str(out_dir_path)]
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
                               f" {result.returncode}, {result.stdout}", logging.DEBUG)
        self.logger_object.log(f"Any error while libre conversion for book_"
                               f"{self.book_id}: {result.stderr}", logging.DEBUG)

    def _check_file_exists(self, path: Union[pathlib.Path, str], error_string: str):
        """Raise ``FileNotFoundError`` (after logging ``error_string``) if ``path`` is missing."""
        try:
            # Opening the file is the existence probe; ``with`` guarantees
            # the handle is closed again.
            with open(path):
                pass
        except FileNotFoundError:
            self.logger_object.log(error_string, logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def _run_libre_exclusively(self, out_dir_path: str):
        """Wait until LibreOffice is free, run the conversion, then release it.

        NOTE: ``Event.is_set()`` followed by ``Event.clear()`` is not atomic,
        so two threads can still pass the wait at the same moment. Closing
        that gap needs a real lock, which would change what callers pass in.
        """
        if self.libre_locker is None:
            # No shared event supplied: nothing to coordinate with.
            self._libre_run(out_dir_path)
            return

        # Always fall through to the conversion once the event is observed
        # set, so the conversion can never be skipped silently.
        while not self.libre_locker.is_set():
            self.logger_object.log("Waiting for libre...", logging.DEBUG)
            self.libre_locker.wait(self._LIBRE_WAIT_TIMEOUT)

        self.libre_locker.clear()
        self.logger_object.log("Got flag!", logging.DEBUG)
        try:
            self._libre_run(out_dir_path)
        finally:
            # Release LibreOffice even when the run raised; otherwise every
            # other waiting thread would block forever.
            self.libre_locker.set()
            self.logger_object.log("Cleared flag...", logging.DEBUG)

    def convert_docx_to_html(self) -> pathlib.Path:
        """
        Function converts .docx document to .html file.
        Steps
        ----------
        1. Checks that the input file exists
        2. Creates the output folder ``../books/html/<book_id>``
        3. Runs LibreOffice while holding the shared flag
        4. Checks that the .html file has appeared

        Returns
        ----------
        html_path: pathlib.Path
            path to html file, file appears after libre-conversion

        Raises
        ----------
        FileNotFoundError
            If the input file or the converted .html file does not exist.
        """
        self.logger_object.log(f"File - {self.file_path}.")
        self.logger_object.log("Beginning of conversion from .docx to .html.")

        self._check_file_exists(
            self.file_path, error_string="Invalid path to input data.")

        # Two directory levels above this module, then ../books/html/<book_id>.
        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        out_dir_path = os.path.join(
            folder_path, f"../books/html/{self.book_id}")
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._run_libre_exclusively(out_dir_path)
        except Exception:
            # The message is kept for log compatibility, although any failure
            # of the run (not only a missing binary) ends up here.
            self.logger_object.log(
                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        # LibreOffice names the output after the input file, not after the
        # book id, so derive the expected name from the input stem.
        html_name = f"{pathlib.Path(self.file_path).stem}.html"
        html_path = pathlib.Path(os.path.join(out_dir_path, html_name))
        self._check_file_exists(
            html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.")

        self.logger_object.log("End of conversion from .docx to .html.")
        self.logger_object.log(
            f"Input file path after conversion: {html_path}.")
        return html_path

    def read_html(self, html_path: pathlib.Path) -> "BeautifulSoup":
        """Method for reading .html file into beautiful soup tag.

        Raises
        ----------
        FileNotFoundError
            If ``html_path`` does not exist.
        """
        try:
            with open(html_path, "r", encoding="utf8") as html_file:
                html_text = html_file.read()
        except FileNotFoundError:
            self.logger_object.log("There is no html to process. "
                                   "Conversion went wrong or you specified wrong paths.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        self.logger_object.log("Html for book has been loaded.")
        return BeautifulSoup(html_text, features="lxml")