# Forked from LiveCarta/BookConverter.
# Original listing: 124 lines, 4.8 KiB, Python.
import os
|
|
import logging
|
|
import pathlib
|
|
import subprocess
|
|
from subprocess import PIPE
|
|
from typing import Union
|
|
from threading import Event
|
|
from bs4 import BeautifulSoup
|
|
|
|
from src.util.helpers import BookLogger
|
|
|
|
|
|
class Docx2LibreHtml:
    """Convert a .docx file to .html with headless LibreOffice and parse it.

    Constructing an instance immediately runs the conversion and loads the
    result, so the following attributes are available afterwards:

    html_path: pathlib.Path
        path to html file, file appears after libre-conversion
    html_soup: BeautifulSoup
        the html file parsed with the ``lxml`` parser
    """

    # Seconds a single ``Event.wait`` call blocks before "Waiting for libre..."
    # is logged again.
    _LIBRE_WAIT_TIMEOUT = 50

    def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
                 access=None, logger: "BookLogger" = None, libre_locker: Event = None):
        """
        Parameters
        ----------
        book_id: int
            identifier used in log messages and in the output folder/file
            name; when it is 0 the stem of ``file_path`` is used instead
        file_path: Union[pathlib.PosixPath, str]
            path to the input .docx file
        access:
            stored on the instance as-is; this class does not use it
        logger: BookLogger
            object providing ``log`` and ``log_error_to_main_log``
        libre_locker: Event
            event shared between threads to serialise LibreOffice runs:
            "set" means LibreOffice is free; ``None`` disables the locking
        """
        self.book_id = book_id if book_id != 0 else pathlib.Path(
            file_path).stem
        self.file_path = file_path
        self.access = access
        self.logger_object: BookLogger = logger
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

        # path to html file, file appears after libre-conversion
        self.html_path = self.convert_docx_to_html()
        self.html_soup = self.read_html(self.html_path)

    def _libre_run(self, out_dir_path: str):
        """Run headless LibreOffice converting ``self.file_path`` to html.

        The result is written into ``out_dir_path``. Return code, stdout and
        stderr of the process are only logged (DEBUG level); a non-zero return
        code does not raise here.
        """
        command = ["libreoffice", "--headless",
                   "--convert-to", "html", str(self.file_path),
                   "--outdir", str(out_dir_path)]
        result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
        self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
                               f" {result.returncode}, {result.stdout}", logging.DEBUG)
        self.logger_object.log(f"Any error while libre conversion for book_"
                               f"{self.book_id}: {result.stderr}", logging.DEBUG)

    def _check_file_exists(self, path: Union[pathlib.Path, str], error_string: str):
        """Log ``error_string`` and re-raise if ``path`` doesn't exist."""
        try:
            # Opening (and immediately closing) the file is the existence probe.
            with open(path):
                pass
        except FileNotFoundError:
            self.logger_object.log(error_string, logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def _run_libre_exclusively(self, out_dir_path: str):
        """Run LibreOffice while holding ``self.libre_locker``.

        The event is cleared for the duration of the run and is always set
        again afterwards, even when the run raises, so other threads waiting
        on it are never left blocked.
        """
        locker = self.libre_locker
        if locker is None:
            # No shared event was supplied: nothing to coordinate with.
            self._libre_run(out_dir_path)
            return

        # Loop until the event is observed set; every timeout just logs again.
        while not locker.is_set():
            self.logger_object.log("Waiting for libre...", logging.DEBUG)
            locker.wait(self._LIBRE_WAIT_TIMEOUT)

        # NOTE(review): is_set() followed by clear() is not atomic, so two
        # threads can still pass the check together. Closing that gap needs a
        # threading.Lock, which would change what callers have to pass in.
        locker.clear()
        self.logger_object.log("Got flag!", logging.DEBUG)
        try:
            self._libre_run(out_dir_path)
        finally:
            locker.set()
            self.logger_object.log("Cleared flag...", logging.DEBUG)

    def convert_docx_to_html(self) -> pathlib.Path:
        """
        Function converts .docx document to .html file.

        Steps
        ----------
        1. Checks that the input file exists
        2. Creates the output folder ``books/html/<book_id>``
        3. Runs LibreOffice, one thread at a time
        4. Checks that ``<book_id>.html`` has appeared in the output folder

        Returns
        ----------
        html_path: pathlib.Path
            path to html file, file appears after libre-conversion

        Raises
        ----------
        FileNotFoundError
            if the input file or the converted html file doesn't exist

        """
        self.logger_object.log(f"File - {self.file_path}.")
        self.logger_object.log("Beginning of conversion from .docx to .html.")

        self._check_file_exists(
            self.file_path, error_string="Invalid path to input data.")

        # Output folder is resolved relative to this module's location:
        # <parent of this file's directory>/../books/html/<book_id>
        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        out_dir_path = os.path.join(
            folder_path, f"../books/html/{self.book_id}")
        pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)

        try:
            self._run_libre_exclusively(out_dir_path)
        except Exception:
            self.logger_object.log(
                "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        # NOTE(review): LibreOffice names its output after the input file's
        # stem, so this presumably relies on the input being called
        # <book_id>.docx - confirm against the callers.
        html_path = pathlib.Path(
            os.path.join(out_dir_path, f"{self.book_id}.html"))

        self._check_file_exists(
            html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.")

        self.logger_object.log("End of conversion from .docx to .html.")
        self.logger_object.log(
            f"Input file path after conversion: {html_path}.")
        return html_path

    def read_html(self, html_path: pathlib.Path) -> "BeautifulSoup":
        """Method for reading .html file into beautiful soup tag.

        Raises FileNotFoundError (after logging it) if ``html_path`` doesn't
        exist.
        """
        try:
            # Context manager so the handle is closed even if read() fails.
            with open(html_path, "r", encoding="utf8") as html_file:
                html_text = html_file.read()
        except FileNotFoundError:
            self.logger_object.log("There is no html to process. "
                                   "Conversion went wrong or you specified wrong paths.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

        self.logger_object.log("Html for book has been loaded.")
        return BeautifulSoup(html_text, features="lxml")