Annotations for Docx Converter

This commit is contained in:
Kiryl
2022-08-05 12:36:39 +03:00
parent 2122fb82fa
commit 18642ec5fd
6 changed files with 86 additions and 92 deletions

View File

@@ -3,38 +3,41 @@ import logging
import pathlib
import subprocess
from subprocess import PIPE
from typing import Union
from threading import Event
from bs4 import BeautifulSoup
from src.util.helpers import BookLogger
class Docx2LibreHTML:
def __init__(self, book_id=0, file_path=None, access=None, logger=None, libre_locker=None):
def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
access=None, logger: BookLogger = None, libre_locker: Event = None):
self.book_id = book_id if book_id != 0 else pathlib.Path(
file_path).stem
self.file_path = file_path
self.access = access
self.logger_object: BookLogger = logger
# event flag guarding LibreOffice so that only one thread runs a conversion at a time
self.libre_locker: Event() = libre_locker
self.libre_locker = libre_locker
# path to the HTML file, which appears after the LibreOffice conversion
self.html_path = self.convert_docx_to_html()
self.html_soup = self.read_html(self.html_path)
def _libre_run(self, out_dir_path):
def _libre_run(self, out_dir_path: str):
command = ["libreoffice", "--headless",
"--convert-to", "html", f"{str(self.file_path)}",
"--outdir", f"{out_dir_path}"]
print(command)
# print(command)
result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
f" {result.returncode}, {result.stdout}", logging.DEBUG)
self.logger_object.log(f"Any error while libre conversion for book_"
f"{self.book_id}: {result.stderr}", logging.DEBUG)
def convert_docx_to_html(self):
def convert_docx_to_html(self) -> pathlib.Path:
"""
Function converts .docx document to .html file.
Steps
@@ -44,18 +47,18 @@ class Docx2LibreHTML:
Returns
----------
html_path: str
html_path: pathlib.Path
path to the HTML file, which appears after the LibreOffice conversion
"""
def get_and_clear_flag(out_dir_path: str):
def get_and_clear_flag(html_file_path: str):
self.libre_locker.clear()
self.logger_object.log(f"Got flag!", logging.DEBUG)
self._libre_run(out_dir_path)
self._libre_run(html_file_path)
self.libre_locker.set()
self.logger_object.log("Cleared flag...", logging.DEBUG)
def check_file_exists(path, error_string: str):
def check_file_exists(path: pathlib.Path, error_string: str):
try:
f = open(path)
f.close()
@@ -73,19 +76,20 @@ class Docx2LibreHTML:
folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}")
out_dir_path = os.path.join(
folder_path, f"../books/html/{self.book_id}")
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
try:
if self.libre_locker.isSet():
if self.libre_locker.is_set():
get_and_clear_flag(out_dir_path)
else:
while not self.libre_locker.isSet():
while not self.libre_locker.is_set():
self.logger_object.log(
"Waiting for libre...", logging.DEBUG)
flag = self.libre_locker.wait(50)
if flag:
if self.libre_locker.isSet():
if self.libre_locker.is_set():
get_and_clear_flag(out_dir_path)
break
except Exception as exc:
@@ -105,7 +109,7 @@ class Docx2LibreHTML:
f"Input file path after conversion: {html_path}.")
return html_path
def read_html(self, html_path):
def read_html(self, html_path: pathlib.Path) -> BeautifulSoup:
"""Method for reading .html file into beautiful soup tag."""
try:
html_text = open(html_path, "r", encoding="utf8").read()