Annotations for Docx Converter

This commit is contained in:
Kiryl
2022-08-05 12:36:39 +03:00
parent 2122fb82fa
commit 18642ec5fd
6 changed files with 86 additions and 92 deletions

View File

@@ -3,38 +3,41 @@ import logging
import pathlib import pathlib
import subprocess import subprocess
from subprocess import PIPE from subprocess import PIPE
from typing import Union
from threading import Event from threading import Event
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
class Docx2LibreHTML: class Docx2LibreHTML:
def __init__(self, book_id=0, file_path=None, access=None, logger=None, libre_locker=None): def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
access=None, logger: BookLogger = None, libre_locker: Event = None):
self.book_id = book_id if book_id != 0 else pathlib.Path( self.book_id = book_id if book_id != 0 else pathlib.Path(
file_path).stem file_path).stem
self.file_path = file_path self.file_path = file_path
self.access = access self.access = access
self.logger_object: BookLogger = logger self.logger_object: BookLogger = logger
# critical section for occupying libreoffice by one thread # critical section for occupying libreoffice by one thread
self.libre_locker: Event() = libre_locker self.libre_locker = libre_locker
# path to html file, file appears after libre-conversion # path to html file, file appears after libre-conversion
self.html_path = self.convert_docx_to_html() self.html_path = self.convert_docx_to_html()
self.html_soup = self.read_html(self.html_path) self.html_soup = self.read_html(self.html_path)
def _libre_run(self, out_dir_path): def _libre_run(self, out_dir_path: str):
command = ["libreoffice", "--headless", command = ["libreoffice", "--headless",
"--convert-to", "html", f"{str(self.file_path)}", "--convert-to", "html", f"{str(self.file_path)}",
"--outdir", f"{out_dir_path}"] "--outdir", f"{out_dir_path}"]
print(command) # print(command)
result = subprocess.run(command, stdout=PIPE, stderr=PIPE) result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:" self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
f" {result.returncode}, {result.stdout}", logging.DEBUG) f" {result.returncode}, {result.stdout}", logging.DEBUG)
self.logger_object.log(f"Any error while libre conversion for book_" self.logger_object.log(f"Any error while libre conversion for book_"
f"{self.book_id}: {result.stderr}", logging.DEBUG) f"{self.book_id}: {result.stderr}", logging.DEBUG)
def convert_docx_to_html(self): def convert_docx_to_html(self) -> pathlib.Path:
""" """
Function converts .docx document to .html file. Function converts .docx document to .html file.
Steps Steps
@@ -44,18 +47,18 @@ class Docx2LibreHTML:
Returns Returns
---------- ----------
html_path: str html_path: pathlib.Path
path to html file, file appears after libre-conversion path to html file, file appears after libre-conversion
""" """
def get_and_clear_flag(out_dir_path: str): def get_and_clear_flag(html_file_path: str):
self.libre_locker.clear() self.libre_locker.clear()
self.logger_object.log(f"Got flag!", logging.DEBUG) self.logger_object.log(f"Got flag!", logging.DEBUG)
self._libre_run(out_dir_path) self._libre_run(html_file_path)
self.libre_locker.set() self.libre_locker.set()
self.logger_object.log("Cleared flag...", logging.DEBUG) self.logger_object.log("Cleared flag...", logging.DEBUG)
def check_file_exists(path, error_string: str): def check_file_exists(path: pathlib.Path, error_string: str):
try: try:
f = open(path) f = open(path)
f.close() f.close()
@@ -73,19 +76,20 @@ class Docx2LibreHTML:
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}") out_dir_path = os.path.join(
folder_path, f"../books/html/{self.book_id}")
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
try: try:
if self.libre_locker.isSet(): if self.libre_locker.is_set():
get_and_clear_flag(out_dir_path) get_and_clear_flag(out_dir_path)
else: else:
while not self.libre_locker.isSet(): while not self.libre_locker.is_set():
self.logger_object.log( self.logger_object.log(
"Waiting for libre...", logging.DEBUG) "Waiting for libre...", logging.DEBUG)
flag = self.libre_locker.wait(50) flag = self.libre_locker.wait(50)
if flag: if flag:
if self.libre_locker.isSet(): if self.libre_locker.is_set():
get_and_clear_flag(out_dir_path) get_and_clear_flag(out_dir_path)
break break
except Exception as exc: except Exception as exc:
@@ -105,7 +109,7 @@ class Docx2LibreHTML:
f"Input file path after conversion: {html_path}.") f"Input file path after conversion: {html_path}.")
return html_path return html_path
def read_html(self, html_path): def read_html(self, html_path: pathlib.Path) -> BeautifulSoup:
"""Method for reading .html file into beautiful soup tag.""" """Method for reading .html file into beautiful soup tag."""
try: try:
html_text = open(html_path, "r", encoding="utf8").read() html_text = open(html_path, "r", encoding="utf8").read()

View File

@@ -12,7 +12,7 @@ from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter
class DocxBook(BookSolver): class DocxBook(BookSolver):
"""Class of .docx type book - child of BookSolver""" """Class of .docx type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None): def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker=None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)
self.book_type = "docx" self.book_type = "docx"
# critical section for occupying libreoffice by one thread # critical section for occupying libreoffice by one thread

View File

@@ -1,13 +1,14 @@
import re import re
from bs4 import BeautifulSoup, NavigableString from typing import List
from bs4 import BeautifulSoup, Tag, NavigableString
def _clean_footnote_content(content): def _clean_footnote_content(content: str) -> str:
content = content.strip() content = content.strip()
return content.strip() return content.strip()
def process_footnotes(body_tag): def process_footnotes(body_tag: Tag) -> List[str]:
"""Function returns list of footnotes and delete them from html_soup.""" """Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc") footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc")
footnote_content = body_tag.find_all( footnote_content = body_tag.find_all(
@@ -32,7 +33,7 @@ def process_footnotes(body_tag):
new_tag = BeautifulSoup(features="lxml").new_tag("sup") new_tag = BeautifulSoup(features="lxml").new_tag("sup")
new_tag["class"] = "footnote-element" new_tag["class"] = "footnote-element"
new_tag["data-id"] = i + 1 new_tag["data-id"] = f"{i + 1}"
new_tag["id"] = f"footnote-{i + 1}" new_tag["id"] = f"footnote-{i + 1}"
new_tag.string = "*" new_tag.string = "*"
anc_tag.replace_with(new_tag) anc_tag.replace_with(new_tag)
@@ -67,7 +68,6 @@ def process_footnotes(body_tag):
content = _clean_footnote_content(unicode_string) content = _clean_footnote_content(unicode_string)
cont_tag.decompose() cont_tag.decompose()
footnotes.append(content) footnotes.append(content)
return footnotes return footnotes

View File

@@ -1,27 +1,25 @@
import re import re
import logging import pathlib
from typing import List from typing import List, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
from bs4 import BeautifulSoup, NavigableString, Tag
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger, BookStatusWrapper from src.util.helpers import BookLogger
from src.docx_converter.footnotes_processing import process_footnotes from src.docx_converter.footnotes_processing import process_footnotes
from src.docx_converter.image_processing import process_images from src.docx_converter.image_processing import process_images
class HTMLDocxPreprocessor: class HTMLDocxPreprocessor:
def __init__(self, html_soup, logger_object, status_wrapper=None): def __init__(self, html_soup: BeautifulSoup, logger_object: BookLogger):
self.body_tag = html_soup.body self.body_tag = html_soup.body
self.html_soup = html_soup self.html_soup = html_soup
self.logger_object: BookLogger = logger_object self.logger_object = logger_object
self.status_wrapper: BookStatusWrapper = status_wrapper
self.top_level_headers = None self.top_level_headers = None
self.content = list() self.content = list()
def _process_toc_links(self): def _process_toc_links(self):
def _check_parent_link_exist_in_toc(tag_with_link): def _check_parent_link_exist_in_toc(tag_with_link: Tag) -> bool:
toc_links = [] toc_links = []
for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}): for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}):
link_name = a_tag.attrs["name"] link_name = a_tag.attrs["name"]
@@ -90,7 +88,7 @@ class HTMLDocxPreprocessor:
u[0].unwrap() u[0].unwrap()
@classmethod @classmethod
def convert_pt_to_px(cls, value): def convert_pt_to_px(cls, value: float) -> float:
value = float(value) value = float(value)
if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE: if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
@@ -344,11 +342,11 @@ class HTMLDocxPreprocessor:
for div in divs: for div in divs:
div.unwrap() div.unwrap()
def _get_top_level_headers(self): def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]:
""" """
Function for gathering info about top-level chapters. Function for gathering info about top-level chapters.
Assume: Assume: _
- Headers with the smallest outline(or digit in <h>) are top level chapters. - Headers with the smallest outline(or digit in <h>) are top level chapters.
[ It is consistent with a recursive algorithm [ It is consistent with a recursive algorithm
for saving content to a resulted json structure, for saving content to a resulted json structure,
@@ -422,7 +420,7 @@ class HTMLDocxPreprocessor:
features="lxml"), cleaned, NavigableString) features="lxml"), cleaned, NavigableString)
tag.replace_with(this) tag.replace_with(this)
def apply_func_to_last_child(self, tag, func=None): def apply_func_to_last_child(self, tag: Union[NavigableString, Tag], func=None):
""" """
works only with constructions like (((child to work with))) works only with constructions like (((child to work with)))
where child is object of NavigableString where child is object of NavigableString
@@ -457,10 +455,9 @@ class HTMLDocxPreprocessor:
[tag.unwrap() for tag in b_tags] [tag.unwrap() for tag in b_tags]
spans = tag.find_all("span") spans = tag.find_all("span")
if spans: if spans:
for span in spans: [span.unwrap() for span in spans]
style = span.attrs.get("style")
span.unwrap()
tag.attrs = {} tag.attrs = {}
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
@@ -526,10 +523,10 @@ class HTMLDocxPreprocessor:
ind = self.content.index(toc_tag) + 1 ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:] self.content = self.content[ind:]
def process_html(self, access=None, html_path="", book_id=0): def process_html(self, access=None, html_path: pathlib.Path = "", book_id: int = 0):
"""Process html code to satisfy LiveCarta formatting.""" """Process html code to satisfy LiveCarta formatting."""
self.logger_object.log("Beginning of processing .html file.") self.logger_object.log("Beginning of processing .html file.")
try:
self.logger_object.log(f"Processing TOC and headers.") self.logger_object.log(f"Processing TOC and headers.")
self._process_toc_links() self._process_toc_links()
@@ -558,7 +555,7 @@ class HTMLDocxPreprocessor:
f"{len(self.footnotes)} footnotes have been processed.") f"{len(self.footnotes)} footnotes have been processed.")
self.logger_object.log("Image processing.") self.logger_object.log("Image processing.")
self.images = process_images(access=access, html_path=html_path, self.images = process_images(access, path_to_html=html_path,
book_id=book_id, body_tag=self.body_tag) book_id=book_id, body_tag=self.body_tag)
self.logger_object.log( self.logger_object.log(
f"{len(self.images)} images have been processed.") f"{len(self.images)} images have been processed.")
@@ -575,14 +572,6 @@ class HTMLDocxPreprocessor:
# delete text before table of content if exists # delete text before table of content if exists
self.delete_content_before_toc() self.delete_content_before_toc()
except Exception as exc:
self.logger_object.log(
"Error has occurred while processing html.", logging.ERROR)
self.logger_object.log_error_to_main_log()
if self.status_wrapper:
self.status_wrapper.set_error()
raise exc
self.logger_object.log("End of processing .html file.") self.logger_object.log("End of processing .html file.")
return self.content, self.footnotes, self.top_level_headers return self.content, self.footnotes, self.top_level_headers

View File

@@ -1,6 +1,7 @@
import os import os
import pathlib import pathlib
from bs4 import Tag from bs4 import Tag
from typing import Union, List
from shutil import copyfile from shutil import copyfile
@@ -22,7 +23,7 @@ def save_image_locally(img_file_path: str, book_id: int) -> pathlib.Path:
return img_folder_path return img_folder_path
def process_images(access, path_to_html: str, book_id: int, body_tag: Tag): def process_images(access, path_to_html: Union[pathlib.Path, str], book_id: int, body_tag: Tag) -> List:
""" """
Function to process <img> tag. Function to process <img> tag.
Img should be sent Amazon S3 and then return new tag with valid link. Img should be sent Amazon S3 and then return new tag with valid link.

View File

@@ -1,12 +1,15 @@
import re import re
import logging import logging
from copy import copy from copy import copy
from typing import List, Tuple, Dict, Union
from bs4 import Tag
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
class LibreHTML2JSONConverter: class LibreHTML2JSONConverter:
def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None): def __init__(self, content: List[Tag], footnotes: List[str], top_level_headers: List[Dict[str, Union[str, bool]]],
logger_object, book_api_status=None):
self.content_dict = None self.content_dict = None
self.content = content self.content = content
self.footnotes = footnotes self.footnotes = footnotes
@@ -33,7 +36,7 @@ class LibreHTML2JSONConverter:
return new_text return new_text
# TODO: rethink the function structure without indexes. # TODO: rethink the function structure without indexes.
def header_to_livecarta_chapter_item(self, ind) -> (dict, int): def header_to_livecarta_chapter_item(self, ind: int) -> Union[Tuple[Dict[str, Union[str, List]], int], str]:
""" """
Function process header and collects all content for it. Function process header and collects all content for it.
Parameters Parameters
@@ -90,7 +93,7 @@ class LibreHTML2JSONConverter:
return "" return ""
@staticmethod @staticmethod
def _is_empty_p_tag(tag): def _is_empty_p_tag(tag: Tag) -> bool:
if tag.name != "p": if tag.name != "p":
return False return False
@@ -102,7 +105,6 @@ class LibreHTML2JSONConverter:
text = re.sub(r"\s+", "", temp_tag.text) text = re.sub(r"\s+", "", temp_tag.text)
if text: if text:
return False return False
return True return True
def convert_to_dict(self): def convert_to_dict(self):
@@ -148,9 +150,7 @@ class LibreHTML2JSONConverter:
# Add is_introduction field to json structure # Add is_introduction field to json structure
# after deleting content before toc, some chapters can be deleted # after deleting content before toc, some chapters can be deleted
if self.top_level_headers: if self.top_level_headers:
same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"] is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
json_strc[0]["is_introduction"] = is_first_header_introduction json_strc[0]["is_introduction"] = is_first_header_introduction
self.content_dict = { self.content_dict = {