forked from LiveCarta/BookConverter
106 lines
4.5 KiB
Python
106 lines
4.5 KiB
Python
import json
|
|
import codecs
|
|
import logging
|
|
from threading import Event
|
|
|
|
from src.book_solver import BookSolver
|
|
from src.util.helpers import BookLogger
|
|
from src.html_preprocessor import HtmlPreprocessor
|
|
from src.style_preprocessor import StylePreprocessor
|
|
from src.docx_converter.docx2libre_html import Docx2LibreHtml
|
|
from src.docx_converter.html_docx_processor import HtmlDocxProcessor
|
|
from src.docx_converter.libre_html2json_converter import LibreHtml2JsonConverter
|
|
|
|
|
|
class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver.

    Drives the .docx -> LibreOffice .html -> cleaned tags -> nested json
    pipeline. A failure in any stage is reported through the book logger and
    the status wrapper, and the original exception is then re-raised.

    NOTE(review): ``self.book_path``, ``self.access``, ``self.book_id``,
    ``self.logger_object`` and ``self.status_wrapper`` are read here but not
    assigned in this class - presumably they are provided by BookSolver.
    """

    def __init__(self, book_id: int = 0, access=None, main_logger=None,
                 libre_locker: Event = None):
        """
        Function initializes the .docx book solver

        Parameters
        ----------
        book_id
            passed through to BookSolver.__init__
        access
            passed through to BookSolver.__init__
        main_logger
            passed through to BookSolver.__init__
        libre_locker
            event handed over to Docx2LibreHtml in get_converted_book

        """
        super().__init__(book_id, access, main_logger)
        self.book_type = "docx"
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

    def _report_failed_step(self, message: str) -> None:
        """
        Function records a failed conversion step

        Logs ``message`` with ERROR level to the book logger, asks the book
        logger to log the error to the main log and switches the status
        wrapper to the error state. It does not raise - the caller re-raises
        the active exception itself.

        Parameters
        ----------
        message
            text written to the book log

        """
        self.logger_object.log(message, logging.ERROR)
        self.logger_object.log_error_to_main_log()
        self.status_wrapper.set_error()

    def get_converted_book(self):
        """
        Function converts the .docx book to the content dictionary

        Steps
        ----------
        1. Converts docx to html with LibreOffice
        2. Parses and cleans html, gets list of tags, gets footnotes
        3. Parses from line structure to nested structure with JSONConverter

        Returns
        ----------
        content_dict
            json for LiveCarta platform
            (the result of LibreHtml2JsonConverter.convert_to_dict())

        Raises
        ----------
        Exception
            whatever a step raised; it is re-raised unchanged after the
            failure has been logged and the status has been set to error

        """
        # 1. Converts docx to html with LibreOffice
        try:
            html_converter = Docx2LibreHtml(self.book_id, self.book_path, self.access,
                                            self.logger_object, self.libre_locker)
        except Exception:
            self._report_failed_step(
                "Error has occurred while converting .docx to .html.")
            raise

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
            html_preprocessor = HtmlPreprocessor(
                logger=self.logger_object, preset_path="presets/docx_presets.json")
            style_preprocessor = StylePreprocessor()
            html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
                                               logger=self.logger_object,
                                               html_preprocessor=html_preprocessor,
                                               style_preprocessor=style_preprocessor)
            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception:
            self._report_failed_step(
                "Error has occurred while processing .html")
            raise

        # 3. Parses from line structure to nested structure with JSONConverter
        try:
            json_converter = LibreHtml2JsonConverter(bs_tags, footnotes, top_level_headers,
                                                     self.logger_object)
            content_dict = json_converter.convert_to_dict()
        except Exception:
            self._report_failed_step(
                "Error has occurred while converting .html to .json")
            raise

        return content_dict
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run of the whole .docx -> .json pipeline on one local book,
    # without going through DocxBook / BookSolver.
    docx_file_path = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx"
    # The file name (last path component) is used as the logger's book_id.
    logger_object = BookLogger(
        name="docx", book_id=docx_file_path.split("/")[-1])
    # The event is created and set before it is handed to the converter
    # (presumably a set event means LibreOffice is free - TODO confirm).
    locker = Event()
    locker.set()

    # 1. Converts docx to html with LibreOffice
    html_converter = Docx2LibreHtml(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)

    # 2. Parses and cleans html, gets list of tags, gets footnotes
    html_preprocessor = HtmlPreprocessor(
        logger=logger_object, preset_path="../../presets/docx_presets.json")
    style_preprocessor = StylePreprocessor()
    html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
                                       html_preprocessor=html_preprocessor,
                                       style_preprocessor=style_preprocessor)
    content, footnotes, top_level_headers = html_processor.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)

    # 3. Parses from line structure to nested structure with JSONConverter
    json_converter = LibreHtml2JsonConverter(
        content, footnotes, top_level_headers, logger_object)
    content_dict = json_converter.convert_to_dict()

    # str.replace rewrites every "docx" in the path, so both the directory
    # and the extension change: ../../books/json/Bar_Exam_MPT_2e_prepared.json
    # NOTE(review): presumably intentional - confirm ../../books/json/ exists.
    json_file_path = docx_file_path.replace("docx", "json")
    # Builtin open() supersedes the legacy codecs.open(); json.dump without
    # indent emits no raw newlines, so the written bytes are the same.
    with open(json_file_path, "w", encoding="utf-8") as f:
        json.dump(content_dict, f, ensure_ascii=False)
|