# BookConverter/src/docx_converter/docx_solver.py

import json
import codecs
import logging
from threading import Event

from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.html_preprocessor import HtmlPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.docx_converter.docx2libre_html import Docx2LibreHtml
from src.docx_converter.html_docx_processor import HtmlDocxProcessor
from src.docx_converter.libre_html2json_converter import LibreHtml2JsonConverter


class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver"""

    def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker: Event = None):
        """
        Parameters
        ----------
        book_id
            passed unchanged to BookSolver.__init__
        access
            passed unchanged to BookSolver.__init__
        main_logger
            passed unchanged to BookSolver.__init__
        libre_locker
            event handed to Docx2LibreHtml when the book is converted

        """
        super().__init__(book_id, access, main_logger)
        self.book_type = "docx"
        # critical section for occupying libreoffice by one thread
        self.libre_locker = libre_locker

    def _report_failure(self, message: str) -> None:
        """Log a failed conversion step and flag the book status as errored."""
        # NOTE(review): logger_object and status_wrapper are not assigned in this
        # class - presumably BookSolver provides them; confirm there.
        self.logger_object.log(message, logging.ERROR)
        self.logger_object.log_error_to_main_log()
        self.status_wrapper.set_error()

    def get_converted_book(self):
        """
        Function converts the .docx book into the json content
        Steps
        ----------
        1. Converts docx to html with LibreOffice
        2. Parses and cleans html, gets list of tags, gets footnotes
        3. Parses from line structure to nested structure with JSONConverter

        Returns
        ----------
        content_dict
            json for LiveCarta platform

        Raises
        ----------
        Exception
            whatever a step raised; it is reported (log, main log, error status)
            and then re-raised unchanged

        """
        # 1. Converts docx to html with LibreOffice
        try:
            html_converter = Docx2LibreHtml(self.book_id, self.book_path, self.access,
                                            self.logger_object, self.libre_locker)
        except Exception:
            self._report_failure(
                "Error has occurred while converting .docx to .html.")
            # bare raise re-raises the exception being handled
            raise

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
            html_preprocessor = HtmlPreprocessor(
                logger=self.logger_object, preset_path="presets/docx_presets.json")
            style_preprocessor = StylePreprocessor()
            html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
                                               logger=self.logger_object,
                                               html_preprocessor=html_preprocessor,
                                               style_preprocessor=style_preprocessor)
            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception:
            self._report_failure("Error has occurred while processing .html")
            raise

        # 3. Parses from line structure to nested structure with JSONConverter
        try:
            json_converter = LibreHtml2JsonConverter(bs_tags, footnotes, top_level_headers,
                                                     self.logger_object)
            content_dict = json_converter.convert_to_dict()
        except Exception:
            self._report_failure(
                "Error has occurred while converting .html to .json")
            raise
        return content_dict


if __name__ == "__main__":
    # Ad-hoc run of the docx -> html -> json pipeline on a single local book.
    source_docx = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx"
    # The last path component (the file name) serves as the logger's book id.
    book_file_name = source_docx.split("/")[-1]
    run_logger = BookLogger(name="docx", book_id=book_file_name)

    # The event is set before it is handed to the converter
    # (presumably this marks LibreOffice as free - confirm in Docx2LibreHtml).
    libre_event = Event()
    libre_event.set()

    # 1. Converts docx to html with LibreOffice
    libre_html = Docx2LibreHtml(
        file_path=source_docx, logger=run_logger, libre_locker=libre_event)

    # 2. Parses and cleans html, gets list of tags, gets footnotes
    presets = HtmlPreprocessor(
        logger=run_logger, preset_path="../../presets/docx_presets.json")
    styles = StylePreprocessor()
    docx_html_processor = HtmlDocxProcessor(
        html_soup=libre_html.html_soup,
        logger=run_logger,
        html_preprocessor=presets,
        style_preprocessor=styles,
    )
    processed = docx_html_processor.process_html(
        html_path=libre_html.html_path, book_id=libre_html.book_id)
    body_tags, notes, headers = processed

    # 3. Parses from line structure to nested structure with JSONConverter
    nested_converter = LibreHtml2JsonConverter(
        body_tags, notes, headers, run_logger)
    book_json = nested_converter.convert_to_dict()

    # str.replace swaps every "docx" occurrence, so both the directory part
    # and the extension of the source path turn into "json".
    json_path = source_docx.replace("docx", "json")
    with codecs.open(json_path, "w", encoding="utf-8") as out_file:
        json.dump(book_json, out_file, ensure_ascii=False)