# BookConverter/src/docx_converter/docx_solver.py

import codecs
import json
from threading import Event
from typing import Optional

from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.docx_converter.docx2libre_html import Docx2LibreHTML
from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter


class DocxBook(BookSolver):
    """Class of .docx type book - child of BookSolver"""

    def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
        """
        Set up the solver for a .docx book.

        Parameters
        ----------
        book_id
            forwarded unchanged to BookSolver.__init__
        access
            forwarded unchanged to BookSolver.__init__
        main_logger
            forwarded unchanged to BookSolver.__init__
        libre_locker
            threading.Event (or None) handed to Docx2LibreHTML when the
            book is converted

        """
        super().__init__(book_id, access, main_logger)
        self.book_type = 'docx'
        # critical section for occupying libreoffice by one thread
        self.libre_locker: Optional[Event] = libre_locker

    def get_converted_book(self):
        """
        Convert the .docx book into the nested content dictionary.

        Reads self.file_path, self.logger_object and self.status_wrapper,
        none of which are assigned in this class (presumably provided by
        BookSolver - verify against the base class).

        Steps
        ----------
        1. Converts docx to html with LibreOffice
        2. Parses and cleans html, gets list of tags, gets footnotes
        3. Parses from line structure to nested structure with JSONConverter

        Returns
        ----------
        content_dict
            json for LiveCarta platform

        """
        # 1. Converts docx to html with LibreOffice
        html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
                                        self.logger_object, self.status_wrapper, self.libre_locker)
        # TODO presets

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        parser = HTMLDocxPreprocessor(
            html_converter.html_soup, self.logger_object)
        bs_tags, footnotes, top_level_headers = parser.process_html(
            self.access, html_converter.html_path, self.book_id)

        # 3. Parses from line structure to nested structure with JSONConverter
        json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
                                                 self.logger_object, self.status_wrapper)
        content_dict = json_converter.convert_to_dict()

        return content_dict


if __name__ == "__main__":
    # Local run: convert one sample .docx file and dump the result as json.
    docx_file_path = '../../docx/music_inquiry.docx'
    # The file name of the sample doubles as the logger's book id.
    logger_object = BookLogger(
        name='docx', book_id=docx_file_path.split('/')[-1])

    html_converter = Docx2LibreHTML(file_path=docx_file_path)

    parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
    # DocxBook.get_converted_book calls process_html positionally as
    # (access, html_path, book_id); pass None for access so the html path
    # lands in the second slot instead of being taken for the access object.
    content, footnotes, top_level_headers = parser.process_html(
        None, html_converter.html_path)

    json_converter = LibreHTML2JSONConverter(
        content, footnotes, top_level_headers, logger_object)
    content_dict = json_converter.convert_to_dict()

    # str.replace swaps every 'docx' in the path, so both the directory and
    # the extension change: ../../json/music_inquiry.json
    with open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
        json.dump(content_dict, f, ensure_ascii=False)