From 92fe2bc019aa16ebeed2ced84fd07318d289e4e7 Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Fri, 3 Sep 2021 22:35:34 +0300
Subject: [PATCH] epub converter: Book, EpubBook refactoring

---
 src/consumer.py                           |   8 +-
 src/{docx_converter.py => docx_solver.py} | 144 +++++-----------------
 src/epub_solver.py                        |  17 +++
 src/{epub_converter.py => solver.py}      |  60 ++++-----
 4 files changed, 86 insertions(+), 143 deletions(-)
 rename src/{docx_converter.py => docx_solver.py} (56%)
 create mode 100644 src/epub_solver.py
 rename src/{epub_converter.py => solver.py} (72%)

diff --git a/src/consumer.py b/src/consumer.py
index 8c39e18..240b832 100644
--- a/src/consumer.py
+++ b/src/consumer.py
@@ -10,8 +10,8 @@ from threading import Event
 import pika
 
 from access import Access
-from docx_converter import DocxBook
-from epub_converter import EpubBook
+from docx_solver import DocxBook
+from epub_solver import EpubBook
 
 
 def configure_file_logger(name, filename='logs/converter_log.log', filemode='w+',
@@ -52,7 +52,9 @@ def callback(ch, method, properties, body, logger, libra_locker):
         assert 'apiURL' in data, 'No apiURL field in received message.'
         assert data.get('fileExtension') in ['epub', 'docx'], 'Wrong book type received.'
 
-        book_params = {'access': Access(url=data['apiURL'])}
+        book_params = {
+            'access': Access(url=data['apiURL']),
+        }
         if data.get('fileExtension') == 'docx':
             book_params.update({'libra_locker': libra_locker})
 
diff --git a/src/docx_converter.py b/src/docx_solver.py
similarity index 56%
rename from src/docx_converter.py
rename to src/docx_solver.py
index 6db9b8b..e3147f7 100644
--- a/src/docx_converter.py
+++ b/src/docx_solver.py
@@ -1,5 +1,3 @@
-import codecs
-import json
 import logging
 import os
 import pathlib
@@ -8,73 +6,23 @@ from subprocess import PIPE
 from threading import Event
 
 from bs4 import BeautifulSoup
-
-from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
 from html_preprocessor import HTMLPreprocessor
 from json_postprocessor import JSONConverter
+from solver import BookSolver  # flat import, like the other modules that live next to this file in src/
 
 
-class DocxBook:
+class DocxBook(BookSolver):
 
-    def __init__(self, book_id=0, access=None, docx_path=None, html_path=None, output_path=None,
-                 main_logger=None, libra_locker=None,
-                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
-        self.book_id = book_id
-        self.access = access
-        self.docx_path = docx_path  # path to docx file, appears after downloading from server
+    def __init__(self, book_id=0, access=None, html_path=None,
+                 main_logger=None, libra_locker=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'):
+        super().__init__(book_id, access, main_logger, logging_format)
+        self.book_type = 'docx'  # read by BookSolver when it builds the download folder and file name
         self.html_path = html_path  # path to html file, file appears after libre-conversion
-        self.output_path = output_path  # path to json file
-        self.libra_locker: Event() = libra_locker
+        self.libra_locker: Event = libra_locker  # threading.Event passed in by the consumer for docx books
 
-        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
-                                        logging_format=logging_format,
-                                        book_id=book_id,
-                                        main_logger=main_logger)
-        self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
-
-        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
-            "Length of headers doesn't match allowed levels."
-
-    def save_docx(self, content):
-        """
-        Save binary content of file to .docx.
-        :param content: binary content of the file.
-        """
-        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        folder_path = os.path.join(folder_path, f'docx/{self.book_id}')
-        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
-
-        file_path = os.path.join(folder_path, f'{self.book_id}.docx')
-        try:
-            with open(file_path, 'wb+') as file:
-                file.write(content)
-            self.logger_object.log(f'File was saved to folder: {folder_path}.')
-        except Exception as exc:
-            self.logger_object.log("Error in writing docx file.", logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            raise exc
-
-        self.docx_path = pathlib.Path(file_path)
-
-    def get_docx(self):
-        """
-        Method for getting and saving book from queue.
-        """
-        try:
-            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
-            content = self.access.get_doc(self.book_id)
-            self.logger_object.log('File was received from server.')
-            self.save_docx(content)
-        except FileNotFoundError as f_err:
-            self.logger_object.log("Can't get docx from server.", logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            raise f_err
-        except Exception as exc:
-            raise exc
-
     def _libra_run(self, out_dir_path):
         command = ['libreoffice', '--headless',
-                   '--convert-to', 'html', f'{str(self.docx_path)}',
+                   '--convert-to', 'html', f'{str(self.file_path)}',
                    '--outdir', f'{out_dir_path}']
         result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
         self.logger_object.log(f'Result of libra conversion for book_{self.book_id}: {result.returncode}, {result.stdout}', logging.DEBUG)
@@ -84,12 +32,12 @@ class DocxBook:
         """
         Method for convert .docx document to .html file.
         """
-        self.logger_object.log(f'File - {self.docx_path}.')
-        print(f'{self.docx_path}')
+        self.logger_object.log(f'File - {self.file_path}.')
+        print(f'{self.file_path}')
         self.logger_object.log('Beginning of conversion from .docx to .html.')
 
         try:
-            f = open(self.docx_path)
+            f = open(self.file_path)
             f.close()
         except FileNotFoundError as error:
             self.logger_object.log('Invalid path to input data.', logging.ERROR)
@@ -142,18 +90,6 @@ class DocxBook:
         self.logger_object.log('End of conversion from .docx to .html.')
         self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
 
-    def check_output_directory(self):
-        if self.output_path is None:
-            folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-            output_path = os.path.join(folder_path, f'json/{self.book_id}.json')
-            self.output_path = output_path
-
-        self.output_path = pathlib.Path(self.output_path)
-        self.logger_object.log(f'Output file path: {self.output_path}')
-
-        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
-        self.output_path.touch(exist_ok=True)
-
     def read_html(self):
         """
         Method for reading .html file into beautiful soup tag.
@@ -179,24 +115,6 @@ class DocxBook:
             f_out.write(body_tag.prettify())
             self.logger_object.log(f'Check final prettified html: {file_name}.')
 
-    def write_to_json(self, content: dict):
-        try:
-            with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
-                json.dump(content, f, ensure_ascii=False)
-            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
-        except Exception as exc:
-            self.logger_object.log('Error has occurred while writing json file.'+ str(exc), logging.ERROR)
-
-    def send_json_content(self, content: dict):
-        try:
-            self.access.send_book(self.book_id, content)
-            self.logger_object.log(f'JSON data has been sent to server.')
-        except Exception as exc:
-            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
-            self.logger_object.log_error_to_main_log()
-            self.status_wrapper.set_error()
-            raise exc
-
     def convert_from_html(self):
         html_soup = self.read_html()
         parser = HTMLPreprocessor(html_soup, self.logger_object)
@@ -212,8 +130,8 @@ class DocxBook:
         folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
         folder_path = os.path.join(folder_path, f'docx')
         file_path = os.path.join(folder_path, f'{self.book_id}.docx')
-        self.docx_path = pathlib.Path(file_path)
-        self.logger_object.log(f'Test docx path: {self.docx_path}')
+        self.file_path = pathlib.Path(file_path)
+        self.logger_object.log(f'Test docx path: {self.file_path}')
 
         self.convert_doc_to_html()
         self.check_output_directory()
@@ -229,27 +147,31 @@ class DocxBook:
         self.write_html_from_list(parser.body_tag)
         self.logger_object.log('End of the test.')
 
+    def get_converted_book(self):
+        """Run the .docx -> .html -> dict pipeline and return what JSONConverter.convert_to_dict() yields."""
+        self.convert_doc_to_html()
+        self.check_output_directory()
+
+        soup = self.read_html()
+        self.logger_object.log('Beginning of processing .html file.')
+        preprocessor = HTMLPreprocessor(soup, self.logger_object)
+        body, notes, headers = preprocessor.process_html(self.access, self.html_path, self.book_id)
+
+        # set_generating() is called only after the html stage has finished.
+        self.logger_object.log('Beginning of processing json output.')
+        self.status_wrapper.set_generating()
+
+        to_dict = JSONConverter(body, notes, headers, self.logger_object, self.status_wrapper)
+        return to_dict.convert_to_dict()
+
     def conversion(self):
         try:
             self.logger_object.log('Beginning of conversion from .docx to .json.')
-            self.get_docx()
+            self.get_book_file()
             self.status_wrapper.set_processing()
-            self.convert_doc_to_html()
-            self.check_output_directory()
-
-            html_soup = self.read_html()
-            self.logger_object.log('Beginning of processing .html file.')
-
-            parser = HTMLPreprocessor(html_soup, self.logger_object)
-            content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
-
-            self.logger_object.log('Beginning of processing json output.')
-            self.status_wrapper.set_generating()
-
-            json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
-            content_dict = json_converter.convert_to_dict()
+            content_dict = self.get_converted_book()
             self.write_to_json(content_dict)
-            self.send_json_content(content_dict)
+            self.send_json_content_to_server(content_dict)
             self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
         except Exception as exc:
             self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
@@ -263,5 +185,5 @@ if __name__ == "__main__":
     file = pathlib.Path(os.path.join(folder, 'html/ch13/Ch_13_edit.html'))
     out_path = pathlib.Path(os.path.join(folder, 'json/ch13.json'))
 
-    book = DocxBook(html_path=file, output_path=out_path)
+    book = DocxBook(html_path=file)
     book.convert_from_html()
diff --git a/src/epub_solver.py b/src/epub_solver.py
new file mode 100644
index 0000000..08ffbcc
--- /dev/null
+++ b/src/epub_solver.py
@@ -0,0 +1,17 @@
+from epub_postprocessor import EpubPostprocessor
+from solver import BookSolver
+
+
+class EpubBook(BookSolver):
+    """BookSolver for .epub files; the conversion itself is delegated to EpubPostprocessor."""
+
+    def __init__(self, book_id=0, access=None, main_logger=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'):
+        super().__init__(book_id, access, main_logger, logging_format)
+        self.book_type = 'epub'  # read by BookSolver when it builds the download folder and file name
+
+    def get_converted_book(self):
+        """Convert the file at self.file_path and return the result of EpubPostprocessor.convert_to_dict()."""
+        json_converter = EpubPostprocessor(self.file_path, access=self.access, logger=self.logger_object)
+        content_dict = json_converter.convert_to_dict()
+        self.status_wrapper.set_generating()
+        return content_dict
diff --git a/src/epub_converter.py b/src/solver.py
similarity index 72%
rename from src/epub_converter.py
rename to src/solver.py
index 8f05584..c43f68a 100644
--- a/src/epub_converter.py
+++ b/src/solver.py
@@ -1,3 +1,5 @@
+"""Base interface for solving the task of converting a book."""
+
 import codecs
 import json
 import logging
@@ -5,17 +7,15 @@ import os
 import pathlib
 
 from livecarta_config import BookLogger, BookStatusWrapper, LawCartaConfig
-from epub_postprocessor import EpubPostprocessor
 
 
-class EpubBook:
+class BookSolver:
 
-    def __init__(self, book_id=0, access=None,
-                 main_logger=None,
-                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
+    def __init__(self, book_id=0, access=None, main_logger=None, logging_format='%(asctime)s - %(levelname)s - %(message)s'):
+        self.book_type = None
         self.book_id = book_id
         self.access = access
-        self.epub_path = None
+        self.file_path = None  # path to book file, appears after downloading from server
         self.output_path = None  # path to json file
         self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                         logging_format=logging_format,
@@ -26,36 +26,36 @@ class EpubBook:
         assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
             "Length of headers doesn't match allowed levels."
 
-    def save_epub(self, content):
+    def save_book_file(self, content):
         """
-        Save binary content of file to .docx.
+        Save binary content to <project root>/<book_type>/<book_id>/<book_id>.<book_type>; the path is kept in self.file_path.
         :param content: binary content of the file.
         """
         folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        folder_path = os.path.join(folder_path, f'epub/{self.book_id}')
+        folder_path = os.path.join(folder_path, f'{self.book_type}/{self.book_id}')  # one folder per book
         pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
 
-        file_path = os.path.join(folder_path, f'{self.book_id}.epub')
+        file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')  # book_type doubles as the extension
         try:
             with open(file_path, 'wb+') as file:
                 file.write(content)
             self.logger_object.log(f'File was saved to folder: {folder_path}.')
         except Exception as exc:
-            self.logger_object.log("Error in writing epub file.", logging.ERROR)
+            self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR)  # logged, then re-raised
             self.logger_object.log_error_to_main_log()
             raise exc
 
-        self.epub_path = pathlib.Path(file_path)
+        self.file_path = pathlib.Path(file_path)  # only reached when the write succeeded
 
-    def get_epub(self):
+    def get_book_file(self):
         """
-        Method for getting and saving book from queue.
+        Method for getting and saving book from server.
         """
         try:
             self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
             content = self.access.get_doc(self.book_id)
             self.logger_object.log('File was received from server.')
-            self.save_epub(content)
+            self.save_book_file(content)
         except FileNotFoundError as f_err:
             self.logger_object.log("Can't get docx from server.", logging.ERROR)
             self.logger_object.log_error_to_main_log()
@@ -84,7 +84,7 @@ class EpubBook:
         except Exception as exc:
             self.logger_object.log('Error has occurred while writing json file.'+ str(exc), logging.ERROR)
 
-    def send_json_content(self, content: dict):
+    def send_json_content_to_server(self, content: dict):
         try:
             self.access.send_book(self.book_id, content)
             self.logger_object.log(f'JSON data has been sent to server.')
@@ -94,31 +94,32 @@ class EpubBook:
             self.status_wrapper.set_error()
             raise exc
 
+    def get_converted_book(self):  # base stub: EpubBook and DocxBook override this with the real conversion
+        self.logger_object.log('Beginning of processing json output.')
+        self.status_wrapper.set_generating()
+        return {}  # empty content; conversion() and test_conversion() hand the result to write_to_json()
+
     def test_conversion(self):
+        """Convert the book stored at <project root>/<book_type>/<book_id>.<book_type> and write the result to json."""
         self.logger_object.log('Beginning of the test.')
 
         folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        folder_path = os.path.join(folder_path, f'epub')
-        file_path = os.path.join(folder_path, f'{self.book_id}.epub')
-        self.epub_path = pathlib.Path(file_path)
-        self.logger_object.log(f'Test epub path: {self.epub_path}')
-        json_converter = EpubPostprocessor(self.epub_path)
-        content_dict = json_converter.convert_to_dict()
+        folder_path = os.path.join(folder_path, f'{self.book_type}')
+        self.file_path = pathlib.Path(os.path.join(folder_path, f'{self.book_id}.{self.book_type}'))
+        self.logger_object.log(f'Test {self.book_type} path: {self.file_path}')
+        content_dict = self.get_converted_book()
         self.write_to_json(content_dict)
         self.logger_object.log('End of the test.')
 
     def conversion(self):
-        self.logger_object.log('Beginning of conversion from .docx to .json.')
-        self.get_epub()
+        self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
+        self.get_book_file()
         self.status_wrapper.set_processing()
-        self.logger_object.log('Beginning of processing json output.')
 
         try:
-            json_converter = EpubPostprocessor(self.epub_path, access=self.access, logger=self.logger_object)
-            content_dict = json_converter.convert_to_dict()
-            self.status_wrapper.set_generating()
+            content_dict = self.get_converted_book()
             self.write_to_json(content_dict)
-            self.send_json_content(content_dict)
+            self.send_json_content_to_server(content_dict)
             self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
 
         except Exception as exc:
@@ -126,3 +127,4 @@ class EpubBook:
             self.logger_object.log_error_to_main_log(str(exc))
             self.status_wrapper.set_error()
             raise exc
+