diff --git a/.gitignore b/.gitignore index 9c71d7b..5277178 100644 --- a/.gitignore +++ b/.gitignore @@ -69,7 +69,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +documentation/_build/ # PyBuilder target/ diff --git a/Dockerfile b/Dockerfile index 7657bff..270927a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3 +FROM python:3.11.0 RUN apt-get update && \ apt-get install -y software-properties-common diff --git a/config/.gitignore b/config/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/config/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/consumer.py b/consumer.py index 1670000..fc5b54b 100644 --- a/consumer.py +++ b/consumer.py @@ -20,7 +20,7 @@ def local_convert_book(book_type: [DocxBook, EpubBook], book_id: int, logger: lo try: json_file_path = "books/json/9781614382264.json" book = book_type(book_id=book_id, main_logger=logger, **params) - book.conversion_local(json_file_path) + book.conversion(json_file_path) except Exception as exc: raise exc logger.info(f"Book-{book_id} has been proceeded.") @@ -78,7 +78,7 @@ def server_run(): try: folder_path = os.path.dirname(os.path.abspath(__file__)) config_path = Path(os.path.join( - folder_path, "config/queue_config.json")) + folder_path, "configs/queue_config.json")) with open(config_path, "r") as f: conf_param = json.load(f) @@ -95,7 +95,7 @@ def server_run(): channel.queue_declare(queue=conf_param["queue"], durable=True, arguments={ "x-max-priority": 10}) except TypeError as exc: - print("TypeError: problem with config, " + str(exc)) + print("TypeError: problem with queue config, " + str(exc)) except ValueError as exc: logger_object.log(logging.ERROR, f"Queue {conf_param['queue']} is not declared.") diff --git a/docs/style_config b/docs/style_config deleted file mode 100644 index 10fcb42..0000000 --- a/docs/style_config +++ /dev/null @@ -1,81 +0,0 @@ -config.allowedContent = { - sup: { - attributes: ['*'], - classes: ['*'] - }, - table: { - attributes: ['*'], - styles: ['*'] - }, - tr: { - attributes: ['*'], - styles: ['*'] - }, - th: { - attributes: ['*'], - classes: ['p-indent'], - styles: ['*'] - }, - td: { - attributes: ['*'], - classes: ['p-indent'], - styles: ['*'] - }, - tbody: { - attributes: ['*'], - styles: ['*'] - }, - thead: { - attributes: ['*'], - styles: ['*'] - }, - caption : {}, - img : { - attributes: ['*'], - classes: ['*'], - styles: ['*'] - }, - code : { - attributes: ['*'], - classes: ['*'], - styles: ['*'] - }, - pre : { - attributes: ['*'], - classes: ['*'], - styles: ['*'] - }, - p : { - styles: ['text-align', 'text-indent', 'border-bottom', 'border-top'], - classes: ['*'] - }, - strong : {}, - i : {}, - s : {}, - u : {}, - ul : {}, - ol : {}, - li : { - styles: ['text-align'] - }, - blockquote : {}, - span : { - attributes: ['*'], - classes: ['*'], - styles: ['*'] - }, - a : { - attributes: ['href', 'data-anchor-id', 'data-chapter-id', 'placeholder'], - classes: ['link-to-anchor'], - }, - iframe : { - attributes: ['*'], - classes: ['*'], - styles: ['*'] - }, - div : { - attributes: ['*'], - classes: ['youtube-embed-wrapper'], - styles: ['*'] - } - }; \ No newline at end of file diff --git a/lc_converter.sh b/lc_converter.sh index 9224b72..8099e3e 100644 --- a/lc_converter.sh +++ b/lc_converter.sh @@ -5,4 +5,4 @@ sudo docker stop lc_converter_container #remove container sudo docker rm -f lc_converter_container #start container -sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image +sudo docker run --name=lc_converter_container -v /var/log/lc-converter:/app/logs lc_converter_image diff --git a/presets/.gitignore b/presets/.gitignore deleted file mode 100644 index c3bf4b1..0000000 --- a/presets/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -* -!.gitignore -!*.json \ No newline at end of file diff --git a/presets/docx_presets.json b/presets/docx_presets.json deleted file mode 100644 index 6d5613b..0000000 --- a/presets/docx_presets.json +++ /dev/null @@ -1,254 +0,0 @@ -[ - { - "preset_name": "wrapper", - "rules": [ - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "id", - "value": "^Table of Contents\\d+" - } - ], - "text": null - }, - "tag_to_wrap": { - "name": "TOC", - "attrs": [] - } - } - ] - }, - { - "preset_name": "decomposer", - "rules": [ - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "title", - "value": "footer" - } - ], - "text": null - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "id", - "value": "^Table of Contents\\d+" - } - ], - "text": null - } - } - ] - }, - { - "preset_name": "replacer", - "rules": [ - { - "tags": ["^h[6-9]$"], - "condition": null, - "tag_to_replace": { - "name": "p", - "attrs": null - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "style", - "value": "column-count: 2" - } - ], - "text": null - }, - "tag_to_replace": { - "name": "p", - "attrs": null - } - } - ] - }, - { - "preset_name": "attr_replacer", - "rules": [ - { - "tags": ["^p$"], - "condition": { - "attrs": [ - { - "name": "style", - "value": "column-count: 2" - } - ] - }, - "attr_to_replace": { - "name": "class", - "value": "columns2" - } - } - ] - }, - { - "preset_name": "unwrapper", - "rules": [ - { - "tags": ["^span$"], - "condition": { - "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)", - "child_tags": null, - "attrs": null, - "text": null - } - }, - { - "tags": ["^span$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "style", - "value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)" - } - ], - "text": null - } - }, - { - "tags": ["^span$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "lang", - "value": "^ru-RU$" - } - ], - "text": null - } - }, - { - "tags": ["^span$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "face", - "value": "^Times New Roman[\\w, ]+$" - } - ], - "text": null - } - }, - { - "tags": ["^p$"], - "condition": { - "parent_tags": ":is(li)", - "child_tags": null, - "attrs": null, - "text": null - } - }, - { - "tags": ["^a$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "name", - "value": "_GoBack" - } - ], - "text": null - } - }, - { - "tags": ["^u$"], - "condition": { - "parent_tags": ":is(a)", - "child_tags": null, - "attrs": null, - "text": null - } - }, - { - "tags": ["^u$"], - "condition": { - "parent_tags": null, - "child_tags": ":is(a)", - "attrs": null, - "text": null - } - }, - { - "tags": ["^b$"], - "condition": { - "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)", - "child_tags": null, - "attrs": null, - "text": null - } - }, - { - "tags": ["^div$"], - "condition": null - } - ] - }, - { - "preset_name": "inserter", - "rules": [ - { - "tags": ["^p$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": null, - "text": "\\$\\$[\\s\\S]*?\\$\\$" - }, - "tag_to_insert": { - "name": "span", - "attrs": [ - { - "name": "class", - "value": "math-tex" - } - ] - } - } - ] - }, - { - "preset_name": "text_replacer", - "rules": [ - { - "tags": ["^p$"], - "condition": { - "text": "(\\\\nonumber\\\\\\\\\\\\noalign{\\\\pagebreak}[\\s\\S]*?)\\\\" - }, - "text_to_replace": "\\\\" - } - ] - } -] diff --git a/presets/epub_presets.json b/presets/epub_presets.json deleted file mode 100644 index d30e619..0000000 --- a/presets/epub_presets.json +++ /dev/null @@ -1,210 +0,0 @@ -[ - { - "preset_name": "table_wrapper", - "rules": [ - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "width", - "value": ".*" - } - ] - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "border", - "value": ".*" - } - ] - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "style", - "value": "border.*" - } - ] - } - }, - { - "tags": ["^div$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "bgcolor", - "value": ".*" - } - ] - } - }, - { - "tags": ["^section$", "^blockquote$"], - "condition": { - "parent_tags": null, - "child_tags": null, - "attrs": [ - { - "name": "class", - "value": "feature[1234]" - } - ] - } - } - ] - }, - { - "preset_name": "replacer", - "rules": [ - { - "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"], - "condition": null, - "tag_to_replace": { - "name": "p" - } - }, - { - "tags": ["^aside$"], - "condition": null, - "tag_to_replace": { - "name": "div" - } - }, - { - "tags": ["^header$", "^footer$"], - "condition": null, - "tag_to_replace": { - "name": "span" - } - }, - { - "tags": ["^code$", "^kbd$", "^var$"], - "condition": { - "parent_tags": ":not(pre, span)", - "child_tags": null, - "attrs": null - }, - "tag_to_replace": { - "name": "span" - } - }, - { - "tags": ["^em$"], - "condition": null, - "tag_to_replace": { - "name": "i" - } - }, - { - "tags": ["^b$"], - "condition": null, - "tag_to_replace": { - "name": "strong" - } - }, - { - "tags": ["^image$"], - "condition": null, - "tag_to_replace": { - "name": "img" - } - } - ] - }, - { - "preset_name": "attrs_remover", - "rules": [ - { - "tags": ["^sup$"], - "condition": null - } - ] - }, - { - "preset_name": "attr_replacer", - "rules": [ - { - "tags": ["^img$"], - "condition": { - "attrs": [ - { - "name": "xlink:href", - "value": ".*" - } - ] - }, - "attr_to_replace": { - "name": "src", - "value": null - } - } - ] - }, - { - "preset_name": "unwrapper", - "rules": [ - { - "tags": [ - "^section$", - "^blockquote$", - "^article$", - "^figcaption$", - "^main$", - "^body$", - "^html$", - "^svg$" - ], - "condition": null - }, - { - "tags": ["^p$"], - "condition": { - "parent_tags": "li", - "child_tags": null, - "attrs": null - } - } - ] - }, - { - "preset_name": "inserter", - "rules": [ - { - "tags": ["^pre$"], - "condition": { - "parent_tags": null, - "child_tags": ":not(:has(code, kbd, var))", - "attrs": null - }, - "tag_to_insert": { - "name": "code", - "attrs": [] - } - }, - { - "tags": ["^h[1-5]$"], - "condition": null, - "tag_to_insert": { - "name":"strong", - "attrs": [] - } - } - ] - } -] diff --git a/src/access.py b/src/access.py index 379f47c..e6f74f6 100644 --- a/src/access.py +++ b/src/access.py @@ -42,7 +42,7 @@ class Access: def set_credentials(self, url: str): folder_path: str = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - config_path: str = os.path.join(folder_path, "config/api_config.json") + config_path: str = os.path.join(folder_path, "configs/api_config.json") with open(config_path, "r") as f: params: Dict[str, str] = json.load(f) diff --git a/src/book_solver.py b/src/book_solver.py index e036a74..3430e47 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -77,14 +77,14 @@ class BookSolver: """Method for getting and saving preset from server""" try: pass - self.preset_path = "presets/docx_presets.json" + self.preset_path = "preset/docx_presets.json" # self.logger_object.log(f"Start receiving preset file from server. URL:" - # f" {self.access.url}/doc-convert/{self.book_id}/presets") + # f" {self.access.url}/doc-convert/{self.book_id}/preset") # content = self.access.get_file( - # file_path=f"{self.access.url}/doc-convert/{self.book_id}/presets") + # file_path=f"{self.access.url}/doc-convert/{self.book_id}/preset") # self.logger_object.log("Preset file was received from server.") # self.preset_path = pathlib.Path( - # str(self.save_file(content, path_to_save="presets", file_type="json"))) + # str(self.save_file(content, path_to_save="preset", file_type="json"))) except FileNotFoundError as f_err: self.logger_object.log( "Can't get preset file from server.", logging.ERROR) diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 9cd172c..3356ec9 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -50,7 +50,7 @@ class DocxBook(BookSolver): # 2. Parses and cleans html, gets list of tags, gets footnotes try: html_preprocessor = HtmlPresetsProcessor( - logger=self.logger_object, preset_path="presets/docx_presets.json") + logger=self.logger_object, preset_path="preset/docx_presets.json") style_preprocessor = StyleReader() html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=self.logger_object, @@ -91,7 +91,7 @@ if __name__ == "__main__": html_converter = Docx2LibreHtml(file_path=docx_file_path, logger=logger_object, libre_locker=locker) html_preprocessor = HtmlPresetsProcessor( - logger=logger_object, preset_path="../../presets/docx_presets.json") + logger=logger_object, preset_path="../../preset/docx_presets.json") style_preprocessor = StyleReader() html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object, html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index ba63a34..470d307 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -35,7 +35,7 @@ class EpubBook(BookSolver): # Parses and cleans html, gets list of tags, gets footnotes try: html_preprocessor = HtmlPresetsProcessor( - logger=self.logger_object, preset_path="presets/epub_presets.json") + logger=self.logger_object, preset_path="preset/epub_presets.json") html_processor = HtmlEpubProcessor(logger=self.logger_object, html_preprocessor=html_preprocessor) except Exception as exc: @@ -58,7 +58,7 @@ if __name__ == "__main__": logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1]) html_preprocessor = HtmlPresetsProcessor( - logger=logger_object, preset_path="../../presets/epub_presets.json") + logger=logger_object, preset_path="../../preset/epub_presets.json") style_preprocessor = StyleReader() html_processor = HtmlEpubProcessor(logger=logger_object, html_preprocessor=html_preprocessor) diff --git a/src/util/check_dirs.py b/src/util/check_dirs.py index f7a0af0..b98cb31 100644 --- a/src/util/check_dirs.py +++ b/src/util/check_dirs.py @@ -20,7 +20,7 @@ def check_dir(dir_path: str): if __name__ == "__main__": folders = parse_args().folders if not folders: - folders = ["books/epub", "books/docx", "books/html", "books/json", "logs", "config"] + folders = ["books/epub", "books/docx", "books/html", "books/json", "logs", "configs"] folder_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) folders = [os.path.join(folder_path, folder) for folder in folders]