Small folder changes

This commit is contained in:
Kiryl
2022-11-15 16:35:18 +03:00
parent be615ddf40
commit d38455fca1
14 changed files with 16 additions and 566 deletions

2
.gitignore vendored
View File

@@ -69,7 +69,7 @@ instance/
.scrapy
# Sphinx documentation
docs/_build/
documentation/_build/
# PyBuilder
target/

View File

@@ -1,4 +1,4 @@
FROM python:3
FROM python:3.11.0
RUN apt-get update && \
apt-get install -y software-properties-common

2
config/.gitignore vendored
View File

@@ -1,2 +0,0 @@
*
!.gitignore

View File

@@ -20,7 +20,7 @@ def local_convert_book(book_type: [DocxBook, EpubBook], book_id: int, logger: lo
try:
json_file_path = "books/json/9781614382264.json"
book = book_type(book_id=book_id, main_logger=logger, **params)
book.conversion_local(json_file_path)
book.conversion(json_file_path)
except Exception as exc:
raise exc
logger.info(f"Book-{book_id} has been proceeded.")
@@ -78,7 +78,7 @@ def server_run():
try:
folder_path = os.path.dirname(os.path.abspath(__file__))
config_path = Path(os.path.join(
folder_path, "config/queue_config.json"))
folder_path, "configs/queue_config.json"))
with open(config_path, "r") as f:
conf_param = json.load(f)
@@ -95,7 +95,7 @@ def server_run():
channel.queue_declare(queue=conf_param["queue"], durable=True, arguments={
"x-max-priority": 10})
except TypeError as exc:
print("TypeError: problem with config, " + str(exc))
print("TypeError: problem with queue config, " + str(exc))
except ValueError as exc:
logger_object.log(logging.ERROR,
f"Queue {conf_param['queue']} is not declared.")

View File

@@ -1,81 +0,0 @@
// Per-element allow-list assigned to `config.allowedContent`: each key is an
// element name and its value names the attributes, classes and inline styles
// accepted on that element.
// NOTE(review): presumably this is the rich-text editor's content-filter
// setting, where '*' matches any name and an empty rule `{}` admits the bare
// element only - confirm against the editor documentation.
config.allowedContent = {
    sup: { attributes: ['*'], classes: ['*'] },

    // Table structure: cells are the only table parts with a class restriction.
    table: { attributes: ['*'], styles: ['*'] },
    tr: { attributes: ['*'], styles: ['*'] },
    th: { attributes: ['*'], classes: ['p-indent'], styles: ['*'] },
    td: { attributes: ['*'], classes: ['p-indent'], styles: ['*'] },
    tbody: { attributes: ['*'], styles: ['*'] },
    thead: { attributes: ['*'], styles: ['*'] },
    caption: {},

    // Images and code listings: everything is accepted.
    img: { attributes: ['*'], classes: ['*'], styles: ['*'] },
    code: { attributes: ['*'], classes: ['*'], styles: ['*'] },
    pre: { attributes: ['*'], classes: ['*'], styles: ['*'] },

    // Paragraphs: any class, but only these four inline style properties.
    p: {
        styles: ['text-align', 'text-indent', 'border-bottom', 'border-top'],
        classes: ['*']
    },

    // Inline formatting, lists and quotes.
    strong: {},
    i: {},
    s: {},
    u: {},
    ul: {},
    ol: {},
    li: { styles: ['text-align'] },
    blockquote: {},
    span: { attributes: ['*'], classes: ['*'], styles: ['*'] },

    // Links: a fixed set of attributes and a single class.
    a: {
        attributes: ['href', 'data-anchor-id', 'data-chapter-id', 'placeholder'],
        classes: ['link-to-anchor']
    },

    // Embedded frames and their wrapper element.
    iframe: { attributes: ['*'], classes: ['*'], styles: ['*'] },
    div: { attributes: ['*'], classes: ['youtube-embed-wrapper'], styles: ['*'] }
};

View File

@@ -5,4 +5,4 @@ sudo docker stop lc_converter_container
#remove container
sudo docker rm -f lc_converter_container
#start container
sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image
sudo docker run --name=lc_converter_container -v /var/log/lc-converter:/app/logs lc_converter_image

3
presets/.gitignore vendored
View File

@@ -1,3 +0,0 @@
*
!.gitignore
!*.json

View File

@@ -1,254 +0,0 @@
[
{
"preset_name": "wrapper",
"rules": [
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "id",
"value": "^Table of Contents\\d+"
}
],
"text": null
},
"tag_to_wrap": {
"name": "TOC",
"attrs": []
}
}
]
},
{
"preset_name": "decomposer",
"rules": [
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "title",
"value": "footer"
}
],
"text": null
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "id",
"value": "^Table of Contents\\d+"
}
],
"text": null
}
}
]
},
{
"preset_name": "replacer",
"rules": [
{
"tags": ["^h[6-9]$"],
"condition": null,
"tag_to_replace": {
"name": "p",
"attrs": null
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "style",
"value": "column-count: 2"
}
],
"text": null
},
"tag_to_replace": {
"name": "p",
"attrs": null
}
}
]
},
{
"preset_name": "attr_replacer",
"rules": [
{
"tags": ["^p$"],
"condition": {
"attrs": [
{
"name": "style",
"value": "column-count: 2"
}
]
},
"attr_to_replace": {
"name": "class",
"value": "columns2"
}
}
]
},
{
"preset_name": "unwrapper",
"rules": [
{
"tags": ["^span$"],
"condition": {
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "style",
"value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)"
}
],
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "lang",
"value": "^ru-RU$"
}
],
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "face",
"value": "^Times New Roman[\\w, ]+$"
}
],
"text": null
}
},
{
"tags": ["^p$"],
"condition": {
"parent_tags": ":is(li)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^a$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "name",
"value": "_GoBack"
}
],
"text": null
}
},
{
"tags": ["^u$"],
"condition": {
"parent_tags": ":is(a)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^u$"],
"condition": {
"parent_tags": null,
"child_tags": ":is(a)",
"attrs": null,
"text": null
}
},
{
"tags": ["^b$"],
"condition": {
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^div$"],
"condition": null
}
]
},
{
"preset_name": "inserter",
"rules": [
{
"tags": ["^p$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": null,
"text": "\\$\\$[\\s\\S]*?\\$\\$"
},
"tag_to_insert": {
"name": "span",
"attrs": [
{
"name": "class",
"value": "math-tex"
}
]
}
}
]
},
{
"preset_name": "text_replacer",
"rules": [
{
"tags": ["^p$"],
"condition": {
"text": "(\\\\nonumber\\\\\\\\\\\\noalign{\\\\pagebreak}[\\s\\S]*?)\\\\"
},
"text_to_replace": "\\\\"
}
]
}
]

View File

@@ -1,210 +0,0 @@
[
{
"preset_name": "table_wrapper",
"rules": [
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "width",
"value": ".*"
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "border",
"value": ".*"
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "style",
"value": "border.*"
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "bgcolor",
"value": ".*"
}
]
}
},
{
"tags": ["^section$", "^blockquote$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "class",
"value": "feature[1234]"
}
]
}
}
]
},
{
"preset_name": "replacer",
"rules": [
{
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
"condition": null,
"tag_to_replace": {
"name": "p"
}
},
{
"tags": ["^aside$"],
"condition": null,
"tag_to_replace": {
"name": "div"
}
},
{
"tags": ["^header$", "^footer$"],
"condition": null,
"tag_to_replace": {
"name": "span"
}
},
{
"tags": ["^code$", "^kbd$", "^var$"],
"condition": {
"parent_tags": ":not(pre, span)",
"child_tags": null,
"attrs": null
},
"tag_to_replace": {
"name": "span"
}
},
{
"tags": ["^em$"],
"condition": null,
"tag_to_replace": {
"name": "i"
}
},
{
"tags": ["^b$"],
"condition": null,
"tag_to_replace": {
"name": "strong"
}
},
{
"tags": ["^image$"],
"condition": null,
"tag_to_replace": {
"name": "img"
}
}
]
},
{
"preset_name": "attrs_remover",
"rules": [
{
"tags": ["^sup$"],
"condition": null
}
]
},
{
"preset_name": "attr_replacer",
"rules": [
{
"tags": ["^img$"],
"condition": {
"attrs": [
{
"name": "xlink:href",
"value": ".*"
}
]
},
"attr_to_replace": {
"name": "src",
"value": null
}
}
]
},
{
"preset_name": "unwrapper",
"rules": [
{
"tags": [
"^section$",
"^blockquote$",
"^article$",
"^figcaption$",
"^main$",
"^body$",
"^html$",
"^svg$"
],
"condition": null
},
{
"tags": ["^p$"],
"condition": {
"parent_tags": "li",
"child_tags": null,
"attrs": null
}
}
]
},
{
"preset_name": "inserter",
"rules": [
{
"tags": ["^pre$"],
"condition": {
"parent_tags": null,
"child_tags": ":not(:has(code, kbd, var))",
"attrs": null
},
"tag_to_insert": {
"name": "code",
"attrs": []
}
},
{
"tags": ["^h[1-5]$"],
"condition": null,
"tag_to_insert": {
"name":"strong",
"attrs": []
}
}
]
}
]

View File

@@ -42,7 +42,7 @@ class Access:
def set_credentials(self, url: str):
folder_path: str = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
config_path: str = os.path.join(folder_path, "config/api_config.json")
config_path: str = os.path.join(folder_path, "configs/api_config.json")
with open(config_path, "r") as f:
params: Dict[str, str] = json.load(f)

View File

@@ -77,14 +77,14 @@ class BookSolver:
"""Method for getting and saving preset from server"""
try:
pass
self.preset_path = "presets/docx_presets.json"
self.preset_path = "preset/docx_presets.json"
# self.logger_object.log(f"Start receiving preset file from server. URL:"
# f" {self.access.url}/doc-convert/{self.book_id}/presets")
# f" {self.access.url}/doc-convert/{self.book_id}/preset")
# content = self.access.get_file(
# file_path=f"{self.access.url}/doc-convert/{self.book_id}/presets")
# file_path=f"{self.access.url}/doc-convert/{self.book_id}/preset")
# self.logger_object.log("Preset file was received from server.")
# self.preset_path = pathlib.Path(
# str(self.save_file(content, path_to_save="presets", file_type="json")))
# str(self.save_file(content, path_to_save="preset", file_type="json")))
except FileNotFoundError as f_err:
self.logger_object.log(
"Can't get preset file from server.", logging.ERROR)

View File

@@ -50,7 +50,7 @@ class DocxBook(BookSolver):
# 2. Parses and cleans html, gets list of tags, gets footnotes
try:
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/docx_presets.json")
logger=self.logger_object, preset_path="preset/docx_presets.json")
style_preprocessor = StyleReader()
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
logger=self.logger_object,
@@ -91,7 +91,7 @@ if __name__ == "__main__":
html_converter = Docx2LibreHtml(file_path=docx_file_path,
logger=logger_object, libre_locker=locker)
html_preprocessor = HtmlPresetsProcessor(
logger=logger_object, preset_path="../../presets/docx_presets.json")
logger=logger_object, preset_path="../../preset/docx_presets.json")
style_preprocessor = StyleReader()
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)

View File

@@ -35,7 +35,7 @@ class EpubBook(BookSolver):
# Parses and cleans html, gets list of tags, gets footnotes
try:
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
logger=self.logger_object, preset_path="preset/epub_presets.json")
html_processor = HtmlEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor)
except Exception as exc:
@@ -58,7 +58,7 @@ if __name__ == "__main__":
logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1])
html_preprocessor = HtmlPresetsProcessor(
logger=logger_object, preset_path="../../presets/epub_presets.json")
logger=logger_object, preset_path="../../preset/epub_presets.json")
style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=logger_object,
html_preprocessor=html_preprocessor)

View File

@@ -20,7 +20,7 @@ def check_dir(dir_path: str):
if __name__ == "__main__":
folders = parse_args().folders
if not folders:
folders = ["books/epub", "books/docx", "books/html", "books/json", "logs", "config"]
folders = ["books/epub", "books/docx", "books/html", "books/json", "logs", "configs"]
folder_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
folders = [os.path.join(folder_path, folder) for folder in folders]