forked from LiveCarta/BookConverter
Small folder changes
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -69,7 +69,7 @@ instance/
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
documentation/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM python:3
|
||||
FROM python:3.11.0
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y software-properties-common
|
||||
|
||||
2
config/.gitignore
vendored
2
config/.gitignore
vendored
@@ -1,2 +0,0 @@
|
||||
*
|
||||
!.gitignore
|
||||
@@ -20,7 +20,7 @@ def local_convert_book(book_type: [DocxBook, EpubBook], book_id: int, logger: lo
|
||||
try:
|
||||
json_file_path = "books/json/9781614382264.json"
|
||||
book = book_type(book_id=book_id, main_logger=logger, **params)
|
||||
book.conversion_local(json_file_path)
|
||||
book.conversion(json_file_path)
|
||||
except Exception as exc:
|
||||
raise exc
|
||||
logger.info(f"Book-{book_id} has been proceeded.")
|
||||
@@ -78,7 +78,7 @@ def server_run():
|
||||
try:
|
||||
folder_path = os.path.dirname(os.path.abspath(__file__))
|
||||
config_path = Path(os.path.join(
|
||||
folder_path, "config/queue_config.json"))
|
||||
folder_path, "configs/queue_config.json"))
|
||||
with open(config_path, "r") as f:
|
||||
conf_param = json.load(f)
|
||||
|
||||
@@ -95,7 +95,7 @@ def server_run():
|
||||
channel.queue_declare(queue=conf_param["queue"], durable=True, arguments={
|
||||
"x-max-priority": 10})
|
||||
except TypeError as exc:
|
||||
print("TypeError: problem with config, " + str(exc))
|
||||
print("TypeError: problem with queue config, " + str(exc))
|
||||
except ValueError as exc:
|
||||
logger_object.log(logging.ERROR,
|
||||
f"Queue {conf_param['queue']} is not declared.")
|
||||
|
||||
@@ -1,81 +0,0 @@
|
||||
config.allowedContent = {
|
||||
sup: {
|
||||
attributes: ['*'],
|
||||
classes: ['*']
|
||||
},
|
||||
table: {
|
||||
attributes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
tr: {
|
||||
attributes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
th: {
|
||||
attributes: ['*'],
|
||||
classes: ['p-indent'],
|
||||
styles: ['*']
|
||||
},
|
||||
td: {
|
||||
attributes: ['*'],
|
||||
classes: ['p-indent'],
|
||||
styles: ['*']
|
||||
},
|
||||
tbody: {
|
||||
attributes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
thead: {
|
||||
attributes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
caption : {},
|
||||
img : {
|
||||
attributes: ['*'],
|
||||
classes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
code : {
|
||||
attributes: ['*'],
|
||||
classes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
pre : {
|
||||
attributes: ['*'],
|
||||
classes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
p : {
|
||||
styles: ['text-align', 'text-indent', 'border-bottom', 'border-top'],
|
||||
classes: ['*']
|
||||
},
|
||||
strong : {},
|
||||
i : {},
|
||||
s : {},
|
||||
u : {},
|
||||
ul : {},
|
||||
ol : {},
|
||||
li : {
|
||||
styles: ['text-align']
|
||||
},
|
||||
blockquote : {},
|
||||
span : {
|
||||
attributes: ['*'],
|
||||
classes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
a : {
|
||||
attributes: ['href', 'data-anchor-id', 'data-chapter-id', 'placeholder'],
|
||||
classes: ['link-to-anchor'],
|
||||
},
|
||||
iframe : {
|
||||
attributes: ['*'],
|
||||
classes: ['*'],
|
||||
styles: ['*']
|
||||
},
|
||||
div : {
|
||||
attributes: ['*'],
|
||||
classes: ['youtube-embed-wrapper'],
|
||||
styles: ['*']
|
||||
}
|
||||
};
|
||||
@@ -5,4 +5,4 @@ sudo docker stop lc_converter_container
|
||||
#remove container
|
||||
sudo docker rm -f lc_converter_container
|
||||
#start container
|
||||
sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image
|
||||
sudo docker run --name=lc_converter_container -v /var/log/lc-converter:/app/logs lc_converter_image
|
||||
|
||||
3
presets/.gitignore
vendored
3
presets/.gitignore
vendored
@@ -1,3 +0,0 @@
|
||||
*
|
||||
!.gitignore
|
||||
!*.json
|
||||
@@ -1,254 +0,0 @@
|
||||
[
|
||||
{
|
||||
"preset_name": "wrapper",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "id",
|
||||
"value": "^Table of Contents\\d+"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
},
|
||||
"tag_to_wrap": {
|
||||
"name": "TOC",
|
||||
"attrs": []
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "decomposer",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "title",
|
||||
"value": "footer"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "id",
|
||||
"value": "^Table of Contents\\d+"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "replacer",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^h[6-9]$"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "p",
|
||||
"attrs": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "style",
|
||||
"value": "column-count: 2"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
},
|
||||
"tag_to_replace": {
|
||||
"name": "p",
|
||||
"attrs": null
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "attr_replacer",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^p$"],
|
||||
"condition": {
|
||||
"attrs": [
|
||||
{
|
||||
"name": "style",
|
||||
"value": "column-count: 2"
|
||||
}
|
||||
]
|
||||
},
|
||||
"attr_to_replace": {
|
||||
"name": "class",
|
||||
"value": "columns2"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "unwrapper",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^span$"],
|
||||
"condition": {
|
||||
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
|
||||
"child_tags": null,
|
||||
"attrs": null,
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^span$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "style",
|
||||
"value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^span$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "lang",
|
||||
"value": "^ru-RU$"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^span$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "face",
|
||||
"value": "^Times New Roman[\\w, ]+$"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^p$"],
|
||||
"condition": {
|
||||
"parent_tags": ":is(li)",
|
||||
"child_tags": null,
|
||||
"attrs": null,
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^a$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "name",
|
||||
"value": "_GoBack"
|
||||
}
|
||||
],
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^u$"],
|
||||
"condition": {
|
||||
"parent_tags": ":is(a)",
|
||||
"child_tags": null,
|
||||
"attrs": null,
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^u$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": ":is(a)",
|
||||
"attrs": null,
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^b$"],
|
||||
"condition": {
|
||||
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
|
||||
"child_tags": null,
|
||||
"attrs": null,
|
||||
"text": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": null
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "inserter",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^p$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": null,
|
||||
"text": "\\$\\$[\\s\\S]*?\\$\\$"
|
||||
},
|
||||
"tag_to_insert": {
|
||||
"name": "span",
|
||||
"attrs": [
|
||||
{
|
||||
"name": "class",
|
||||
"value": "math-tex"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "text_replacer",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^p$"],
|
||||
"condition": {
|
||||
"text": "(\\\\nonumber\\\\\\\\\\\\noalign{\\\\pagebreak}[\\s\\S]*?)\\\\"
|
||||
},
|
||||
"text_to_replace": "\\\\"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
@@ -1,210 +0,0 @@
|
||||
[
|
||||
{
|
||||
"preset_name": "table_wrapper",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "width",
|
||||
"value": ".*"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "border",
|
||||
"value": ".*"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "style",
|
||||
"value": "border.*"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^div$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "bgcolor",
|
||||
"value": ".*"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^section$", "^blockquote$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": null,
|
||||
"attrs": [
|
||||
{
|
||||
"name": "class",
|
||||
"value": "feature[1234]"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "replacer",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "p"
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^aside$"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "div"
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^header$", "^footer$"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "span"
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^code$", "^kbd$", "^var$"],
|
||||
"condition": {
|
||||
"parent_tags": ":not(pre, span)",
|
||||
"child_tags": null,
|
||||
"attrs": null
|
||||
},
|
||||
"tag_to_replace": {
|
||||
"name": "span"
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^em$"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "i"
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^b$"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "strong"
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^image$"],
|
||||
"condition": null,
|
||||
"tag_to_replace": {
|
||||
"name": "img"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "attrs_remover",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^sup$"],
|
||||
"condition": null
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "attr_replacer",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^img$"],
|
||||
"condition": {
|
||||
"attrs": [
|
||||
{
|
||||
"name": "xlink:href",
|
||||
"value": ".*"
|
||||
}
|
||||
]
|
||||
},
|
||||
"attr_to_replace": {
|
||||
"name": "src",
|
||||
"value": null
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "unwrapper",
|
||||
"rules": [
|
||||
{
|
||||
"tags": [
|
||||
"^section$",
|
||||
"^blockquote$",
|
||||
"^article$",
|
||||
"^figcaption$",
|
||||
"^main$",
|
||||
"^body$",
|
||||
"^html$",
|
||||
"^svg$"
|
||||
],
|
||||
"condition": null
|
||||
},
|
||||
{
|
||||
"tags": ["^p$"],
|
||||
"condition": {
|
||||
"parent_tags": "li",
|
||||
"child_tags": null,
|
||||
"attrs": null
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"preset_name": "inserter",
|
||||
"rules": [
|
||||
{
|
||||
"tags": ["^pre$"],
|
||||
"condition": {
|
||||
"parent_tags": null,
|
||||
"child_tags": ":not(:has(code, kbd, var))",
|
||||
"attrs": null
|
||||
},
|
||||
"tag_to_insert": {
|
||||
"name": "code",
|
||||
"attrs": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"tags": ["^h[1-5]$"],
|
||||
"condition": null,
|
||||
"tag_to_insert": {
|
||||
"name":"strong",
|
||||
"attrs": []
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
@@ -42,7 +42,7 @@ class Access:
|
||||
def set_credentials(self, url: str):
|
||||
folder_path: str = os.path.dirname(
|
||||
os.path.dirname(os.path.abspath(__file__)))
|
||||
config_path: str = os.path.join(folder_path, "config/api_config.json")
|
||||
config_path: str = os.path.join(folder_path, "configs/api_config.json")
|
||||
with open(config_path, "r") as f:
|
||||
params: Dict[str, str] = json.load(f)
|
||||
|
||||
|
||||
@@ -77,14 +77,14 @@ class BookSolver:
|
||||
"""Method for getting and saving preset from server"""
|
||||
try:
|
||||
pass
|
||||
self.preset_path = "presets/docx_presets.json"
|
||||
self.preset_path = "preset/docx_presets.json"
|
||||
# self.logger_object.log(f"Start receiving preset file from server. URL:"
|
||||
# f" {self.access.url}/doc-convert/{self.book_id}/presets")
|
||||
# f" {self.access.url}/doc-convert/{self.book_id}/preset")
|
||||
# content = self.access.get_file(
|
||||
# file_path=f"{self.access.url}/doc-convert/{self.book_id}/presets")
|
||||
# file_path=f"{self.access.url}/doc-convert/{self.book_id}/preset")
|
||||
# self.logger_object.log("Preset file was received from server.")
|
||||
# self.preset_path = pathlib.Path(
|
||||
# str(self.save_file(content, path_to_save="presets", file_type="json")))
|
||||
# str(self.save_file(content, path_to_save="preset", file_type="json")))
|
||||
except FileNotFoundError as f_err:
|
||||
self.logger_object.log(
|
||||
"Can't get preset file from server.", logging.ERROR)
|
||||
|
||||
@@ -50,7 +50,7 @@ class DocxBook(BookSolver):
|
||||
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
||||
try:
|
||||
html_preprocessor = HtmlPresetsProcessor(
|
||||
logger=self.logger_object, preset_path="presets/docx_presets.json")
|
||||
logger=self.logger_object, preset_path="preset/docx_presets.json")
|
||||
style_preprocessor = StyleReader()
|
||||
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
|
||||
logger=self.logger_object,
|
||||
@@ -91,7 +91,7 @@ if __name__ == "__main__":
|
||||
html_converter = Docx2LibreHtml(file_path=docx_file_path,
|
||||
logger=logger_object, libre_locker=locker)
|
||||
html_preprocessor = HtmlPresetsProcessor(
|
||||
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
||||
logger=logger_object, preset_path="../../preset/docx_presets.json")
|
||||
style_preprocessor = StyleReader()
|
||||
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
||||
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
|
||||
|
||||
@@ -35,7 +35,7 @@ class EpubBook(BookSolver):
|
||||
# Parses and cleans html, gets list of tags, gets footnotes
|
||||
try:
|
||||
html_preprocessor = HtmlPresetsProcessor(
|
||||
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
||||
logger=self.logger_object, preset_path="preset/epub_presets.json")
|
||||
html_processor = HtmlEpubProcessor(logger=self.logger_object,
|
||||
html_preprocessor=html_preprocessor)
|
||||
except Exception as exc:
|
||||
@@ -58,7 +58,7 @@ if __name__ == "__main__":
|
||||
logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
html_preprocessor = HtmlPresetsProcessor(
|
||||
logger=logger_object, preset_path="../../presets/epub_presets.json")
|
||||
logger=logger_object, preset_path="../../preset/epub_presets.json")
|
||||
style_preprocessor = StyleReader()
|
||||
html_processor = HtmlEpubProcessor(logger=logger_object,
|
||||
html_preprocessor=html_preprocessor)
|
||||
|
||||
@@ -20,7 +20,7 @@ def check_dir(dir_path: str):
|
||||
if __name__ == "__main__":
|
||||
folders = parse_args().folders
|
||||
if not folders:
|
||||
folders = ["books/epub", "books/docx", "books/html", "books/json", "logs", "config"]
|
||||
folders = ["books/epub", "books/docx", "books/html", "books/json", "logs", "configs"]
|
||||
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
folders = [os.path.join(folder_path, folder) for folder in folders]
|
||||
|
||||
Reference in New Issue
Block a user