diff --git a/README.md b/README.md
index b0faa3d..dc32dde 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,25 @@
-# About
+
Converter
+
+
+
+
+
-This repository contains code related to docx/epub files conversion to livecarta inner format.
+
+
+## Table of Contents
+- [Introduction](#introduction)
+- [Features](#features)
+- [Top level project structure](#top-level-project-structure)
+- [How it Works](#how-it-works)
+- [Setup](#setup)
+ - [Development](#development)
+- [How to Use](#how-to-use)
+
+
+## Introduction
+This is a Python 3 project for converting Docx|Epub documents -> LiveCarta inner format.
Livecarta book format is tree structure, where nodes are chapters.
Livecarta chapter is title + html code. Livecarta html code follows some restrictions:
@@ -12,10 +30,57 @@ Livecarta chapter is title + html code. Livecarta html code follows some restric
- Styles are added as _inline_, i.e. attribute `style` in html tag.
- Each tag has its own restrictions on attributes and style. See doc/style_config
+## Features
+- Converts Epub and Docx to JSON (LiveCarta inner format)
+- Compatible with python 3
+- Very small size (only .py files)
+- Multithreaded
-# Top level project structure
-
+## Top level project structure
- `consumer.py` - code which is responsible for receiving messages from rabbitMQ
- class `Access` - contains API code which is responsible for interaction with server.
- class `Solver` - contains code responsible for pipeline of solving the task: receiving book file, conversion, status updating, sending result back to server.
-- `livecarta_config.py `- constants that depend on LiveCarta
\ No newline at end of file
+- `livecarta_config.py` - constants that depend on LiveCarta
+
+## How it Works
+There are **2 approaches**, each of which works in 3 steps:
+#### Epub
+**Step 1** - Add CSS to HTML inline_style
+
+**Step 2** - Process every HTML chapter of Epub with presets
+
+**Step 3** - Convert dicts of HTML to JSON(LiveCarta inner format)
+
+#### Docx
+**Step 1** - Conversion of DOCX to HTML via LibreOffice
+
+**Step 2** - Process HTML with presets
+
+**Step 3** - Conversion of HTML to JSON(LiveCarta inner format)
+
+## Setup
+
+ python -m pip install -r requirements.txt
+
+### Development
+To fix a bug or enhance an existing module, follow these steps:
+
+- Fork the repo
+- Create a new branch (`git checkout -b improve-feature`)
+- Make the appropriate changes in the files
+- Stage your changes so that the commit reflects the changes made (`git add .`)
+- Commit your changes (`git commit -am 'Improve feature'`)
+- Push to the branch (`git push origin improve-feature`)
+- Create a Pull Request
+
+## How to Use
+**a.** Run `consumer.py`
+The script constantly waits for a message from the RabbitMQ queue, into which a book is loaded via "Import File to Convert" in the admin panel.
+You can also convert a book locally using the `local_convert()` function in `consumer.py`.
+
+**b.** Run `docx_solver.py`
+1. You need to run it on a Linux system; if you're using Windows, just use a Python Docker interpreter
+2. Upload a book to `books/docx/` and set the variable `docx_file_path = books/docx/book_name` in `__main__`
+
+**c.** Run `epub_solver.py`
+Before that, upload a book to `books/epub/` and set the variable `epub_file_path = books/epub/book_name` in `__main__`
diff --git a/presets/epub_presets.json b/presets/epub_presets.json
index 1ff62a8..07e191c 100644
--- a/presets/epub_presets.json
+++ b/presets/epub_presets.json
@@ -16,6 +16,10 @@
"name": "border",
"value": ".*"
},
+ {
+ "name": "style",
+ "value": "border.*"
+ },
{
"name": "bgcolor",
"value": ".*"
@@ -42,14 +46,14 @@
"preset_name": "replacer",
"rules": [
{
- "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"],
+ "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
"condition": null,
"tag_to_replace": "p"
},
{
"tags": ["^aside$"],
"condition": null,
- "tag_to_replace": "blockquote"
+ "tag_to_replace": "div"
},
{
"tags": ["^header$", "^footer$"],
@@ -65,6 +69,11 @@
},
"tag_to_replace": "span"
},
+ {
+ "tags": ["^em$"],
+ "condition": null,
+ "tag_to_replace": "i"
+ },
{
"tags": ["^b$"],
"condition": null,
@@ -101,6 +110,7 @@
{
"tags": [
"^section$",
+ "^blockquote$",
"^article$",
"^figcaption$",
"^main$",
@@ -131,6 +141,11 @@
"attrs": null
},
"tag_to_insert": "code"
+ },
+ {
+ "tags": ["^h[1-5]$"],
+ "condition": null,
+ "tag_to_insert": "strong"
}
]
}
diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py
index 8650865..c92e997 100644
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -13,8 +13,7 @@ from src.inline_style_processor import modify_html_soup_with_css_styles
class HtmlDocxProcessor:
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
self.logger = logger
- self.html_soup = html_soup
- self.body_tag = self.html_soup.body
+ self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body))
self.html_preprocessor = html_preprocessor
self.style_preprocessor = style_preprocessor
self.content: List[Tag] = []
@@ -23,7 +22,6 @@ class HtmlDocxProcessor:
for font in self.body_tag.find_all("font"):
font.name = "span"
-
def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all(
"a", {"href": re.compile("^.*http.+")})
@@ -206,10 +204,9 @@ class HtmlDocxProcessor:
else:
h_tag.unwrap()
-
def delete_content_before_toc(self):
# remove all tag upper the only in content !!! body tag is not updated
- toc_tag = self.html_soup.new_tag("TOC")
+ toc_tag = self.body_tag.new_tag("TOC")
if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:]
@@ -229,7 +226,7 @@ class HtmlDocxProcessor:
self.body_tag)
self.logger.log("Inline style processing.")
- modify_html_soup_with_css_styles(self.body_tag)
+ self.body_tag = modify_html_soup_with_css_styles(self.body_tag)
self.logger.log("Image processing.")
images = process_images(access, path_to_html=html_path,
@@ -256,9 +253,9 @@ class HtmlDocxProcessor:
self.logger.log(f".html using presets processing.")
_process_presets(html_preprocessor=self.html_preprocessor,
- html_soup=self.html_soup)
+ html_soup=self.body_tag)
- self.content = self.body_tag.find_all(recursive=False)
+ self.content = self.body_tag.body.find_all(recursive=False)
# delete text before table of content if exists
self.delete_content_before_toc()
diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py
index e92ac8b..40640c1 100644
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,5 +1,5 @@
import re
-from typing import Union
+from typing import List, Union
from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
clean/remove headings & add span with id
"""
- title_of_chapter = title_of_chapter.lower()
- for tag in chapter_tag.contents:
- tag: PageElement
+ def text_preparing(tag: PageElement):
text: str = tag if isinstance(tag, NavigableString) else tag.text
- if re.sub(r"[\s\xa0]", "", text):
- text = re.sub(r"[\s\xa0]", " ", text).lower()
- text = text.strip() # delete extra spaces
- if not isinstance(tag, NavigableString):
- if title_of_chapter == text or \
- (title_of_chapter in text and
- re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
- self.html_preprocessor._add_span_to_save_ids_for_links(
- tag, chapter_tag)
- tag.extract()
- return
- elif not self._remove_headings_content(tag, title_of_chapter):
- break
- else:
- tag.extract()
- return
+ text = re.sub(r"[\s\xa0]", " ", text).lower()
+ text = text.strip() # delete extra spaces
+ return text
+
+ title_of_chapter: str = title_of_chapter.lower()
+ title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
+ (title_of_chapter in text_preparing(tag) and
+ re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
+
+ text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
+ if title_in_text:
+ self.html_preprocessor._add_span_to_save_ids_for_links(
+ title_in_text[-1], chapter_tag)
+ title_in_text[-1].extract()
+ elif text_in_title:
+ [self.html_preprocessor._add_span_to_save_ids_for_links(
+ tag, chapter_tag) for tag in text_in_title]
+ [tag.extract() for tag in text_in_title]
@staticmethod
def _class_removing(chapter_tag: BeautifulSoup):
diff --git a/src/html_presets_processor.py b/src/html_presets_processor.py
index 30f7906..eeba3ca 100644
--- a/src/html_presets_processor.py
+++ b/src/html_presets_processor.py
@@ -28,6 +28,7 @@ class HtmlPresetsProcessor:
@staticmethod
def _decompose_tag(**kwargs):
+ kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
kwargs["tag"].decompose()
@staticmethod
@@ -112,6 +113,7 @@ class HtmlPresetsProcessor:
@staticmethod
def _unwrap_tag(**kwargs):
+ kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
kwargs["tag"].unwrap()
@staticmethod
@@ -153,7 +155,6 @@ class HtmlPresetsProcessor:
for parent_tag in body_tag.select(condition_on_tag[1]):
for tag in parent_tag.find_all([re.compile(tag) for tag in tags]):
# parent_tag != tag.parent
- tag.parent.attrs.update(tag.attrs)
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
diff --git a/src/inline_style_processor.py b/src/inline_style_processor.py
index d63122a..cc7c14d 100644
--- a/src/inline_style_processor.py
+++ b/src/inline_style_processor.py
@@ -14,7 +14,7 @@ class InlineStyleProcessor:
def __init__(self, tag_inline_style: Tag):
# tag with inline style + style parsed from css file
self.tag_inline_style = tag_inline_style
- self.tag_inline_style.attrs['style']: str = self.process_inline_style()
+ self.tag_inline_style.attrs["style"]: str = self.process_inline_style()
@staticmethod
def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
@@ -80,19 +80,19 @@ class InlineStyleProcessor:
processed_style = ";".join(split_style)+';'
margin_left_regexp = re.compile(
- r"((margin-left|margin): *(-*\w+);*)")
+ r"((margin-left|margin): *-*((\d*)\.*\d+)\w+;*)")
text_indent_regexp = re.compile(
- r"(text-indent: *(-*\w+);*)")
+ r"(text-indent: *-*((\d*)\.*\d+)\w+;*)")
has_margin = re.search(margin_left_regexp, processed_style)
has_text_indent = re.search(text_indent_regexp, processed_style)
if has_margin:
num_m = abs(int("0" + "".join(
- filter(str.isdigit, str(has_margin.group(3))))))
+ filter(str.isdigit, str(has_margin.group(4))))))
if has_text_indent:
num_ti = abs(int("0" + "".join(
- filter(str.isdigit, str(has_text_indent.group(2))))))
+ filter(str.isdigit, str(has_text_indent.group(3))))))
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(num_m - num_ti)) + "px; ")
processed_style = processed_style.replace(
@@ -106,7 +106,7 @@ class InlineStyleProcessor:
elif has_text_indent:
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(int("0" + "".join(
- filter(str.isdigit, str(has_text_indent.group(2)))))))
+ filter(str.isdigit, str(has_text_indent.group(3)))))))
+ "px; ")
return processed_style
return processed_style
@@ -127,22 +127,25 @@ class InlineStyleProcessor:
processed inline style
"""
- inline_style = self.tag_inline_style.attrs.get("style") + ";"
- # 1. Remove white color if tag doesn"t have background color in style
- inline_style = self.remove_white_if_no_bgcolor(
- inline_style, self.tag_inline_style)
- inline_style = inline_style.replace(
- "list-style-image", "list-style-type")
- # 2. Create list of styles from inline style
- # replace all spaces between "; & letter" to ";"
- style = re.sub(r"; *", ";", inline_style)
- # when we split style by ";", last element of the list is "" - None (remove it)
- split_inline_style: list = list(filter(None, style.split(";")))
- # 3. Duplicate styles check - if the tag had duplicate styles
- # split_inline_style = self.duplicate_styles_check(split_inline_style)
- # 4. Processing indents
- inline_style: str = self.indents_processing(split_inline_style)
- return inline_style
+ if self.tag_inline_style.attrs.get("style"):
+ inline_style = self.tag_inline_style.attrs.get("style") + ";"
+ # 1. Remove white color if tag doesn't have background color in style
+ inline_style = self.remove_white_if_no_bgcolor(
+ inline_style, self.tag_inline_style)
+ inline_style = inline_style.replace(
+ "list-style-image", "list-style-type")
+ # 2. Create list of styles from inline style
+ # replace all spaces between "; & letter" to ";"
+ style = re.sub(r"; *", ";", inline_style)
+ # when we split style by ";", last element of the list is "" - None (remove it)
+ split_inline_style: list = list(filter(None, style.split(";")))
+ # 3. Duplicate styles check - if the tag had duplicate styles
+ # split_inline_style = self.duplicate_styles_check(split_inline_style)
+ # 4. Processing indents
+ inline_style: str = self.indents_processing(split_inline_style)
+ return inline_style
+ else:
+ return ""
@staticmethod
def check_style_to_be_tag(style: str) -> List[tuple]:
diff --git a/src/livecarta_config.py b/src/livecarta_config.py
index c050d8f..7930112 100644
--- a/src/livecarta_config.py
+++ b/src/livecarta_config.py
@@ -59,6 +59,7 @@ class LiveCartaConfig:
"font-style": ["italic"], #
"text-decoration": ["underline", "line-through"], # ,
"text-decoration-line": ["underline", "line-through"], # ,
+ "text-transform": [],
"vertical-align": ["super"], #
"color": [],
"background-color": [],
@@ -76,4 +77,5 @@ class LiveCartaConfig:
"margin-left": [],
"margin-top": [],
"margin": [],
+
}
diff --git a/src/style_reader.py b/src/style_reader.py
index 9810caf..daa2c3e 100644
--- a/src/style_reader.py
+++ b/src/style_reader.py
@@ -1,6 +1,6 @@
import re
import cssutils
-from typing import Tuple
+from typing import List, Tuple, Union
from os.path import dirname, normpath, join
from src.util.color_reader import str2hex
@@ -16,28 +16,29 @@ class StyleReader:
to suit LiveCarta style convention.
"""
self.LIVECARTA_STYLE_ATTRS_MAPPING = {
- "text-indent": self.convert_indents_tag_values,
+ "text-indent": lambda x: self.convert_tag_style_values(x, is_indent=True),
"font-variant": lambda x: x,
"text-align": lambda x: x,
"font": lambda x: "",
"font-family": lambda x: x,
"font-size": self.convert_tag_style_values,
+ "text-transform": lambda x: x,
"color": self.get_text_color,
"background-color": self.get_bg_color,
"background": self.get_bg_color,
- "border": lambda x: x if x != "0" else "",
- "border-top-width": lambda x: x if x != "0" else "",
- "border-right-width": lambda x: x if x != "0" else "",
- "border-left-width": lambda x: x if x != "0" else "",
- "border-bottom-width": lambda x: x if x != "0" else "",
- "border-top": lambda x: x if x != "0" else "",
- "border-bottom": lambda x: x if x != "0" else "",
+ "border": self.convert_tag_style_values,
+ "border-top-width": self.convert_tag_style_values,
+ "border-right-width": self.convert_tag_style_values,
+ "border-left-width": self.convert_tag_style_values,
+ "border-bottom-width": self.convert_tag_style_values,
+ "border-top": self.convert_tag_style_values,
+ "border-bottom": self.convert_tag_style_values,
"list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
"list-style-image": lambda x: "disc",
- "margin-left": self.convert_indents_tag_values,
- "margin-top": self.convert_tag_style_values,
- "margin": self.convert_indents_tag_values,
- "width": self.convert_tag_style_values,
+ "margin-left": lambda x: self.convert_tag_style_values(x, is_indent=True),
+ "margin-top": lambda x: self.convert_tag_style_values(x, is_indent=True),
+ "margin": lambda x: self.convert_tag_style_values(x, is_indent=True),
+ "width": lambda x: self.convert_tag_style_values(x) if "%" not in x else x
}
@staticmethod
@@ -68,43 +69,26 @@ class StyleReader:
-------
size_value: str
converted value size
+
"""
- size_regexp = re.compile(
- r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)|(^-*(\d*\.*\d+)in$)")
- has_style_attrs = re.search(size_regexp, size_value)
- if has_style_attrs:
- if has_style_attrs.group(1):
+ def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
+ size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
+ return str(size_number) + "px"
+ has_size = re.search(r"(\d+)([\w%]+)", size_value)
+ values: List = size_value.split(" ")
+ if has_size:
+ size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
+ if has_size.group(2) == "%":
multiplier = 5.76 if is_indent else 0.16
- size_value = float(size_value.replace("%", "")) * multiplier
- return str(size_value) + "px"
- elif has_style_attrs.group(3):
+ values[size_number_idx] = convert_size_number(values[size_number_idx], "%", multiplier)
+ elif has_size.group(2) == "em":
multiplier = 18 if is_indent else 16
- size_value = float(size_value.replace("em", "")) * multiplier
- return str(size_value) + "px"
- elif has_style_attrs.group(5):
- size_value = float(size_value.replace("pt", "")) * 4/3
- return str(size_value) + "px"
- elif has_style_attrs.group(7):
- size_value = float(size_value.replace("in", "")) * 96
- return str(size_value) + "px"
- else:
- return ""
- return size_value
-
- def convert_indents_tag_values(self, size_value: str) -> str:
- """
- Function converts values of ["text-indent", "margin-left", "margin"]
- Parameters
- ----------
- size_value: str
-
- Returns
- -------
- size_value: str
-
- """
- size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
- else self.convert_tag_style_values(size_value.split(" ")[-1], True)
+ values[size_number_idx] = convert_size_number(values[size_number_idx], "em", multiplier)
+ elif has_size.group(2) == "pt":
+ values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3)
+ elif has_size.group(2) == "in":
+ values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96)
+ size_value = " ".join(values)
return size_value
@staticmethod
@@ -125,17 +109,18 @@ class StyleReader:
return constraints_on_value, value_not_in_possible_values_list
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
- for i, style in enumerate(split_style):
+ for i, style in reversed(list(enumerate(split_style))):
style_name, style_value = style.split(":")
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
- # property not in LIVECARTA_STYLE_ATTRS, remove from css file
- split_style[i] = ""
- return split_style
+ # property not in LIVECARTA_STYLE_ATTRS, remove
+ split_style.remove(style)
+ continue
cleaned_value = self.clean_value(style_value, style_name)
if all(self.style_conditions(cleaned_value, style_name)):
- # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
- split_style[i] = ""
+ # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
+ split_style.remove(style)
+ continue
else:
if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data
@@ -156,7 +141,7 @@ class StyleReader:
split_style = self.update_inline_styles_to_livecarta_convention(
split_style)
- style = "; ".join(split_style)
+ style = "; ".join(split_style) if split_style else ""
return style
def process_inline_styles_in_html_soup(self, html_content):
diff --git a/src/util/color_reader.py b/src/util/color_reader.py
index 92b3ee7..31bab3b 100644
--- a/src/util/color_reader.py
+++ b/src/util/color_reader.py
@@ -103,7 +103,7 @@ def str2hex(s: str) -> str:
return rgb_percent_to_hex((r, g, b))
if "rgb" in s.lower():
- rgba = re.findall("([0-9] *\.?[0-9]+)", s)
+ rgba = re.findall("(\d+(?:\.\d+)?)", s)
r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2])
if len(rgba) == 4:
alpha = float(rgba[3])