diff --git a/README.md b/README.md index b0faa3d..dc32dde 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,25 @@ -# About +

Converter


+

+ + LiveCarta converter + +

-This repository contains code related to docx/epub files conversion to livecarta inner format. + + +## Table of Contents +- [Introduction](#introduction) +- [Features](#features) +- [Top level project structure](#top-level-project-structure) +- [How it Works](#how-it-works) +- [Setup](#setup) + - [Development](#development) +- [How to use](#how-to-use) + + +## Introduction +This is a Python 3 project for converting Docx|Epub documents -> LiveCarta inner format. Livecarta book format is tree structure, where nodes are chapters. Livecarta chapter is title + html code. Livecarta html code follows some restrictions: @@ -12,10 +30,57 @@ Livecarta chapter is title + html code. Livecarta html code follows some restric - Styles are added as _inline_, i.e. attribute `style` in html tag. - Each tag has its own restrictions on attributes and style. See doc/style_config +## Features +- Converts Epub, Docx to JSON(LiveCarta inner format) +- Compatible with python 3 +- Very small size (only .py files) +- Multithreaded -# Top level project structure - +## Top level project structure - `consumer.py` - code which is responsible for receiving messages from rabbitMQ - class `Access` - contains API code which is responsible for interaction with server. - class `Solver` - contains code responsible for pipeline of solving the task: receiving book file, conversion, status updating, sending result back to server. 
-- `livecarta_config.py `- constants that depend on LiveCarta \ No newline at end of file +- `livecarta_config.py` - constants that depend on LiveCarta + +## How it Works +There are **2 approaches**, each working in 3 steps: +#### Epub +**Step 1** - Add CSS to HTML inline_style + +**Step 2** - Process every HTML chapter of Epub with presets + +**Step 3** - Convert dicts of HTML to JSON(LiveCarta inner format) + +#### Docx +**Step 1** - Conversion of DOCX to HTML via LibreOffice + +**Step 2** - Process HTML with presets + +**Step 3** - Conversion of HTML to JSON(LiveCarta inner format) + +## Setup + + python -m pip install -r requirements.txt + +### Development +To fix a bug or enhance an existing module, follow these steps: + +- Fork the repo +- Create a new branch (`git checkout -b improve-feature`) +- Make the appropriate changes in the files +- Add changes to reflect the changes made +- Commit your changes (`git commit -am 'Improve feature'`) +- Push to the branch (`git push origin improve-feature`) +- Create a Pull Request + +## How to Use +**a.** Run `consumer.py` +The script will be constantly waiting for a message from the queue (RabbitMQ), into which the book is loaded via "Import File to Convert" in the admin panel +You can also upload a book that has been converted locally using `def local_convert()` in `consumer.py` + +**b.** Run `docx_solver.py` +1. You need to run it on a Linux system; if you're using Windows, just use a Python Docker interpreter +2. 
Upload a book to books/docx/ and set the variable `docx_file_path = books/docx/book_name` in __main__ + +**c.** Run `epub_solver.py` +Before that upload a book to books/epub/ and set the variable `epub_file_path = books/epub/book_name` in __main__ diff --git a/presets/epub_presets.json b/presets/epub_presets.json index 1ff62a8..07e191c 100644 --- a/presets/epub_presets.json +++ b/presets/epub_presets.json @@ -16,6 +16,10 @@ "name": "border", "value": ".*" }, + { + "name": "style", + "value": "border.*" + }, { "name": "bgcolor", "value": ".*" @@ -42,14 +46,14 @@ "preset_name": "replacer", "rules": [ { - "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"], + "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"], "condition": null, "tag_to_replace": "p" }, { "tags": ["^aside$"], "condition": null, - "tag_to_replace": "blockquote" + "tag_to_replace": "div" }, { "tags": ["^header$", "^footer$"], @@ -65,6 +69,11 @@ }, "tag_to_replace": "span" }, + { + "tags": ["^em$"], + "condition": null, + "tag_to_replace": "i" + }, { "tags": ["^b$"], "condition": null, @@ -101,6 +110,7 @@ { "tags": [ "^section$", + "^blockquote$", "^article$", "^figcaption$", "^main$", @@ -131,6 +141,11 @@ "attrs": null }, "tag_to_insert": "code" + }, + { + "tags": ["^h[1-5]$"], + "condition": null, + "tag_to_insert": "strong" } ] } diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py index 8650865..c92e997 100644 --- a/src/docx_converter/html_docx_processor.py +++ b/src/docx_converter/html_docx_processor.py @@ -13,8 +13,7 @@ from src.inline_style_processor import modify_html_soup_with_css_styles class HtmlDocxProcessor: def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor): self.logger = logger - self.html_soup = html_soup - self.body_tag = self.html_soup.body + self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body)) self.html_preprocessor = html_preprocessor 
self.style_preprocessor = style_preprocessor self.content: List[Tag] = [] @@ -23,7 +22,6 @@ class HtmlDocxProcessor: for font in self.body_tag.find_all("font"): font.name = "span" - def _process_hrefs(self): a_tags_with_href = self.body_tag.find_all( "a", {"href": re.compile("^.*http.+")}) @@ -206,10 +204,9 @@ class HtmlDocxProcessor: else: h_tag.unwrap() - def delete_content_before_toc(self): # remove all tag upper the only in content !!! body tag is not updated - toc_tag = self.html_soup.new_tag("TOC") + toc_tag = self.body_tag.new_tag("TOC") if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 self.content = self.content[ind:] @@ -229,7 +226,7 @@ class HtmlDocxProcessor: self.body_tag) self.logger.log("Inline style processing.") - modify_html_soup_with_css_styles(self.body_tag) + self.body_tag = modify_html_soup_with_css_styles(self.body_tag) self.logger.log("Image processing.") images = process_images(access, path_to_html=html_path, @@ -256,9 +253,9 @@ class HtmlDocxProcessor: self.logger.log(f".html using presets processing.") _process_presets(html_preprocessor=self.html_preprocessor, - html_soup=self.html_soup) + html_soup=self.body_tag) - self.content = self.body_tag.find_all(recursive=False) + self.content = self.body_tag.body.find_all(recursive=False) # delete text before table of content if exists self.delete_content_before_toc() diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index e92ac8b..40640c1 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -1,5 +1,5 @@ import re -from typing import Union +from typing import List, Union from bs4.element import PageElement from bs4 import BeautifulSoup, Tag, NavigableString, Comment @@ -92,26 +92,26 @@ class HtmlEpubProcessor: clean/remove headings & add span with id """ - title_of_chapter = title_of_chapter.lower() - for tag in chapter_tag.contents: - tag: PageElement + def text_preparing(tag: 
PageElement): text: str = tag if isinstance(tag, NavigableString) else tag.text - if re.sub(r"[\s\xa0]", "", text): - text = re.sub(r"[\s\xa0]", " ", text).lower() - text = text.strip() # delete extra spaces - if not isinstance(tag, NavigableString): - if title_of_chapter == text or \ - (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): - self.html_preprocessor._add_span_to_save_ids_for_links( - tag, chapter_tag) - tag.extract() - return - elif not self._remove_headings_content(tag, title_of_chapter): - break - else: - tag.extract() - return + text = re.sub(r"[\s\xa0]", " ", text).lower() + text = text.strip() # delete extra spaces + return text + + title_of_chapter: str = title_of_chapter.lower() + title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \ + (title_of_chapter in text_preparing(tag) and + re.findall(r"^h[1-3]$", tag.name or chapter_tag.name))) + + text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter)) + if title_in_text: + self.html_preprocessor._add_span_to_save_ids_for_links( + title_in_text[-1], chapter_tag) + title_in_text[-1].extract() + elif text_in_title: + [self.html_preprocessor._add_span_to_save_ids_for_links( + tag, chapter_tag) for tag in text_in_title] + [tag.extract() for tag in text_in_title] @staticmethod def _class_removing(chapter_tag: BeautifulSoup): diff --git a/src/html_presets_processor.py b/src/html_presets_processor.py index 30f7906..eeba3ca 100644 --- a/src/html_presets_processor.py +++ b/src/html_presets_processor.py @@ -28,6 +28,7 @@ class HtmlPresetsProcessor: @staticmethod def _decompose_tag(**kwargs): + kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs) kwargs["tag"].decompose() @staticmethod @@ -112,6 +113,7 @@ class HtmlPresetsProcessor: @staticmethod def _unwrap_tag(**kwargs): + kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs) kwargs["tag"].unwrap() @staticmethod @@ -153,7 
+155,6 @@ class HtmlPresetsProcessor: for parent_tag in body_tag.select(condition_on_tag[1]): for tag in parent_tag.find_all([re.compile(tag) for tag in tags]): # parent_tag != tag.parent - tag.parent.attrs.update(tag.attrs) action(body_tag=body_tag, tag=tag, rule=rule) elif condition_on_tag[0] == "child_tags": for tag in body_tag.find_all([re.compile(tag) for tag in tags]): diff --git a/src/inline_style_processor.py b/src/inline_style_processor.py index d63122a..cc7c14d 100644 --- a/src/inline_style_processor.py +++ b/src/inline_style_processor.py @@ -14,7 +14,7 @@ class InlineStyleProcessor: def __init__(self, tag_inline_style: Tag): # tag with inline style + style parsed from css file self.tag_inline_style = tag_inline_style - self.tag_inline_style.attrs['style']: str = self.process_inline_style() + self.tag_inline_style.attrs["style"]: str = self.process_inline_style() @staticmethod def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str: @@ -80,19 +80,19 @@ class InlineStyleProcessor: processed_style = ";".join(split_style)+';' margin_left_regexp = re.compile( - r"((margin-left|margin): *(-*\w+);*)") + r"((margin-left|margin): *-*((\d*)\.*\d+)\w+;*)") text_indent_regexp = re.compile( - r"(text-indent: *(-*\w+);*)") + r"(text-indent: *-*((\d*)\.*\d+)\w+;*)") has_margin = re.search(margin_left_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style) if has_margin: num_m = abs(int("0" + "".join( - filter(str.isdigit, str(has_margin.group(3)))))) + filter(str.isdigit, str(has_margin.group(4)))))) if has_text_indent: num_ti = abs(int("0" + "".join( - filter(str.isdigit, str(has_text_indent.group(2)))))) + filter(str.isdigit, str(has_text_indent.group(3)))))) processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(num_m - num_ti)) + "px; ") processed_style = processed_style.replace( @@ -106,7 +106,7 @@ class InlineStyleProcessor: elif has_text_indent: processed_style = 
processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(int("0" + "".join( - filter(str.isdigit, str(has_text_indent.group(2))))))) + filter(str.isdigit, str(has_text_indent.group(3))))))) + "px; ") return processed_style return processed_style @@ -127,22 +127,25 @@ class InlineStyleProcessor: processed inline style """ - inline_style = self.tag_inline_style.attrs.get("style") + ";" - # 1. Remove white color if tag doesn"t have background color in style - inline_style = self.remove_white_if_no_bgcolor( - inline_style, self.tag_inline_style) - inline_style = inline_style.replace( - "list-style-image", "list-style-type") - # 2. Create list of styles from inline style - # replace all spaces between "; & letter" to ";" - style = re.sub(r"; *", ";", inline_style) - # when we split style by ";", last element of the list is "" - None (remove it) - split_inline_style: list = list(filter(None, style.split(";"))) - # 3. Duplicate styles check - if the tag had duplicate styles - # split_inline_style = self.duplicate_styles_check(split_inline_style) - # 4. Processing indents - inline_style: str = self.indents_processing(split_inline_style) - return inline_style + if self.tag_inline_style.attrs.get("style"): + inline_style = self.tag_inline_style.attrs.get("style") + ";" + # 1. Remove white color if tag doesn't have background color in style + inline_style = self.remove_white_if_no_bgcolor( + inline_style, self.tag_inline_style) + inline_style = inline_style.replace( + "list-style-image", "list-style-type") + # 2. Create list of styles from inline style + # replace all spaces between "; & letter" to ";" + style = re.sub(r"; *", ";", inline_style) + # when we split style by ";", last element of the list is "" - None (remove it) + split_inline_style: list = list(filter(None, style.split(";"))) + # 3. Duplicate styles check - if the tag had duplicate styles + # split_inline_style = self.duplicate_styles_check(split_inline_style) + # 4. 
Processing indents + inline_style: str = self.indents_processing(split_inline_style) + return inline_style + else: + return "" @staticmethod def check_style_to_be_tag(style: str) -> List[tuple]: diff --git a/src/livecarta_config.py b/src/livecarta_config.py index c050d8f..7930112 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -59,6 +59,7 @@ class LiveCartaConfig: "font-style": ["italic"], # "text-decoration": ["underline", "line-through"], # , "text-decoration-line": ["underline", "line-through"], # , + "text-transform": [], "vertical-align": ["super"], # "color": [], "background-color": [], @@ -76,4 +77,5 @@ class LiveCartaConfig: "margin-left": [], "margin-top": [], "margin": [], + } diff --git a/src/style_reader.py b/src/style_reader.py index 9810caf..daa2c3e 100644 --- a/src/style_reader.py +++ b/src/style_reader.py @@ -1,6 +1,6 @@ import re import cssutils -from typing import Tuple +from typing import List, Tuple, Union from os.path import dirname, normpath, join from src.util.color_reader import str2hex @@ -16,28 +16,29 @@ class StyleReader: to suit LiveCarta style convention. 
""" self.LIVECARTA_STYLE_ATTRS_MAPPING = { - "text-indent": self.convert_indents_tag_values, + "text-indent": lambda x: self.convert_tag_style_values(x, is_indent=True), "font-variant": lambda x: x, "text-align": lambda x: x, "font": lambda x: "", "font-family": lambda x: x, "font-size": self.convert_tag_style_values, + "text-transform": lambda x: x, "color": self.get_text_color, "background-color": self.get_bg_color, "background": self.get_bg_color, - "border": lambda x: x if x != "0" else "", - "border-top-width": lambda x: x if x != "0" else "", - "border-right-width": lambda x: x if x != "0" else "", - "border-left-width": lambda x: x if x != "0" else "", - "border-bottom-width": lambda x: x if x != "0" else "", - "border-top": lambda x: x if x != "0" else "", - "border-bottom": lambda x: x if x != "0" else "", + "border": self.convert_tag_style_values, + "border-top-width": self.convert_tag_style_values, + "border-right-width": self.convert_tag_style_values, + "border-left-width": self.convert_tag_style_values, + "border-bottom-width": self.convert_tag_style_values, + "border-top": self.convert_tag_style_values, + "border-bottom": self.convert_tag_style_values, "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", "list-style-image": lambda x: "disc", - "margin-left": self.convert_indents_tag_values, - "margin-top": self.convert_tag_style_values, - "margin": self.convert_indents_tag_values, - "width": self.convert_tag_style_values, + "margin-left": lambda x: self.convert_tag_style_values(x, is_indent=True), + "margin-top": lambda x: self.convert_tag_style_values(x, is_indent=True), + "margin": lambda x: self.convert_tag_style_values(x, is_indent=True), + "width": lambda x: self.convert_tag_style_values(x) if "%" not in x else x } @staticmethod @@ -68,43 +69,26 @@ class StyleReader: ------- size_value: str converted value size + """ - size_regexp = re.compile( - 
r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)|(^-*(\d*\.*\d+)in$)") - has_style_attrs = re.search(size_regexp, size_value) - if has_style_attrs: - if has_style_attrs.group(1): + def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str: + size_number = float(size_number.replace(unit_to_replace, "")) * multiplier + return str(size_number) + "px" + has_size = re.search(r"(\d+)([\w%]+)", size_value) + values: List = size_value.split(" ") + if has_size: + size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0] + if has_size.group(2) == "%": multiplier = 5.76 if is_indent else 0.16 - size_value = float(size_value.replace("%", "")) * multiplier - return str(size_value) + "px" - elif has_style_attrs.group(3): + values[size_number_idx] = convert_size_number(values[size_number_idx], "%", multiplier) + elif has_size.group(2) == "em": multiplier = 18 if is_indent else 16 - size_value = float(size_value.replace("em", "")) * multiplier - return str(size_value) + "px" - elif has_style_attrs.group(5): - size_value = float(size_value.replace("pt", "")) * 4/3 - return str(size_value) + "px" - elif has_style_attrs.group(7): - size_value = float(size_value.replace("in", "")) * 96 - return str(size_value) + "px" - else: - return "" - return size_value - - def convert_indents_tag_values(self, size_value: str) -> str: - """ - Function converts values of ["text-indent", "margin-left", "margin"] - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - - """ - size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\ - else self.convert_tag_style_values(size_value.split(" ")[-1], True) + values[size_number_idx] = convert_size_number(values[size_number_idx], "em", multiplier) + elif has_size.group(2) == "pt": + values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3) + elif has_size.group(2) == 
"in": + values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96) + size_value = " ".join(values) return size_value @staticmethod @@ -125,17 +109,18 @@ class StyleReader: return constraints_on_value, value_not_in_possible_values_list def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: - for i, style in enumerate(split_style): + for i, style in reversed(list(enumerate(split_style))): style_name, style_value = style.split(":") if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" - return split_style + # property not in LIVECARTA_STYLE_ATTRS, remove + split_style.remove(style) + continue cleaned_value = self.clean_value(style_value, style_name) if all(self.style_conditions(cleaned_value, style_name)): - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove + split_style.remove(style) + continue else: if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data @@ -156,7 +141,7 @@ class StyleReader: split_style = self.update_inline_styles_to_livecarta_convention( split_style) - style = "; ".join(split_style) + style = "; ".join(split_style) if split_style else "" return style def process_inline_styles_in_html_soup(self, html_content): diff --git a/src/util/color_reader.py b/src/util/color_reader.py index 92b3ee7..31bab3b 100644 --- a/src/util/color_reader.py +++ b/src/util/color_reader.py @@ -103,7 +103,7 @@ def str2hex(s: str) -> str: return rgb_percent_to_hex((r, g, b)) if "rgb" in s.lower(): - rgba = re.findall("([0-9] *\.?[0-9]+)", s) + rgba = re.findall("(\d+(?:\.\d+)?)", s) r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2]) if len(rgba) == 4: alpha = float(rgba[3])