Merge pull request #302 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
This commit is contained in:
Kiryl
2022-09-22 14:22:47 +03:00
committed by GitHub
9 changed files with 181 additions and 113 deletions

View File

@@ -1,7 +1,25 @@
# About
<h1 align="center"> Converter </h1> <br>
<p align="center">
<a href="https://livecarta.com/">
<img alt="LiveCarta converter" title="LiveCarta converter" src="https://assets.openstax.org/oscms-prodcms/media/partner_logos/LiveCarta_Logo.png" width="450">
</a>
</p>
This repository contains code related to docx/epub files conversion to livecarta inner format.
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
## Table of Contents
- [Introduction](#introduction)
- [Features](#features)
- [Top level project structure](#top-level-project-structure)
- [How it Works](#how-it-works)
- [Setup](#setup)
- [Development](#development)
- [How to use](#how-to-use)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
## Introduction
This is a Python 3 project for converting Docx|Epub documents -> LiveCarta inner format.
The LiveCarta book format is a tree structure in which the nodes are chapters.
Livecarta chapter is title + html code. Livecarta html code follows some restrictions:
@@ -12,10 +30,57 @@ Livecarta chapter is title + html code. Livecarta html code follows some restric
- Styles are added as _inline_, i.e. attribute `style` in html tag.
- Each tag has its own restrictions on attributes and style. See doc/style_config
## Features
- Converts Epub, Docx to JSON(LiveCarta inner format)
- Compatible with python 3
- Very small size (only .py files)
- Multithreaded
# Top level project structure
## Top level project structure
- `consumer.py` - code which is responsible for receiving messages from rabbitMQ
- class `Access` - contains API code which is responsible for interaction with server.
- class `Solver` - contains code responsible for pipeline of solving the task: receiving book file, conversion, status updating, sending result back to server.
- `livecarta_config.py `- constants that depend on LiveCarta
- `livecarta_config.py `- constants that depend on LiveCarta
## How it Works
There are **2 approaches**, each of which works in 3 steps:
#### Epub
**Step 1** - Add CSS to HTML inline_style
**Step 2** - Process every HTML chapter of Epub with presets
**Step 3** - Convert dicts of HTML to JSON(LiveCarta inner format)
#### Docx
**Step 1** - Conversion of DOCX to HTML via LibreOffice
**Step 2** - Process HTML with presets
**Step 3** - Conversion of HTML to JSON(LiveCarta inner format)
## Setup
python -m pip install -r requirements.txt
### Development
To fix a bug or enhance an existing module, follow these steps:
- Fork the repo
- Create a new branch (`git checkout -b improve-feature`)
- Make the appropriate changes in the files
- Add changes to reflect the changes made
- Commit your changes (`git commit -am 'Improve feature'`)
- Push to the branch (`git push origin improve-feature`)
- Create a Pull Request
## How to Use
**a.** Run `consumer.py`
The script constantly waits for a message from the RabbitMQ queue, into which a book is loaded via "Import File to Convert" in the admin panel.
You can also upload a book that has been converted locally using `def local_convert()` in `consumer.py`.
**b.** Run `docx_solver.py`
1. You need to run it on a Linux system; if you're using Windows, use a Python Docker interpreter instead
2. Upload a book to books/docx/ and set the variable `docx_file_path = books/docx/book_name` in `__main__`
**c.** Run `epub_solver.py`
Before that, upload a book to books/epub/ and set the variable `epub_file_path = books/epub/book_name` in `__main__`

View File

@@ -16,6 +16,10 @@
"name": "border",
"value": ".*"
},
{
"name": "style",
"value": "border.*"
},
{
"name": "bgcolor",
"value": ".*"
@@ -42,14 +46,14 @@
"preset_name": "replacer",
"rules": [
{
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"],
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
"condition": null,
"tag_to_replace": "p"
},
{
"tags": ["^aside$"],
"condition": null,
"tag_to_replace": "blockquote"
"tag_to_replace": "div"
},
{
"tags": ["^header$", "^footer$"],
@@ -65,6 +69,11 @@
},
"tag_to_replace": "span"
},
{
"tags": ["^em$"],
"condition": null,
"tag_to_replace": "i"
},
{
"tags": ["^b$"],
"condition": null,
@@ -101,6 +110,7 @@
{
"tags": [
"^section$",
"^blockquote$",
"^article$",
"^figcaption$",
"^main$",
@@ -131,6 +141,11 @@
"attrs": null
},
"tag_to_insert": "code"
},
{
"tags": ["^h[1-5]$"],
"condition": null,
"tag_to_insert": "strong"
}
]
}

View File

@@ -13,8 +13,7 @@ from src.inline_style_processor import modify_html_soup_with_css_styles
class HtmlDocxProcessor:
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
self.logger = logger
self.html_soup = html_soup
self.body_tag = self.html_soup.body
self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body))
self.html_preprocessor = html_preprocessor
self.style_preprocessor = style_preprocessor
self.content: List[Tag] = []
@@ -23,7 +22,6 @@ class HtmlDocxProcessor:
for font in self.body_tag.find_all("font"):
font.name = "span"
def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all(
"a", {"href": re.compile("^.*http.+")})
@@ -206,10 +204,9 @@ class HtmlDocxProcessor:
else:
h_tag.unwrap()
def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated
toc_tag = self.html_soup.new_tag("TOC")
toc_tag = self.body_tag.new_tag("TOC")
if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:]
@@ -229,7 +226,7 @@ class HtmlDocxProcessor:
self.body_tag)
self.logger.log("Inline style processing.")
modify_html_soup_with_css_styles(self.body_tag)
self.body_tag = modify_html_soup_with_css_styles(self.body_tag)
self.logger.log("Image processing.")
images = process_images(access, path_to_html=html_path,
@@ -256,9 +253,9 @@ class HtmlDocxProcessor:
self.logger.log(f".html using presets processing.")
_process_presets(html_preprocessor=self.html_preprocessor,
html_soup=self.html_soup)
html_soup=self.body_tag)
self.content = self.body_tag.find_all(recursive=False)
self.content = self.body_tag.body.find_all(recursive=False)
# delete text before table of content if exists
self.delete_content_before_toc()

View File

@@ -1,5 +1,5 @@
import re
from typing import Union
from typing import List, Union
from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
clean/remove headings & add span with id
"""
title_of_chapter = title_of_chapter.lower()
for tag in chapter_tag.contents:
tag: PageElement
def text_preparing(tag: PageElement):
text: str = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
if not isinstance(tag, NavigableString):
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag)
tag.extract()
return
elif not self._remove_headings_content(tag, title_of_chapter):
break
else:
tag.extract()
return
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
return text
title_of_chapter: str = title_of_chapter.lower()
title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
(title_of_chapter in text_preparing(tag) and
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
if title_in_text:
self.html_preprocessor._add_span_to_save_ids_for_links(
title_in_text[-1], chapter_tag)
title_in_text[-1].extract()
elif text_in_title:
[self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag) for tag in text_in_title]
[tag.extract() for tag in text_in_title]
@staticmethod
def _class_removing(chapter_tag: BeautifulSoup):

View File

@@ -28,6 +28,7 @@ class HtmlPresetsProcessor:
@staticmethod
def _decompose_tag(**kwargs):
kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
kwargs["tag"].decompose()
@staticmethod
@@ -112,6 +113,7 @@ class HtmlPresetsProcessor:
@staticmethod
def _unwrap_tag(**kwargs):
kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
kwargs["tag"].unwrap()
@staticmethod
@@ -153,7 +155,6 @@ class HtmlPresetsProcessor:
for parent_tag in body_tag.select(condition_on_tag[1]):
for tag in parent_tag.find_all([re.compile(tag) for tag in tags]):
# parent_tag != tag.parent
tag.parent.attrs.update(tag.attrs)
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):

View File

@@ -14,7 +14,7 @@ class InlineStyleProcessor:
def __init__(self, tag_inline_style: Tag):
# tag with inline style + style parsed from css file
self.tag_inline_style = tag_inline_style
self.tag_inline_style.attrs['style']: str = self.process_inline_style()
self.tag_inline_style.attrs["style"]: str = self.process_inline_style()
@staticmethod
def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
@@ -80,19 +80,19 @@ class InlineStyleProcessor:
processed_style = ";".join(split_style)+';'
margin_left_regexp = re.compile(
r"((margin-left|margin): *(-*\w+);*)")
r"((margin-left|margin): *-*((\d*)\.*\d+)\w+;*)")
text_indent_regexp = re.compile(
r"(text-indent: *(-*\w+);*)")
r"(text-indent: *-*((\d*)\.*\d+)\w+;*)")
has_margin = re.search(margin_left_regexp, processed_style)
has_text_indent = re.search(text_indent_regexp, processed_style)
if has_margin:
num_m = abs(int("0" + "".join(
filter(str.isdigit, str(has_margin.group(3))))))
filter(str.isdigit, str(has_margin.group(4))))))
if has_text_indent:
num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2))))))
filter(str.isdigit, str(has_text_indent.group(3))))))
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(num_m - num_ti)) + "px; ")
processed_style = processed_style.replace(
@@ -106,7 +106,7 @@ class InlineStyleProcessor:
elif has_text_indent:
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2)))))))
filter(str.isdigit, str(has_text_indent.group(3)))))))
+ "px; ")
return processed_style
return processed_style
@@ -127,22 +127,25 @@ class InlineStyleProcessor:
processed inline style
"""
inline_style = self.tag_inline_style.attrs.get("style") + ";"
# 1. Remove white color if tag doesn"t have background color in style
inline_style = self.remove_white_if_no_bgcolor(
inline_style, self.tag_inline_style)
inline_style = inline_style.replace(
"list-style-image", "list-style-type")
# 2. Create list of styles from inline style
# replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", inline_style)
# when we split style by ";", last element of the list is "" - None (remove it)
split_inline_style: list = list(filter(None, style.split(";")))
# 3. Duplicate styles check - if the tag had duplicate styles
# split_inline_style = self.duplicate_styles_check(split_inline_style)
# 4. Processing indents
inline_style: str = self.indents_processing(split_inline_style)
return inline_style
if self.tag_inline_style.attrs.get("style"):
inline_style = self.tag_inline_style.attrs.get("style") + ";"
# 1. Remove white color if tag doesn't have background color in style
inline_style = self.remove_white_if_no_bgcolor(
inline_style, self.tag_inline_style)
inline_style = inline_style.replace(
"list-style-image", "list-style-type")
# 2. Create list of styles from inline style
# replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", inline_style)
# when we split style by ";", last element of the list is "" - None (remove it)
split_inline_style: list = list(filter(None, style.split(";")))
# 3. Duplicate styles check - if the tag had duplicate styles
# split_inline_style = self.duplicate_styles_check(split_inline_style)
# 4. Processing indents
inline_style: str = self.indents_processing(split_inline_style)
return inline_style
else:
return ""
@staticmethod
def check_style_to_be_tag(style: str) -> List[tuple]:

View File

@@ -59,6 +59,7 @@ class LiveCartaConfig:
"font-style": ["italic"], # <i>
"text-decoration": ["underline", "line-through"], # <u> , <s>
"text-decoration-line": ["underline", "line-through"], # <u> , <s>
"text-transform": [],
"vertical-align": ["super"], # <sup>
"color": [],
"background-color": [],
@@ -76,4 +77,5 @@ class LiveCartaConfig:
"margin-left": [],
"margin-top": [],
"margin": [],
}

View File

@@ -1,6 +1,6 @@
import re
import cssutils
from typing import Tuple
from typing import List, Tuple, Union
from os.path import dirname, normpath, join
from src.util.color_reader import str2hex
@@ -16,28 +16,29 @@ class StyleReader:
to suit LiveCarta style convention.
"""
self.LIVECARTA_STYLE_ATTRS_MAPPING = {
"text-indent": self.convert_indents_tag_values,
"text-indent": lambda x: self.convert_tag_style_values(x, is_indent=True),
"font-variant": lambda x: x,
"text-align": lambda x: x,
"font": lambda x: "",
"font-family": lambda x: x,
"font-size": self.convert_tag_style_values,
"text-transform": lambda x: x,
"color": self.get_text_color,
"background-color": self.get_bg_color,
"background": self.get_bg_color,
"border": lambda x: x if x != "0" else "",
"border-top-width": lambda x: x if x != "0" else "",
"border-right-width": lambda x: x if x != "0" else "",
"border-left-width": lambda x: x if x != "0" else "",
"border-bottom-width": lambda x: x if x != "0" else "",
"border-top": lambda x: x if x != "0" else "",
"border-bottom": lambda x: x if x != "0" else "",
"border": self.convert_tag_style_values,
"border-top-width": self.convert_tag_style_values,
"border-right-width": self.convert_tag_style_values,
"border-left-width": self.convert_tag_style_values,
"border-bottom-width": self.convert_tag_style_values,
"border-top": self.convert_tag_style_values,
"border-bottom": self.convert_tag_style_values,
"list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
"list-style-image": lambda x: "disc",
"margin-left": self.convert_indents_tag_values,
"margin-top": self.convert_tag_style_values,
"margin": self.convert_indents_tag_values,
"width": self.convert_tag_style_values,
"margin-left": lambda x: self.convert_tag_style_values(x, is_indent=True),
"margin-top": lambda x: self.convert_tag_style_values(x, is_indent=True),
"margin": lambda x: self.convert_tag_style_values(x, is_indent=True),
"width": lambda x: self.convert_tag_style_values(x) if "%" not in x else x
}
@staticmethod
@@ -68,43 +69,26 @@ class StyleReader:
-------
size_value: str
converted value size
"""
size_regexp = re.compile(
r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)|(^-*(\d*\.*\d+)in$)")
has_style_attrs = re.search(size_regexp, size_value)
if has_style_attrs:
if has_style_attrs.group(1):
def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
return str(size_number) + "px"
has_size = re.search(r"(\d+)([\w%]+)", size_value)
values: List = size_value.split(" ")
if has_size:
size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
if has_size.group(2) == "%":
multiplier = 5.76 if is_indent else 0.16
size_value = float(size_value.replace("%", "")) * multiplier
return str(size_value) + "px"
elif has_style_attrs.group(3):
values[size_number_idx] = convert_size_number(values[size_number_idx], "%", multiplier)
elif has_size.group(2) == "em":
multiplier = 18 if is_indent else 16
size_value = float(size_value.replace("em", "")) * multiplier
return str(size_value) + "px"
elif has_style_attrs.group(5):
size_value = float(size_value.replace("pt", "")) * 4/3
return str(size_value) + "px"
elif has_style_attrs.group(7):
size_value = float(size_value.replace("in", "")) * 96
return str(size_value) + "px"
else:
return ""
return size_value
def convert_indents_tag_values(self, size_value: str) -> str:
"""
Function converts values of ["text-indent", "margin-left", "margin"]
Parameters
----------
size_value: str
Returns
-------
size_value: str
"""
size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
else self.convert_tag_style_values(size_value.split(" ")[-1], True)
values[size_number_idx] = convert_size_number(values[size_number_idx], "em", multiplier)
elif has_size.group(2) == "pt":
values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3)
elif has_size.group(2) == "in":
values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96)
size_value = " ".join(values)
return size_value
@staticmethod
@@ -125,17 +109,18 @@ class StyleReader:
return constraints_on_value, value_not_in_possible_values_list
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
for i, style in enumerate(split_style):
for i, style in reversed(list(enumerate(split_style))):
style_name, style_value = style.split(":")
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
split_style[i] = ""
return split_style
# property not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style)
continue
cleaned_value = self.clean_value(style_value, style_name)
if all(self.style_conditions(cleaned_value, style_name)):
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
split_style[i] = ""
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style)
continue
else:
if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data
@@ -156,7 +141,7 @@ class StyleReader:
split_style = self.update_inline_styles_to_livecarta_convention(
split_style)
style = "; ".join(split_style)
style = "; ".join(split_style) if split_style else ""
return style
def process_inline_styles_in_html_soup(self, html_content):

View File

@@ -103,7 +103,7 @@ def str2hex(s: str) -> str:
return rgb_percent_to_hex((r, g, b))
if "rgb" in s.lower():
rgba = re.findall("([0-9] *\.?[0-9]+)", s)
rgba = re.findall("(\d+(?:\.\d+)?)", s)
r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2])
if len(rgba) == 4:
alpha = float(rgba[3])