Merge pull request #302 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
2022-09-22 14:22:47 +03:00
parent b7982e7911 00308b61e7
commit 0b35b869f9
9 changed files with 181 additions and 113 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,25 @@
-# About
+<h1 align="center"> Converter </h1> <br>
+<p align="center">
+  <a href="https://livecarta.com/">
+    <img alt="LiveCarta converter" title="LiveCarta converter" src="https://assets.openstax.org/oscms-prodcms/media/partner_logos/LiveCarta_Logo.png" width="450">
+  </a>
+</p>

-This repository contains code related to docx/epub files conversion to livecarta inner format.
+<!-- START doctoc generated TOC please keep comment here to allow auto update -->
+<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
+## Table of Contents

+- [Introduction](#introduction)
+- [Features](#features)
+- [Top level project structure](#top-level-project-structure)
+- [How it Works](#how-it-works)
+- [Setup](#setup)
+  - [Development](#development)
+- [How to use](#how-to-use)
+<!-- END doctoc generated TOC please keep comment here to allow auto update -->
+
+## Introduction
+This is a Python 3 project for converting Docx|Epub documents -> LiveCarta inner format. 
 Livecarta book format is tree structure, where nodes are chapters. 
 Livecarta chapter is title + html code. Livecarta html code follows some restrictions:

@@ -12,10 +30,57 @@ Livecarta chapter is title + html code. Livecarta html code follows some restric
 - Styles are added as _inline_, i.e. attribute `style` in html tag.
 - Each tag has its own restrictions on attributes and style. See doc/style_config

+## Features
+- Converts Epub, Docx to JSON(LiveCarta inner format)
+- Compatible with python 3
+- Very small size (only .py files)
+- Multithreaded

-# Top level project structure
-
+## Top level project structure
 - `consumer.py` - code which is responsible for receiving messages from  rabbitMQ
 - class `Access` - contains API code which is responsible for interaction with server.
 - class `Solver` - contains code responsible for pipeline of solving the task: receiving book file, conversion, status updating, sending result back to server.
- `livecarta_config.py `- constants that depend on LiveCarta
+- `livecarta_config.py` - constants that depend on LiveCarta
+
+## How it Works
+There are **2 approaches**, each of which works in 3 steps:
+#### Epub
+**Step 1** - Add CSS to HTML inline_style
+
+**Step 2** - Process every HTML chapter of Epub with presets
+
+**Step 3** - Convert dicts of HTML to JSON(LiveCarta inner format)
+
+#### Docx
+**Step 1** - Conversion of DOCX to HTML via LibreOffice
+
+**Step 2** - Process HTML with presets
+
+**Step 3** - Conversion of HTML to JSON(LiveCarta inner format)
+
+## Setup
+
+    python -m pip install -r requirements.txt
+    
+### Development
+To fix a bug or enhance an existing module, follow these steps:
+
+- Fork the repo
+- Create a new branch (`git checkout -b improve-feature`)
+- Make the appropriate changes in the files
+- Add changes to reflect the changes made
+- Commit your changes (`git commit -am 'Improve feature'`)
+- Push to the branch (`git push origin improve-feature`)
+- Create a Pull Request 
+
+## How to use
+**a.** Run `consumer.py`
+The script constantly waits for a message from the queue (RabbitMQ), into which we load the book via Import File to Convert in the admin panel.
+You can also upload a book that has been converted locally using `def local_convert()` in `consumer.py`.
+
+**b.** Run `docx_solver.py`
+1. You need to run it on a Linux system; if you're using Windows, use a Python Docker interpreter instead
+2. Upload a book to books/docx/ and set the variable `docx_file_path = books/docx/book_name` in `__main__`
+
+**c.** Run `epub_solver.py`
+Before that, upload a book to books/epub/ and set the variable `epub_file_path = books/epub/book_name` in `__main__`
--- a/presets/epub_presets.json
+++ b/presets/epub_presets.json
@@ -16,6 +16,10 @@
                            "name": "border",
                            "value": ".*"
                        },
+                        {
+                            "name": "style",
+                            "value": "border.*"
+                        },
                        {
                            "name": "bgcolor",
                            "value": ".*"
@@ -42,14 +46,14 @@
        "preset_name": "replacer",
        "rules": [
            {
-                "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"],
+                "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
                "condition": null,
                "tag_to_replace": "p"
            },
            {
                "tags": ["^aside$"],
                "condition": null,
-                "tag_to_replace": "blockquote"
+                "tag_to_replace": "div"
            },
            {
                "tags": ["^header$", "^footer$"],
@@ -65,6 +69,11 @@
                },
                "tag_to_replace": "span"
            },
+            {
+                "tags": ["^em$"],
+                "condition": null,
+                "tag_to_replace": "i"
+            },
            {
                "tags": ["^b$"],
                "condition": null,
@@ -101,6 +110,7 @@
            {
                "tags": [
                    "^section$",
+                    "^blockquote$",
                    "^article$",
                    "^figcaption$",
                    "^main$",
@@ -131,6 +141,11 @@
                    "attrs": null
                },
                "tag_to_insert": "code"
+            },
+            {
+                "tags": ["^h[1-5]$"],
+                "condition": null,
+                "tag_to_insert": "strong"
            }
        ]
    }
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -13,8 +13,7 @@ from src.inline_style_processor import modify_html_soup_with_css_styles
 class HtmlDocxProcessor:
    def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
        self.logger = logger
-        self.html_soup = html_soup
-        self.body_tag = self.html_soup.body
+        self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body))
        self.html_preprocessor = html_preprocessor
        self.style_preprocessor = style_preprocessor
        self.content: List[Tag] = []
@@ -23,7 +22,6 @@ class HtmlDocxProcessor:
        for font in self.body_tag.find_all("font"):
            font.name = "span"

-
    def _process_hrefs(self):
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^.*http.+")})
@@ -206,10 +204,9 @@ class HtmlDocxProcessor:
            else:
                h_tag.unwrap()

-
    def delete_content_before_toc(self):
        # remove all tag upper the <TOC> only in content !!! body tag is not updated
-        toc_tag = self.html_soup.new_tag("TOC")
+        toc_tag = self.body_tag.new_tag("TOC")
        if toc_tag in self.content:
            ind = self.content.index(toc_tag) + 1
            self.content = self.content[ind:]
@@ -229,7 +226,7 @@ class HtmlDocxProcessor:
            self.body_tag)

        self.logger.log("Inline style processing.")
-        modify_html_soup_with_css_styles(self.body_tag)
+        self.body_tag = modify_html_soup_with_css_styles(self.body_tag)

        self.logger.log("Image processing.")
        images = process_images(access, path_to_html=html_path,
@@ -256,9 +253,9 @@ class HtmlDocxProcessor:

        self.logger.log(f".html using presets processing.")
        _process_presets(html_preprocessor=self.html_preprocessor,
-                         html_soup=self.html_soup)
+                         html_soup=self.body_tag)

-        self.content = self.body_tag.find_all(recursive=False)
+        self.content = self.body_tag.body.find_all(recursive=False)
        # delete text before table of content if exists
        self.delete_content_before_toc()

--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,5 +1,5 @@
 import re
-from typing import Union
+from typing import List, Union
 from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment

@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
            clean/remove headings & add span with id

        """
-        title_of_chapter = title_of_chapter.lower()
-        for tag in chapter_tag.contents:
-            tag: PageElement
+        def text_preparing(tag: PageElement):
            text: str = tag if isinstance(tag, NavigableString) else tag.text
-            if re.sub(r"[\s\xa0]", "", text):
-                text = re.sub(r"[\s\xa0]", " ", text).lower()
-                text = text.strip()  # delete extra spaces
-                if not isinstance(tag, NavigableString):
-                    if title_of_chapter == text or \
-                            (title_of_chapter in text and
-                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
-                        self.html_preprocessor._add_span_to_save_ids_for_links(
-                            tag, chapter_tag)
-                        tag.extract()
-                        return
-                    elif not self._remove_headings_content(tag, title_of_chapter):
-                        break
-                else:
-                    tag.extract()
-                    return
+            text = re.sub(r"[\s\xa0]", " ", text).lower()
+            text = text.strip()  # delete extra spaces
+            return text
+
+        title_of_chapter: str = title_of_chapter.lower()
+        title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
+                                                (title_of_chapter in text_preparing(tag) and
+                                                 re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
+
+        text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
+        if title_in_text:
+            self.html_preprocessor._add_span_to_save_ids_for_links(
+                title_in_text[-1], chapter_tag)
+            title_in_text[-1].extract()
+        elif text_in_title:
+            [self.html_preprocessor._add_span_to_save_ids_for_links(
+                tag, chapter_tag) for tag in text_in_title]
+            [tag.extract() for tag in text_in_title]

    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):
--- a/src/html_presets_processor.py
+++ b/src/html_presets_processor.py
@@ -28,6 +28,7 @@ class HtmlPresetsProcessor:

    @staticmethod
    def _decompose_tag(**kwargs):
+        kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
        kwargs["tag"].decompose()

    @staticmethod
@@ -112,6 +113,7 @@ class HtmlPresetsProcessor:

    @staticmethod
    def _unwrap_tag(**kwargs):
+        kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
        kwargs["tag"].unwrap()

    @staticmethod
@@ -153,7 +155,6 @@ class HtmlPresetsProcessor:
                        for parent_tag in body_tag.select(condition_on_tag[1]):
                            for tag in parent_tag.find_all([re.compile(tag) for tag in tags]):
                                # parent_tag != tag.parent
-                                tag.parent.attrs.update(tag.attrs)
                                action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "child_tags":
                        for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
--- a/src/inline_style_processor.py
+++ b/src/inline_style_processor.py
@@ -14,7 +14,7 @@ class InlineStyleProcessor:
    def __init__(self, tag_inline_style: Tag):
        # tag with inline style + style parsed from css file
        self.tag_inline_style = tag_inline_style
-        self.tag_inline_style.attrs['style']: str = self.process_inline_style()
+        self.tag_inline_style.attrs["style"]: str = self.process_inline_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
@@ -80,19 +80,19 @@ class InlineStyleProcessor:
        processed_style = ";".join(split_style)+';'

        margin_left_regexp = re.compile(
-            r"((margin-left|margin): *(-*\w+);*)")
+            r"((margin-left|margin): *-*((\d*)\.*\d+)\w+;*)")
        text_indent_regexp = re.compile(
-            r"(text-indent: *(-*\w+);*)")
+            r"(text-indent: *-*((\d*)\.*\d+)\w+;*)")

        has_margin = re.search(margin_left_regexp, processed_style)
        has_text_indent = re.search(text_indent_regexp, processed_style)
        if has_margin:
            num_m = abs(int("0" + "".join(
-                filter(str.isdigit, str(has_margin.group(3))))))
+                filter(str.isdigit, str(has_margin.group(4))))))

            if has_text_indent:
                num_ti = abs(int("0" + "".join(
-                    filter(str.isdigit, str(has_text_indent.group(2))))))
+                    filter(str.isdigit, str(has_text_indent.group(3))))))
                processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
                                                          str(abs(num_m - num_ti)) + "px; ")
                processed_style = processed_style.replace(
@@ -106,7 +106,7 @@ class InlineStyleProcessor:
        elif has_text_indent:
            processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
                                                      str(abs(int("0" + "".join(
-                                                          filter(str.isdigit, str(has_text_indent.group(2)))))))
+                                                          filter(str.isdigit, str(has_text_indent.group(3)))))))
                                                      + "px; ")
            return processed_style
        return processed_style
@@ -127,22 +127,25 @@ class InlineStyleProcessor:
            processed inline style

        """
-        inline_style = self.tag_inline_style.attrs.get("style") + ";"
-        # 1. Remove white color if tag doesn"t have background color in style
-        inline_style = self.remove_white_if_no_bgcolor(
-            inline_style, self.tag_inline_style)
-        inline_style = inline_style.replace(
-            "list-style-image", "list-style-type")
-        # 2. Create list of styles from inline style
-        # replace all spaces between "; & letter" to ";"
-        style = re.sub(r"; *", ";", inline_style)
-        # when we split style by ";", last element of the list is "" - None (remove it)
-        split_inline_style: list = list(filter(None, style.split(";")))
-        # 3. Duplicate styles check - if the tag had duplicate styles
-        # split_inline_style = self.duplicate_styles_check(split_inline_style)
-        # 4. Processing indents
-        inline_style: str = self.indents_processing(split_inline_style)
-        return inline_style
+        if self.tag_inline_style.attrs.get("style"):
+            inline_style = self.tag_inline_style.attrs.get("style") + ";"
+            # 1. Remove white color if tag doesn't have background color in style
+            inline_style = self.remove_white_if_no_bgcolor(
+                inline_style, self.tag_inline_style)
+            inline_style = inline_style.replace(
+                "list-style-image", "list-style-type")
+            # 2. Create list of styles from inline style
+            # replace all spaces between "; & letter" to ";"
+            style = re.sub(r"; *", ";", inline_style)
+            # when we split style by ";", last element of the list is "" - None (remove it)
+            split_inline_style: list = list(filter(None, style.split(";")))
+            # 3. Duplicate styles check - if the tag had duplicate styles
+            # split_inline_style = self.duplicate_styles_check(split_inline_style)
+            # 4. Processing indents
+            inline_style: str = self.indents_processing(split_inline_style)
+            return inline_style
+        else:
+            return ""

    @staticmethod
    def check_style_to_be_tag(style: str) -> List[tuple]:
--- a/src/livecarta_config.py
+++ b/src/livecarta_config.py
@@ -59,6 +59,7 @@ class LiveCartaConfig:
        "font-style": ["italic"],  # <i>
        "text-decoration": ["underline", "line-through"],  # <u> , <s>
        "text-decoration-line": ["underline", "line-through"],  # <u> , <s>
+        "text-transform": [],
        "vertical-align": ["super"],  # <sup>
        "color": [],
        "background-color": [],
@@ -76,4 +77,5 @@ class LiveCartaConfig:
        "margin-left": [],
        "margin-top": [],
        "margin": [],
+
    }
--- a/src/style_reader.py
+++ b/src/style_reader.py
@@ -1,6 +1,6 @@
 import re
 import cssutils
-from typing import Tuple
+from typing import List, Tuple, Union
 from os.path import dirname, normpath, join

 from src.util.color_reader import str2hex
@@ -16,28 +16,29 @@ class StyleReader:
        to suit LiveCarta style convention.
        """
        self.LIVECARTA_STYLE_ATTRS_MAPPING = {
-            "text-indent": self.convert_indents_tag_values,
+            "text-indent": lambda x: self.convert_tag_style_values(x, is_indent=True),
            "font-variant": lambda x: x,
            "text-align": lambda x: x,
            "font": lambda x: "",
            "font-family": lambda x: x,
            "font-size": self.convert_tag_style_values,
+            "text-transform": lambda x: x,
            "color": self.get_text_color,
            "background-color": self.get_bg_color,
            "background": self.get_bg_color,
-            "border": lambda x: x if x != "0" else "",
-            "border-top-width": lambda x: x if x != "0" else "",
-            "border-right-width": lambda x: x if x != "0" else "",
-            "border-left-width": lambda x: x if x != "0" else "",
-            "border-bottom-width": lambda x: x if x != "0" else "",
-            "border-top": lambda x: x if x != "0" else "",
-            "border-bottom": lambda x: x if x != "0" else "",
+            "border": self.convert_tag_style_values,
+            "border-top-width": self.convert_tag_style_values,
+            "border-right-width": self.convert_tag_style_values,
+            "border-left-width": self.convert_tag_style_values,
+            "border-bottom-width": self.convert_tag_style_values,
+            "border-top": self.convert_tag_style_values,
+            "border-bottom": self.convert_tag_style_values,
            "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
            "list-style-image": lambda x: "disc",
-            "margin-left": self.convert_indents_tag_values,
-            "margin-top": self.convert_tag_style_values,
-            "margin": self.convert_indents_tag_values,
-            "width": self.convert_tag_style_values,
+            "margin-left": lambda x: self.convert_tag_style_values(x, is_indent=True),
+            "margin-top": lambda x: self.convert_tag_style_values(x, is_indent=True),
+            "margin": lambda x: self.convert_tag_style_values(x, is_indent=True),
+            "width": lambda x: self.convert_tag_style_values(x) if "%" not in x else x
        }

    @staticmethod
@@ -68,43 +69,26 @@ class StyleReader:
        -------
        size_value: str
            converted value size
+
        """
-        size_regexp = re.compile(
-            r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)|(^-*(\d*\.*\d+)in$)")
-        has_style_attrs = re.search(size_regexp, size_value)
-        if has_style_attrs:
-            if has_style_attrs.group(1):
+        def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
+            size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
+            return str(size_number) + "px"
+        has_size = re.search(r"(\d+)([\w%]+)", size_value)
+        values: List = size_value.split(" ")
+        if has_size:
+            size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
+            if has_size.group(2) == "%":
                multiplier = 5.76 if is_indent else 0.16
-                size_value = float(size_value.replace("%", "")) * multiplier
-                return str(size_value) + "px"
-            elif has_style_attrs.group(3):
+                values[size_number_idx] = convert_size_number(values[size_number_idx], "%", multiplier)
+            elif has_size.group(2) == "em":
                multiplier = 18 if is_indent else 16
-                size_value = float(size_value.replace("em", "")) * multiplier
-                return str(size_value) + "px"
-            elif has_style_attrs.group(5):
-                size_value = float(size_value.replace("pt", "")) * 4/3
-                return str(size_value) + "px"
-            elif has_style_attrs.group(7):
-                size_value = float(size_value.replace("in", "")) * 96
-                return str(size_value) + "px"
-            else:
-                return ""
-        return size_value
-
-    def convert_indents_tag_values(self, size_value: str) -> str:
-        """
-        Function converts values of ["text-indent", "margin-left", "margin"]
-        Parameters
-        ----------
-        size_value: str
-
-        Returns
-        -------
-        size_value: str
-
-        """
-        size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
-            else self.convert_tag_style_values(size_value.split(" ")[-1], True)
+                values[size_number_idx] = convert_size_number(values[size_number_idx], "em", multiplier)
+            elif has_size.group(2) == "pt":
+                values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3)
+            elif has_size.group(2) == "in":
+                values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96)
+        size_value = " ".join(values)
        return size_value

    @staticmethod
@@ -125,17 +109,18 @@ class StyleReader:
        return constraints_on_value, value_not_in_possible_values_list

    def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
-        for i, style in enumerate(split_style):
+        for i, style in reversed(list(enumerate(split_style))):
            style_name, style_value = style.split(":")
            if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
-                # property not in LIVECARTA_STYLE_ATTRS, remove from css file
-                split_style[i] = ""
-                return split_style
+                # property not in LIVECARTA_STYLE_ATTRS, remove
+                split_style.remove(style)
+                continue

            cleaned_value = self.clean_value(style_value, style_name)
            if all(self.style_conditions(cleaned_value, style_name)):
-                # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
-                split_style[i] = ""
+                # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
+                split_style.remove(style)
+                continue
            else:
                if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
                    # function that converts our data
@@ -156,7 +141,7 @@ class StyleReader:

        split_style = self.update_inline_styles_to_livecarta_convention(
            split_style)
-        style = "; ".join(split_style)
+        style = "; ".join(split_style) if split_style else ""
        return style

    def process_inline_styles_in_html_soup(self, html_content):
--- a/src/util/color_reader.py
+++ b/src/util/color_reader.py
@@ -103,7 +103,7 @@ def str2hex(s: str) -> str:
            return rgb_percent_to_hex((r, g, b))

    if "rgb" in s.lower():
-        rgba = re.findall("([0-9] *\.?[0-9]+)", s)
+        rgba = re.findall("(\d+(?:\.\d+)?)", s)
        r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2])
        if len(rgba) == 4:
            alpha = float(rgba[3])