add processing of JSON presets

This commit is contained in:
Kiryl
2022-07-07 19:32:24 +03:00
parent 687c09417a
commit c4752a19db
5 changed files with 497 additions and 417 deletions

View File

@@ -1,5 +1,4 @@
import os
import logging
import pathlib
from shutil import copyfile

View File

@@ -4,33 +4,34 @@ import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
prepare_title, prepare_content
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter:
def __init__(self, file_path, access=None, logger=None):
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
self.file_path = file_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor
self.html_preprocessor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -74,25 +75,15 @@ class EpubConverter:
self.process_inline_styles_in_html_soup()
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles adding.")
self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
# todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup)
self.footnotes_contents.extend(content)
self.noterefs.extend(noterefs)
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.footnotes_contents, self.noterefs, self.footnotes =\
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style)
self.css_processor.build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
@@ -181,16 +173,53 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content(
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content(
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style.

    Uses premailer's ``transform`` to merge the given CSS rules into each
    tag's ``style`` attribute, then post-processes every styled tag with
    ``TagInlineStyleProcessor``.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css
    """
    # remove this specification because it causes problems
    # (premailer's CSS parser chokes on the epub @namespace declaration)
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    # here we add css styles to inline style; network access and external
    # styles are disabled so the transform stays purely local
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    # soup with converted styles from css
    inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
    # only tags whose names match the LiveCarta whitelist and that actually
    # carry a non-empty-or-present style attribute are normalized
    tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
                                                  attrs={"style": re.compile(".*")})
    # go through the tags with inline style + style parsed from css file
    for tag_inline_style in tags_with_inline_style:
        style_converter = TagInlineStyleProcessor(tag_inline_style)
        style_converter.convert_initial_tag()
    return inline_soup
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css)
html_content = self.convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
@staticmethod
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags

    Raises
    ------
    ValueError
        If no tag with ``first_id`` and class "converter-chapter-mark" is found.
    """
    marked_tags = html_soup.find(
        attrs={"id": first_id, "class": "converter-chapter-mark"})
    if not marked_tags:
        # was `assert 0, ...` — asserts vanish under -O and would leave
        # `tags` unbound; raise explicitly instead
        raise ValueError(f"Warning: no match for {first_id, href}")
    next_tag = marked_tags.next_sibling
    tags = []
    while next_tag:
        # bs4 stores the multi-valued `class` attribute as a list of tokens,
        # so test membership — comparing the list to the bare string was
        # always False and the loop never stopped at the next chapter mark
        if not isinstance(next_tag, NavigableString) and \
                "converter-chapter-mark" in (next_tag.attrs.get("class") or []):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(
subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags:
new_tree.append(tag)
for subchapter_tag in subchapter_tags:
new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
self.detect_one_chapter(point)
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
@@ -561,9 +632,9 @@ class EpubConverter:
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
title_preprocessed = self.html_preprocessor.prepare_title(title)
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
@@ -598,11 +669,17 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../epub/9781641050234.epub"
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object)
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:

View File

@@ -1,4 +1,7 @@
from src.book_solver import BookSolver
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
Function
Steps
----------
1. Converts .epub to .html
2. Parses from line structure to nested structure
1. Gets data from preset structure
2. Add preset to html preprocessor
3. Converts .epub to .html
4. Parses from line structure to nested structure
Returns
----------
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object)
self.file_path, access=self.access, logger=self.logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
return content_dict

View File

@@ -1,419 +1,398 @@
import re
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
chapter_tag: BeautifulSoup
class HtmlEpubPreprocessor:
def __init__(self, preset, logger=None):
    """
    HTML preprocessor driven by JSON preset rules.

    Parameters
    ----------
    preset: list
        parsed preset entries; each entry carries a "preset_name" used to
        look up a transformation in ``self.name2function``
        (assumes the structure produced by PresetProcessor — TODO confirm)
    logger: BookLogger
        project logger (optional)
    """
    self.preset = preset
    self.logger: BookLogger = logger
    # dispatch table: preset "preset_name" -> transformation method
    self.name2function = {
        "table_wrapper": self._wrap_tags_with_table,
        "replacer": self._tags_to_correspond_livecarta_tag,
        "unwrapper": self._unwrap_tags,
        "inserter": self._insert_tags_into_correspond_tags
    }
Returns
-------
None
updated body tag
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
chapter_tag: BeautifulSoup
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
Returns
-------
None
updated body tag
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
class_: list):
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
Parameters
----------
first_id: str
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
href: str
Name of current chapters file
html_soup: Tag
Soup object of current file
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
Returns
-------
tags: list [Tag, NavigableString]
Chapter's tags
@staticmethod
def prepare_title(title_of_chapter: str) -> str:
"""
Function finalise processing/cleaning title
Parameters
----------
title_of_chapter: str
"""
marked_tags = html_soup.find(
attrs={"id": first_id, "class": "converter-chapter-mark"})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
if not isinstance(next_tag, NavigableString) and \
(next_tag.attrs.get("class") == "converter-chapter-mark"):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
Returns
-------
title: str
cleaned title
# remove tags between first_id and next found id
# save them in list for next steps
tags = [tag.extract() for tag in tags]
html_soup.smooth()
"""
title = BeautifulSoup(title_of_chapter, features="lxml").string
# clean extra whitespace characters ([\r\n\t\f\v ])
title = re.sub(r"[\s\xa0]", " ", title).strip()
return title
else:
assert 0, f"Warning: no match for {first_id, href}"
@staticmethod
def _remove_comments(chapter_tag):
"""
Function remove comments
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
return tags
Returns
-------
None
Chapter Tag without comments
"""
for tag in chapter_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
def prepare_title(title_of_chapter: str) -> str:
"""
Function finalise processing/cleaning title
Parameters
----------
title_of_chapter: str
@staticmethod
def _wrap_strings_with_p(chapter_tag):
"""
Function converts headings that aren't supported by LiveCarta with <p>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
title: str
cleaned title
Returns
-------
None
Chapter Tag with wrapped NavigableStrings
"""
title = BeautifulSoup(title_of_chapter, features="lxml").string
# clean extra whitespace characters ([\r\n\t\f\v ])
title = re.sub(r"[\s\xa0]", " ", title).strip()
return title
"""
for node in chapter_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r"([\s\xa0])", " ", content).strip()
if content:
p_tag = chapter_tag.new_tag("p")
p_tag.append(str(node))
node.replace_with(p_tag)
def _wrap_tags_with_table(self, chapter_tag, rules: list):
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
def _remove_comments(chapter_tag):
"""
Function remove comments
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped certain tags with <table>
Returns
-------
None
Chapter Tag without comments
"""
"""
for tag in chapter_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
td.attrs["bgcolor"] = bg_color
tag_to_be_wrapped.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
def process_tag_using_table(tag_to_wrap):
_wrap_tag_with_table(
chapter_tag,
tag_to_be_wrapped=tag_to_wrap,
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap()
def _wrap_strings_with_p(chapter_tag):
"""
Function converts headings that aren't supported by LiveCarta with <p>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped NavigableStrings
"""
for node in chapter_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r"([\s\xa0])", " ", content).strip()
if content:
p_tag = chapter_tag.new_tag("p")
p_tag.append(str(node))
node.replace_with(p_tag)
def _wrap_tags_with_table(chapter_tag):
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped certain tags with <table>
"""
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
td.attrs["bgcolor"] = bg_color
tag_to_be_wrapped.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
def process_tag_using_table(tag_to_wrap):
_wrap_tag_with_table(
chapter_tag,
tag_to_be_wrapped=tag_to_wrap,
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
_add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap()
for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
if isinstance(attrs, tuple):
attr, val = attrs[0], attrs[1]
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
process_tag_using_table(tag_to_wrap)
else:
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
for rule in rules:
tags = rule["tags"]
for attr in rule["attrs"]:
for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
process_tag_using_table(tag_to_wrap)
@staticmethod
def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
"""
Function to replace all tags to correspond LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
def _tags_to_correspond_livecarta_tag(chapter_tag):
"""
Function to replace all tags to correspond LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with all tags replaced with LiveCarta tags
Returns
-------
None
Chapter Tag with all tags replaced with LiveCarta tags
"""
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
for key in reg_keys:
if isinstance(key, tuple):
replace = key[0]
parent, child = key[1], key[2]
for parent_tag in chapter_tag.select(parent):
if replace == "parent":
parent_tag.name = to_replace_value
elif replace == "child":
for child_tag in parent_tag.select(child):
child_tag.name = to_replace_value
if not child_tag.attrs.get("style"):
child_tag.attrs["style"] =\
"font-size: 14px; font-family: courier new,courier,monospace;"
"""
for rule in rules:
tags = rule["tags"]
tag_to_replace = rule["tag_to_replace"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if tag.parent.select(condition_on_tag[1]):
tag.name = tag_to_replace
elif condition_on_tag[0] == 'child_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
tag.name = tag_to_replace
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
tag.name = tag_to_replace
else:
tags = chapter_tag.find_all(re.compile(key))
for tag in tags:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
tag.name = to_replace_value
tag.name = tag_to_replace
def _unwrap_tags(self, chapter_tag, rules: dict):
"""
Function unwrap tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
def _unwrap_tags(chapter_tag):
"""
Function unwrap tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with unwrapped certain tags
Returns
-------
None
Chapter Tag with unwrapped certain tags
"""
for tag_name in rules["tags"]:
for tag in chapter_tag.select(tag_name):
# if tag is a subtag
if ">" in tag_name:
tag.parent.attrs.update(tag.attrs)
self._add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap()
"""
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
for tag in chapter_tag.select(tag_name):
# if tag is a subtag
if ">" in tag_name:
tag.parent.attrs.update(tag.attrs)
_add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap()
@staticmethod
def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
"""
Function inserts tags into correspond tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with inserted tags
def _remove_headings_content(content_tag, title_of_chapter: str):
"""
Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
content_tag: soup object
Tag of the page
title_of_chapter: str
Chapter title
"""
Returns
-------
None
clean/remove headings & add span with id
def insert(tag, tag_to_insert):
# insert all items that was in tag to subtag and remove from tag
for content in reversed(tag.contents):
tag_to_insert.insert(0, content.extract())
# wrap subtag with items
tag.append(tag_to_insert)
"""
title_of_chapter = title_of_chapter.lower()
for tag in content_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
_add_span_to_save_ids_for_links(tag, content_tag)
tag.extract()
return
elif not isinstance(tag, NavigableString):
if not _remove_headings_content(tag, title_of_chapter):
break
def _process_table(chapter_tag: BeautifulSoup):
"""
Function preprocesses tables and tags(td|th|tr)
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with processed tables
"""
tables = chapter_tag.find_all("table")
for table in tables:
for t_tag in table.find_all(re.compile("td|th|tr")):
width = ""
if t_tag.get("style"):
width_match = re.search(
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
if width_match:
size = width_match.group(1)
width = size + "px"
t_tag.attrs["width"] = t_tag.get("width") or width
if t_tag.attrs.get("style"):
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
"border:0;", "")
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
del t_tag.attrs["style"]
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
table.attrs["border"] = "1"
def _insert_tags_in_parents(chapter_tag):
"""
Function inserts tags into correspond tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with inserted tags
"""
parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
for parent_tag_name, condition in parent_tag2condition.items():
for parent_tag in chapter_tag.select(parent_tag_name):
if parent_tag.select(condition):
continue
for rule in rules:
tags = rule["tags"]
tag_to_insert = \
chapter_tag.new_tag(rule["tag_to_insert"])
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if tag.parent.select(condition_on_tag[1]):
insert(tag, tag_to_insert)
elif condition_on_tag[0] == 'child_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
insert(tag, tag_to_insert)
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
insert(tag, tag_to_insert)
else:
tag_to_insert = chapter_tag.new_tag(
LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
# insert all items that was in pre to code and remove from pre
for content in reversed(parent_tag.contents):
tag_to_insert.insert(0, content.extract())
# wrap code with items
parent_tag.append(tag_to_insert)
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
insert(tag, tag_to_insert)
def _remove_headings_content(self, content_tag, title_of_chapter: str):
"""
Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
content_tag: soup object
Tag of the page
title_of_chapter: str
Chapter title
def _class_removing(chapter_tag):
"""
Function removes classes that aren't created by converter
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
clean/remove headings & add span with id
Returns
-------
None
Chapter Tag without original classes of the book
"""
title_of_chapter = title_of_chapter.lower()
for tag in content_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
self._add_span_to_save_ids_for_links(tag, content_tag)
tag.extract()
return
elif not isinstance(tag, NavigableString):
if not self._remove_headings_content(tag, title_of_chapter):
break
"""
for tag in chapter_tag.find_all(recursive=True):
if tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup):
"""
Function preprocesses tables and tags(td|th|tr)
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with processed tables
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
"""
Function finalise processing/cleaning content
Parameters
----------
title_str: str
"""
tables = chapter_tag.find_all("table")
for table in tables:
for t_tag in table.find_all(re.compile("td|th|tr")):
width = ""
if t_tag.get("style"):
width_match = re.search(
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
if width_match:
size = width_match.group(1)
width = size + "px"
content_tag: Tag, soup object
t_tag.attrs["width"] = t_tag.get("width") or width
remove_title_from_chapter: bool
if t_tag.attrs.get("style"):
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
"border:0;", "")
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
del t_tag.attrs["style"]
Steps
----------
1. comments removal
2. wrap NavigableString with tag <p>
3. wrap tags with <table>
4. replace tags with correspond LiveCarta tags
5. unwrap tags
6. heading removal
7. process_table
8. insert tags into correspond tags
9. class removal
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
table.attrs["border"] = "1"
Returns
-------
content_tag: str
prepared content
@staticmethod
def _class_removing(chapter_tag):
"""
Function removes classes that aren't created by converter
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
"""
# 1. remove comments
_remove_comments(content_tag)
Returns
-------
None
Chapter Tag without original classes of the book
# 2.
_wrap_strings_with_p(content_tag)
# 3.
_wrap_tags_with_table(content_tag)
# 4.
_tags_to_correspond_livecarta_tag(content_tag)
# 5.
_unwrap_tags(content_tag)
# 6.
if remove_title_from_chapter:
_remove_headings_content(content_tag, title_str)
# 7.
_process_table(content_tag)
# 8.
_insert_tags_in_parents(content_tag)
"""
for tag in chapter_tag.find_all(recursive=True):
if tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
# 9. remove classes that weren't created by converter
_class_removing(content_tag)
return str(content_tag)
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content

    Parameters
    ----------
    title_str: str
        chapter title used for heading de-duplication
    content_tag: Tag, soup object
        chapter content to be cleaned in place
    remove_title_from_chapter: bool
        whether to strip the duplicated chapter heading from the content

    Steps
    ----------
    1. comments removal
    2. wrap NavigableString with tag <p>
    3-6. preset-driven transformations, in preset order:
         wrap tags with <table>; replace tags with correspond LiveCarta tags;
         unwrap tags; insert tags into correspond tags
    7. heading removal
    8. process_tables
    9. class removal

    Returns
    -------
    content_tag: str
        prepared content
    """
    # 1. remove comments
    self._remove_comments(content_tag)
    # 2.
    self._wrap_strings_with_p(content_tag)
    # 3-6. apply each preset rule set via the dispatch table
    # (loop variable renamed: `dict` shadowed the builtin)
    for preset_entry in self.preset:
        preprocess = self.name2function[preset_entry["preset_name"]]
        preprocess(content_tag, preset_entry["rules"])
    # 7.
    if remove_title_from_chapter:
        self._remove_headings_content(content_tag, title_str)
    # 8.
    self._process_tables(content_tag)
    # 9. remove classes that weren't created by converter
    self._class_removing(content_tag)
    return str(content_tag)

15
src/preset_processor.py Normal file
View File

@@ -0,0 +1,15 @@
import json
from src.util.helpers import BookLogger
class PresetProcessor:
    """Loads the JSON preset configuration that drives the HTML preprocessing rules."""

    def __init__(self, preset_path="config/presets.json", logger=None):
        """
        Parameters
        ----------
        preset_path: str
            path to the JSON presets file
        logger: BookLogger
            project logger (optional)
        """
        self.preset_path = preset_path
        self.logger: BookLogger = logger

    def get_preset_json(self):
        """
        Read and parse the presets file.

        Returns
        -------
        data
            the deserialized JSON content of ``self.preset_path``
        """
        # context manager closes the handle even if json.load raises;
        # the original `f = open(...)` never closed the file
        with open(self.preset_path, encoding="utf-8") as f:
            return json.load(f)