add processing of JSON presets

2022-07-07 19:32:24 +03:00
parent 687c09417a
commit c4752a19db
5 changed files with 497 additions and 417 deletions
--- a/src/docx_converter/image_processing.py
+++ b/src/docx_converter/image_processing.py
@@ -1,5 +1,4 @@
 import os
 import logging
 import pathlib
 from shutil import copyfile
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -4,33 +4,34 @@ import codecs
 import os
 from os.path import dirname, normpath, join
 from itertools import chain
 from premailer import transform
 from collections import defaultdict
 from typing import Dict, Union, List
 import ebooklib
 from ebooklib import epub
 from ebooklib.epub import Link, Section
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, NavigableString, Tag
 from src.util.helpers import BookLogger
 from src.preset_processor import PresetProcessor
 from src.epub_converter.css_preprocessor import CSSPreprocessor
 from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.epub_converter.image_processing import update_images_src_links
 from src.epub_converter.footnotes_processing import preprocess_footnotes
-from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
+from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
 from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
 from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
    prepare_title, prepare_content
 class EpubConverter:
-    def __init__(self, file_path, access=None, logger=None):
+    def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
        self.file_path = file_path
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file_path)
        self.css_processor = css_preprocessor
        self.html_preprocessor = html_processor
        # main container for all epub .xhtml files
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -74,25 +75,15 @@ class EpubConverter:
        self.process_inline_styles_in_html_soup()
        self.logger.log("CSS files processing.")
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
-        self.logger.log("CSS styles adding.")
+        self.logger.log("CSS  styles adding.")
        self.add_css_styles_to_html_soup()
        # todo presets
        self.logger.log("Footnotes processing.")
        for href in self.html_href2html_body_soup:
-            content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
+            self.footnotes_contents, self.noterefs, self.footnotes =\
-                                                                     self.html_href2html_body_soup)
+                preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
            self.footnotes_contents.extend(content)
            self.noterefs.extend(noterefs)
            self.footnotes.extend(footnotes_tags)
        for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
            noteref.attrs["data-id"] = i + 1
            noteref.attrs["id"] = f"footnote-{i + 1}"
            footnote.attrs["href"] = f"#footnote-{i + 1}"
        self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
        self.logger.log("TOC processing.")
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
        # build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
        not_added = [
            x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
        self.logger.log(f"Html documents not added to TOC: {not_added}.")
        self.logger.log(f"Add documents not added to TOC.")
        self.add_not_added_files_to_adjacency_list(not_added)
        self.logger.log(f"Html internal links and structure processing.")
        self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
            for tag_initial_inline_style in tags_with_inline_style:
                inline_style = tag_initial_inline_style.attrs["style"]
                tag_initial_inline_style.attrs["style"] = \
-                    build_inline_style_content(inline_style)
+                    self.css_processor.build_inline_style_content(inline_style)
    def build_html_and_css_relations(self) -> tuple[dict, dict]:
        """
@@ -181,16 +173,53 @@ class EpubConverter:
                html_href2css_href[html_href].append(css_href)
                if css_href not in css_href2css_content:
                    # css_href not in css_href2css_content, add to this dict
-                    css_href2css_content[css_href] = build_css_file_content(
+                    css_href2css_content[css_href] = self.css_processor.build_css_file_content(
                        self.get_css_content(css_href, html_href))
            for i, tag in enumerate(soup_html_content.find_all("style")):
                css_content = tag.string
                html_href2css_href[html_href].append(f"href{i}")
-                css_href2css_content[f"href{i}"] = build_css_file_content(
+                css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
                    css_content)
        return html_href2css_href, css_href2css_content
    def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
        """
        Inline the rules of a stylesheet into the ``style`` attributes of a page.

        Parameters
        ----------
        html_soup: BeautifulSoup
            html page, possibly already carrying inline styles
        css_text: str
            css rules collected for this page

        Returns
        -------
        inline_soup: BeautifulSoup
            a new soup, re-parsed with lxml from the inlined markup;
            ``html_soup`` itself is only serialized with ``str()``

        """
        # this declaration is stripped before inlining because it causes problems
        epub_namespace_rule = '@namespace epub "http://www.idpf.org/2007/ops";'
        cleaned_css = css_text.replace(epub_namespace_rule, '')

        # options forwarded unchanged to premailer's transform()
        premailer_options = {
            "remove_classes": False,
            "external_styles": False,
            "allow_network": False,
            "disable_validation": True,
        }
        inlined_markup: str = transform(str(html_soup), css_text=cleaned_css, **premailer_options)

        # parse the inlined markup back into a soup
        inline_soup = BeautifulSoup(inlined_markup, features="lxml")

        # every supported tag that ended up with a style attribute (any value)
        any_style_value = re.compile(".*")
        styled_tags = inline_soup.find_all(
            LiveCartaConfig.could_have_style_in_livecarta_regexp,
            attrs={"style": any_style_value})

        # post-process each tag whose style now combines inline + css-file rules
        for styled_tag in styled_tags:
            TagInlineStyleProcessor(styled_tag).convert_initial_tag()
        return inline_soup
    def add_css_styles_to_html_soup(self):
        """
        This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
                for css_href in self.html_href2css_href[html_href]:
                    css += self.css_href2css_content[css_href]
                html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
-                html_content = convert_html_soup_with_css_style(html_content, css)
+                html_content = self.convert_html_soup_with_css_style(html_content, css)
                self.html_href2html_body_soup[html_href] = html_content
    def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
                                    f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
                                    f" Old id={a_tag_id}")
    @staticmethod
    def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
        """
        Detach and return the nodes that belong to one chapter.

        The chapter starts right after the tag with ``id == first_id`` and
        class "converter-chapter-mark" and runs over its following siblings
        until the next chapter mark (or the end of the sibling list).
        The collected nodes are extracted from ``html_soup``.

        Parameters
        ----------
        first_id: str
            Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
        href: str
            Name of current chapters file (used only in the error message)
        html_soup: Tag
            Soup object of current file; it is modified in place

        Returns
        -------
        tags: list [Tag, NavigableString]
            Chapter's tags, already removed from ``html_soup``

        Raises
        ------
        AssertionError
            If ``html_soup`` has no chapter mark with id ``first_id``

        """
        mark_class = "converter-chapter-mark"
        chapter_mark = html_soup.find(
            attrs={"id": first_id, "class": mark_class})
        if not chapter_mark:
            # raised explicitly: an `assert` statement is stripped under -O,
            # which would fall through to an unbound `tags` below
            raise AssertionError(f"Warning: no match for {first_id, href}")

        tags = []
        next_tag = chapter_mark.next_sibling
        while next_tag:
            if not isinstance(next_tag, NavigableString):
                tag_class = next_tag.attrs.get("class")
                # "class" is a plain string when assigned in code and a list
                # when produced by the parser; find() above accepts both,
                # so the stop condition accepts both as well
                if tag_class == mark_class or \
                        (isinstance(tag_class, list) and mark_class in tag_class):
                    break
            tags.append(next_tag)
            next_tag = next_tag.next_sibling

        # remove tags between first_id and next found mark
        # save them in list for next steps
        tags = [tag.extract() for tag in tags]
        # merge adjacent strings left behind by the extraction
        html_soup.smooth()
        return tags
    def detect_one_chapter(self, nav_point: NavPoint):
        """
        Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
        """
        if nav_point.id:
            soup = self.html_href2html_body_soup[nav_point.href]
-            chapter_tags = get_tags_between_chapter_marks(
+            subchapter_tags = self.get_tags_between_chapter_marks(
                first_id=nav_point.id, href=nav_point.href, html_soup=soup)
            new_tree = BeautifulSoup("", "html.parser")
-            for tag in chapter_tags:
+            for subchapter_tag in subchapter_tags:
-                new_tree.append(tag)
+                new_tree.append(subchapter_tag)
            self.href_chapter_id2soup_html[(
                nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
        """Function build chapters content, starts from top level chapters"""
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
-            for point in top_level_nav_points:
+            for tl_nav_point in top_level_nav_points:
-                self.detect_one_chapter(point)
+                self.detect_one_chapter(tl_nav_point)
    def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
        """
@@ -561,9 +632,9 @@ class EpubConverter:
                                                                    if hasattr(self.file_path, "stem") else "book_id")
        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
-        title_preprocessed = prepare_title(title)
+        title_preprocessed = self.html_preprocessor.prepare_title(title)
-        content_preprocessed = prepare_content(title_preprocessed, content,
+        content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
-                                               remove_title_from_chapter=is_chapter)
+                                                                      remove_title_from_chapter=is_chapter)
        sub_nodes = []
        # warning! not EpubHtmlItems won't be added to chapter
        # if it doesn't have subchapters
@@ -598,11 +669,17 @@ class EpubConverter:
 if __name__ == "__main__":
-    epub_file_path = "../../epub/9781641050234.epub"
+    epub_file_path = "../../epub/Modern_Java_in_Action.epub"
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])
-    json_converter = EpubConverter(epub_file_path, logger=logger_object)
+    preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
        .get_preset_json()
    css_preprocessor = CSSPreprocessor(logger=logger_object)
    html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
    json_converter = EpubConverter(epub_file_path, logger=logger_object,
                                   css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
    content_dict = json_converter.convert_to_dict()
    with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -1,4 +1,7 @@
 from src.book_solver import BookSolver
 from src.preset_processor import PresetProcessor
 from src.epub_converter.css_preprocessor import CSSPreprocessor
 from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
 from src.epub_converter.epub_converter import EpubConverter
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
        Function
        Steps
        ----------
-        1. Converts .epub to .html
+        1. Gets data from preset structure
-        2. Parses from line structure to nested structure
+        2. Add preset to html preprocessor
        3. Converts .epub to .html
        4. Parses from line structure to nested structure
        Returns
        ----------
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
            json for LiveCarta platform
        """
        preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
            .get_preset_json()
        css_preprocessor = CSSPreprocessor(logger=self.logger_object)
        html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
        json_converter = EpubConverter(
-            self.file_path, access=self.access, logger=self.logger_object)
+            self.file_path, access=self.access, logger=self.logger_object,
            css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
        content_dict = json_converter.convert_to_dict()
        return content_dict
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -1,419 +1,398 @@
 import re
 from bs4 import BeautifulSoup, NavigableString, Comment, Tag
-from bs4 import BeautifulSoup, NavigableString, Tag, Comment
+from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
-def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
+class HtmlEpubPreprocessor:
-    """
+    def __init__(self, preset, logger=None):
-    Function adds span with id from tag_to_be_removed
+        self.preset = preset
-    because this tag will be removed(unwrapped/extract)
+        self.logger: BookLogger = logger
-    Parameters
+        self.name2function = {
-    ----------
+            "table_wrapper": self._wrap_tags_with_table,
-    tag_to_be_removed: Soup object
+            "replacer": self._tags_to_correspond_livecarta_tag,
-    chapter_tag: BeautifulSoup
+            "unwrapper": self._unwrap_tags,
            "inserter": self._insert_tags_into_correspond_tags
        }
-    Returns
+    @staticmethod
-    -------
+    def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
-    None
+        """
-        updated body tag
+        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Soup object
        chapter_tag: BeautifulSoup
-    """
+        Returns
-    def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
+        -------
-        """Function inserts span before tag aren't supported by LiveCarta"""
+        None
-        new_tag = chapter_tag.new_tag("span")
+            updated body tag
        new_tag.attrs["id"] = id_ or ""
        new_tag.attrs["class"] = class_ or ""
        new_tag.string = "\xa0"
        tag_to_be_removed.insert_before(new_tag)
-    if tag_to_be_removed.attrs.get("id"):
+        """
        _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
                                           id_=tag_to_be_removed.attrs["id"],
                                           class_=tag_to_be_removed.attrs.get("class"))
        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
                                               class_: list):
            """Function inserts span before tag aren't supported by LiveCarta"""
            new_tag = chapter_tag.new_tag("span")
            new_tag.attrs["id"] = id_ or ""
            new_tag.attrs["class"] = class_ or ""
            new_tag.string = "\xa0"
            tag_to_be_removed.insert_before(new_tag)
-def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
+        if tag_to_be_removed.attrs.get("id"):
-    """
+            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
-    After processing on a first_id that corresponds to current chapter,
+                                               id_=tag_to_be_removed.attrs["id"],
-    from initial html_soup all tags from current chapter are extracted
+                                               class_=tag_to_be_removed.attrs.get("class"))
    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file
    html_soup: Tag
        Soup object of current  file
-    Returns
+    @staticmethod
-    -------
+    def prepare_title(title_of_chapter: str) -> str:
-    tags: list [Tag, NavigableString]
+        """
-        Chapter's tags
+        Function finalise processing/cleaning title
        Parameters
        ----------
        title_of_chapter: str
-    """
+        Returns
-    marked_tags = html_soup.find(
+        -------
-        attrs={"id": first_id, "class": "converter-chapter-mark"})
+        title: str
-    if marked_tags:
+            cleaned title
        next_tag = marked_tags.next_sibling
        tags = []
        while next_tag:
            if not isinstance(next_tag, NavigableString) and \
                    (next_tag.attrs.get("class") == "converter-chapter-mark"):
                break
            tags.append(next_tag)
            next_tag = next_tag.next_sibling
-        # remove tags between first_id and next found id
+        """
-        # save them in list for next steps
+        title = BeautifulSoup(title_of_chapter, features="lxml").string
-        tags = [tag.extract() for tag in tags]
+        # clean extra whitespace characters ([\r\n\t\f\v ])
-        html_soup.smooth()
+        title = re.sub(r"[\s\xa0]", " ", title).strip()
        return title
-    else:
+    @staticmethod
-        assert 0, f"Warning: no match for {first_id, href}"
+    def _remove_comments(chapter_tag):
        """
        Function remove comments
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
-    return tags
+        Returns
        -------
        None
            Chapter Tag without comments
        """
        for tag in chapter_tag.find_all():
            for element in tag(text=lambda text: isinstance(text, Comment)):
                element.extract()
-def prepare_title(title_of_chapter: str) -> str:
+    @staticmethod
-    """
+    def _wrap_strings_with_p(chapter_tag):
-    Function finalise processing/cleaning title
+        """
-    Parameters
+        Function converts headings that aren't supported by LiveCarta with <p>
-    ----------
+        Parameters
-    title_of_chapter: str
+        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
-    Returns
+        Returns
-    -------
+        -------
-    title: str
+        None
-        cleaned title
+            Chapter Tag with wrapped NavigableStrings
-    """
+        """
-    title = BeautifulSoup(title_of_chapter, features="lxml").string
+        for node in chapter_tag:
-    # clean extra whitespace characters ([\r\n\t\f\v ])
+            if isinstance(node, NavigableString):
-    title = re.sub(r"[\s\xa0]", " ", title).strip()
+                content = str(node)
-    return title
+                content = re.sub(r"([\s\xa0])", " ", content).strip()
                if content:
                    p_tag = chapter_tag.new_tag("p")
                    p_tag.append(str(node))
                    node.replace_with(p_tag)
    def _wrap_tags_with_table(self, chapter_tag, rules: list):
        """
        Function wraps <tag> with <table>
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
-def _remove_comments(chapter_tag):
+        Returns
-    """
+        -------
-    Function remove comments
+        None
-    Parameters
+            Chapter Tag with wrapped certain tags with <table>
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
-    Returns
+        """
    -------
    None
        Chapter Tag without comments
-    """
+        def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
-    for tag in chapter_tag.find_all():
+            table = chapter_tag.new_tag("table")
-        for element in tag(text=lambda text: isinstance(text, Comment)):
+            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
-            element.extract()
+                = border, "center", f"width:{width}%;"
            tbody, tr, td = \
                chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
            td.attrs["bgcolor"] = bg_color
            tag_to_be_wrapped.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table
        def process_tag_using_table(tag_to_wrap):
            _wrap_tag_with_table(
                chapter_tag,
                tag_to_be_wrapped=tag_to_wrap,
                width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
                border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
                bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
            self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
            tag_to_wrap.unwrap()
-def _wrap_strings_with_p(chapter_tag):
+        for rule in rules:
-    """
+            tags = rule["tags"]
-    Function converts headings that aren't supported by LiveCarta with <p>
+            for attr in rule["attrs"]:
-    Parameters
+                for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
-    ----------
+                                                        {attr["name"]: re.compile(fr"{attr['value']}")}):
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    Returns
    -------
    None
        Chapter Tag with wrapped NavigableStrings
    """
    for node in chapter_tag:
        if isinstance(node, NavigableString):
            content = str(node)
            content = re.sub(r"([\s\xa0])", " ", content).strip()
            if content:
                p_tag = chapter_tag.new_tag("p")
                p_tag.append(str(node))
                node.replace_with(p_tag)
 def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps configured tags with a one-cell <table>
    and then unwraps the tag itself, so its contents stay inside the cell.
    Which tags are wrapped is read from LiveCartaConfig.WRAP_TAGS_WITH_TABLE.
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    Returns
    -------
    None
        Chapter Tag with wrapped certain tags with <table>
    """
    # builds <table><tbody><tr><td>tag</td></tr></tbody></table> around the tag,
    # puts a <br> after the table and returns the table
    def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
        table = chapter_tag.new_tag("table")
        table.attrs["border"], table.attrs["align"], table.attrs["style"] \
            = border, "center", f"width:{width}%;"
        tbody, tr, td = \
            chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
        td.attrs["bgcolor"] = bg_color
        # wrap from the inside out: tag -> td -> tr -> tbody -> table
        tag_to_be_wrapped.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table
    # wraps one tag, taking width/border/bgcolor from the tag's own attributes
    def process_tag_using_table(tag_to_wrap):
        # NOTE(review): a missing border is passed as None, not the helper's default ""
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=tag_to_wrap,
            width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
            border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
            bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
        # the tag is about to be unwrapped, so the span helper is called on it first
        _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
        tag_to_wrap.unwrap()
    for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if isinstance(attrs, tuple):
            # tuple form: (attribute name, regex the attribute value must match)
            attr, val = attrs[0], attrs[1]
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
                process_tag_using_table(tag_to_wrap)
        else:
            # otherwise attrs is used as a container of attribute names:
            # wrap the tag when it carries at least one of them
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
                if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
                    process_tag_using_table(tag_to_wrap)
    @staticmethod
    def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
        """
        Function to replace all tags to correspond LiveCarta tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
-def _tags_to_correspond_livecarta_tag(chapter_tag):
+        Returns
-    """
+        -------
-    Function to replace all tags to correspond LiveCarta tags
+        None
-    Parameters
+            Chapter Tag with all tags replaced with LiveCarta tags
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
-    Returns
+        """
-    -------
+        for rule in rules:
-    None
+            tags = rule["tags"]
-        Chapter Tag with all tags replaced with LiveCarta tags
+            tag_to_replace = rule["tag_to_replace"]
-
+            if rule["condition"]:
-    """
+                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
-    for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
+                    if condition_on_tag[0] == 'parent_tags':
-        for key in reg_keys:
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-            if isinstance(key, tuple):
+                            if tag.parent.select(condition_on_tag[1]):
-                replace = key[0]
+                                tag.name = tag_to_replace
-                parent, child = key[1], key[2]
+                    elif condition_on_tag[0] == 'child_tags':
-                for parent_tag in chapter_tag.select(parent):
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-                    if replace == "parent":
+                            if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
-                        parent_tag.name = to_replace_value
+                                tag.name = tag_to_replace
-                    elif replace == "child":
+                    elif condition_on_tag[0] == "attrs":
-                        for child_tag in parent_tag.select(child):
+                        for attr in rule["condition"]["attrs"]:
-                            child_tag.name = to_replace_value
+                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-                            if not child_tag.attrs.get("style"):
+                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
-                                child_tag.attrs["style"] =\
+                                tag.name = tag_to_replace
                                    "font-size: 14px; font-family: courier new,courier,monospace;"
            else:
-                tags = chapter_tag.find_all(re.compile(key))
+                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                for tag in tags:
                    # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
-                    tag.name = to_replace_value
+                    tag.name = tag_to_replace
    def _unwrap_tags(self, chapter_tag, rules: dict):
        """
        Function unwrap tags and moves id to span
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
-def _unwrap_tags(chapter_tag):
+        Returns
-    """
+        -------
-    Function unwrap tags and moves id to span
+        None
-    Parameters
+            Chapter Tag with unwrapped certain tags
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
-    Returns
+        """
-    -------
+        for tag_name in rules["tags"]:
-    None
+            for tag in chapter_tag.select(tag_name):
-        Chapter Tag with unwrapped certain tags
+                # if tag is a subtag
                if ">" in tag_name:
                    tag.parent.attrs.update(tag.attrs)
                self._add_span_to_save_ids_for_links(tag, chapter_tag)
                tag.unwrap()
-    """
+    @staticmethod
-    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
+    def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
-        for tag in chapter_tag.select(tag_name):
+        """
-            # if tag is a subtag
+        Function inserts tags into correspond tags
-            if ">" in tag_name:
+        Parameters
-                tag.parent.attrs.update(tag.attrs)
+        ----------
-            _add_span_to_save_ids_for_links(tag, chapter_tag)
+        chapter_tag: BeautifulSoup
-            tag.unwrap()
+            Tag & contents of the chapter tag
        Returns
        -------
        None
            Chapter Tag with inserted tags
-def _remove_headings_content(content_tag, title_of_chapter: str):
+        """
    """
    Function
    - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
    - adds span with id in order to
    Parameters
    ----------
    content_tag: soup object
        Tag of the page
    title_of_chapter: str
        Chapter title
-    Returns
+        def insert(tag, tag_to_insert):
-    -------
+            # insert all items that was in tag to subtag and remove from tag
-    None
+            for content in reversed(tag.contents):
-        clean/remove headings & add span with id
+                tag_to_insert.insert(0, content.extract())
            # wrap subtag with items
            tag.append(tag_to_insert)
-    """
+        for rule in rules:
-    title_of_chapter = title_of_chapter.lower()
+            tags = rule["tags"]
-    for tag in content_tag.contents:
+            tag_to_insert = \
-        text = tag if isinstance(tag, NavigableString) else tag.text
+                chapter_tag.new_tag(rule["tag_to_insert"])
-        if re.sub(r"[\s\xa0]", "", text):
+            if rule["condition"]:
-            text = re.sub(r"[\s\xa0]", " ", text).lower()
+                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
-            text = text.strip()   # delete extra spaces
+                    if condition_on_tag[0] == 'parent_tags':
-            if title_of_chapter == text or \
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-                    (title_of_chapter in text and
+                            if tag.parent.select(condition_on_tag[1]):
-                     re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
+                                insert(tag, tag_to_insert)
-                _add_span_to_save_ids_for_links(tag, content_tag)
+                    elif condition_on_tag[0] == 'child_tags':
-                tag.extract()
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-                return
+                            if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
-            elif not isinstance(tag, NavigableString):
+                                insert(tag, tag_to_insert)
-                if not _remove_headings_content(tag, title_of_chapter):
+                    elif condition_on_tag[0] == "attrs":
-                    break
+                        for attr in rule["condition"]["attrs"]:
-
+                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-
+                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
-def _process_table(chapter_tag: BeautifulSoup):
+                                insert(tag, tag_to_insert)
    """
    Function preprocesses tables and tags(td|th|tr)
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    Returns
    -------
    None
        Chapter Tag with processed tables
    """
    tables = chapter_tag.find_all("table")
    for table in tables:
        for t_tag in table.find_all(re.compile("td|th|tr")):
            width = ""
            if t_tag.get("style"):
                width_match = re.search(
                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
                if width_match:
                    size = width_match.group(1)
                    width = size + "px"
            t_tag.attrs["width"] = t_tag.get("width") or width
            if t_tag.attrs.get("style"):
                t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                    "border:0;", "")
                if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
                    del t_tag.attrs["style"]
        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
 def _insert_tags_in_parents(chapter_tag):
    """
    Function inserts tags into correspond tags
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    Returns
    -------
    None
        Chapter Tag with inserted tags
    """
    parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
    for parent_tag_name, condition in parent_tag2condition.items():
        for parent_tag in chapter_tag.select(parent_tag_name):
            if parent_tag.select(condition):
                continue
            else:
-                tag_to_insert = chapter_tag.new_tag(
+                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-                    LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
+                    insert(tag, tag_to_insert)
                # insert all items that was in pre to code and remove from pre
                for content in reversed(parent_tag.contents):
                    tag_to_insert.insert(0, content.extract())
                # wrap code with items
                parent_tag.append(tag_to_insert)
    def _remove_headings_content(self, content_tag, title_of_chapter: str):
        """
        Function
        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
        - adds span with id in order to
        Parameters
        ----------
        content_tag: soup object
            Tag of the page
        title_of_chapter: str
            Chapter title
-def _class_removing(chapter_tag):
+        Returns
-    """
+        -------
-    Function removes classes that aren't created by converter
+        None
-    Parameters
+            clean/remove headings & add span with id
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
-    Returns
+        """
-    -------
+        title_of_chapter = title_of_chapter.lower()
-    None
+        for tag in content_tag.contents:
-        Chapter Tag without original classes of the book
+            text = tag if isinstance(tag, NavigableString) else tag.text
            if re.sub(r"[\s\xa0]", "", text):
                text = re.sub(r"[\s\xa0]", " ", text).lower()
                text = text.strip()  # delete extra spaces
                if title_of_chapter == text or \
                        (title_of_chapter in text and
                         re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
                    self._add_span_to_save_ids_for_links(tag, content_tag)
                    tag.extract()
                    return
                elif not isinstance(tag, NavigableString):
                    if not self._remove_headings_content(tag, title_of_chapter):
                        break
-    """
+    @staticmethod
-    for tag in chapter_tag.find_all(recursive=True):
+    def _process_tables(chapter_tag: BeautifulSoup):
-        if tag.attrs.get("class") \
+        """
-                and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
+        Function preprocesses tables and tags(td|th|tr)
-            del tag.attrs["class"]
+        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        Returns
        -------
        None
            Chapter Tag with processed tables
-def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
+        """
-    """
+        tables = chapter_tag.find_all("table")
-    Function finalise processing/cleaning content
+        for table in tables:
-    Parameters
+            for t_tag in table.find_all(re.compile("td|th|tr")):
-    ----------
+                width = ""
-    title_str: str
+                if t_tag.get("style"):
                    width_match = re.search(
                        r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
                    if width_match:
                        size = width_match.group(1)
                        width = size + "px"
-    content_tag: Tag, soup object
+                t_tag.attrs["width"] = t_tag.get("width") or width
-    remove_title_from_chapter: bool
+                if t_tag.attrs.get("style"):
                    t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                        "border:0;", "")
                    if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
                        del t_tag.attrs["style"]
-    Steps
+            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
-    ----------
+                table.attrs["border"] = "1"
    1. comments removal
    2. wrap NavigableString with tag <p>
    3. wrap tags with <table>
    4. replace tags with correspond LiveCarta tags
    5. unwrap tags
    6. heading removal
    7. process_table
    8. insert tags into correspond tags
    9. class removal
-    Returns
+    @staticmethod
-    -------
+    def _class_removing(chapter_tag):
-    content_tag: str
+        """
-        prepared content
+        Function removes classes that aren't created by converter
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
-    """
+        Returns
-    # 1. remove comments
+        -------
-    _remove_comments(content_tag)
+        None
            Chapter Tag without original classes of the book
-    # 2.
+        """
-    _wrap_strings_with_p(content_tag)
+        for tag in chapter_tag.find_all(recursive=True):
-    # 3.
+            if tag.attrs.get("class") \
-    _wrap_tags_with_table(content_tag)
+                    and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
-    # 4.
+                del tag.attrs["class"]
    _tags_to_correspond_livecarta_tag(content_tag)
    # 5.
    _unwrap_tags(content_tag)
    # 6.
    if remove_title_from_chapter:
        _remove_headings_content(content_tag, title_str)
    # 7.
    _process_table(content_tag)
    # 8.
    _insert_tags_in_parents(content_tag)
-    # 9. remove classes that weren't created by converter
+    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
-    _class_removing(content_tag)
+        """
-    return str(content_tag)
+        Function finalise processing/cleaning content
        Parameters
        ----------
        title_str: str
        content_tag: Tag, soup object
        remove_title_from_chapter: bool
        Steps
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
        3-6. wrap tags with <table>
            replace tags with correspond LiveCarta tags
            unwrap tags
            insert tags into correspond tags
        7. heading removal
        8. process_tables
        9. class removal
        Returns
        -------
        content_tag: str
            prepared content
        """
        # 1. remove comments
        self._remove_comments(content_tag)
        # 2.
        self._wrap_strings_with_p(content_tag)
        # 3-6.
        for dict in self.preset:
            func = self.name2function[dict["preset_name"]]
            func(content_tag, dict['rules'])
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(content_tag, title_str)
        # 8.
        self._process_tables(content_tag)
        # 9. remove classes that weren't created by converter
        self._class_removing(content_tag)
        return str(content_tag)
--- a/src/preset_processor.py
+++ b/src/preset_processor.py
@@ -0,0 +1,15 @@
 import json
 from src.util.helpers import BookLogger
 class PresetProcessor:
    """
    Class loads the JSON presets that are stored in the file at preset_path
    Parameters
    ----------
    preset_path: str
        Path to the JSON file with presets
    logger: BookLogger
        Logger of the book conversion, it is only stored on the instance here
    """

    def __init__(self, preset_path="config/presets.json", logger=None):
        self.preset_path = preset_path
        self.logger: BookLogger = logger

    def get_preset_json(self):
        """
        Function reads the preset file and parses it as JSON
        Returns
        -------
        data
            Parsed content of the preset file (whatever json.load produces for it)
        Raises
        -------
        OSError
            if the preset file cannot be opened
        json.JSONDecodeError
            if the preset file is not valid JSON
        """
        # context manager closes the file even if parsing fails;
        # explicit encoding keeps the result independent of the machine locale
        with open(self.preset_path, encoding="utf-8") as preset_file:
            return json.load(preset_file)