Merge processing tags[Docx, Epub]

2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -5,6 +5,7 @@ from threading import Event

 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
+from src.html_preprocessor import HtmlPreprocessor
 from src.style_preprocessor import StylePreprocessor
 from src.docx_converter.docx2libre_html import Docx2LibreHTML
 from src.docx_converter.html_docx_processor import HTMLDocxProcessor
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):

        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
-            style_processor = StylePreprocessor()
-            parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
-                                       logger=self.logger_object, style_processor=style_processor)
-            bs_tags, footnotes, top_level_headers = parser.process_html(
+            html_preprocessor = HtmlPreprocessor(
+                logger=self.logger_object, preset_path="presets/docx_presets.json")
+            style_preprocessor = StylePreprocessor()
+            html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
+                                               logger=self.logger_object,
+                                               html_preprocessor=html_preprocessor,
+                                               style_preprocessor=style_preprocessor)
+            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception as exc:
            self.logger_object.log(
@@ -84,10 +89,12 @@ if __name__ == "__main__":
    html_converter = Docx2LibreHTML(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)

-    css_processor = StylePreprocessor()
-    parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
-                               style_processor=css_processor, preset_path="../../presets/docx_presets.json")
-    content, footnotes, top_level_headers = parser.process_html(
+    html_preprocessor = HtmlPreprocessor(
+        logger=logger_object, preset_path="../../presets/docx_presets.json")
+    style_preprocessor = StylePreprocessor()
+    html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
+                                       html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
+    content, footnotes, top_level_headers = html_processor.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)

    json_converter = LibreHTML2JSONConverter(
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -1,32 +1,23 @@
 import re
-import json
 import pathlib
 from typing import List, Tuple, Dict, Union
 from bs4 import BeautifulSoup, Tag, NavigableString

 from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
+from src.html_preprocessor import _preprocess_html
 from src.docx_converter.image_processing import process_images
 from src.docx_converter.footnotes_processing import process_footnotes
 from src.tag_inline_style_processor import modify_html_soup_with_css_styles


 class HTMLDocxProcessor:
-
-    def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
-                 style_processor, preset_path: str = "presets/docx_presets.json"):
-        self.html_soup = html_soup
-        self.body_tag = html_soup.body
+    def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
        self.logger = logger
-        self.preset = json.load(open(preset_path))
-        self.style_processor = style_processor
-        self.name2action = {
-            "wrapper": self._wrap_tag,
-            "decomposer": self._decompose_tag,
-            "replacer": self._replace_tag,
-            "attr_replacer": self._replace_attr,
-            "unwrapper": self._unwrap_tag
-        }
+        self.html_soup = html_soup
+        self.body_tag = self.html_soup.body
+        self.html_preprocessor = html_preprocessor
+        self.style_preprocessor = style_preprocessor

    def _process_toc_links(self):
        """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")

-    def _wrap_tag(self, **kwargs):
-        kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"]))
-
-    @staticmethod
-    def _decompose_tag(**kwargs):
-        kwargs["tag"].decompose()
-
-    @staticmethod
-    def _replace_tag(**kwargs):
-        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
-        kwargs["tag"].name = tag_to_replace
-
-    @staticmethod
-    def _replace_attr(**kwargs):
-        attr, attr_value =\
-            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
-        attr_to_replace, attr_value_to_replace =\
-            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
-        if attr_to_replace:
-            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
-            if attr_value_to_replace:
-                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
-            del kwargs["tag"][attr]
-        elif attr_value_to_replace:
-            kwargs["tag"].attrs[attr] = attr_value_to_replace
-
-    @staticmethod
-    def _unwrap_tag(**kwargs):
-        kwargs["tag"].unwrap()
-
-    @staticmethod
-    def _process_tags(body_tag: Tag,
-                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
-                      action):
-        """
-        Function do action with tags
-        Parameters
-        ----------
-        body_tag: Tag
-            Tag & contents of the chapter tag
-        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
-            list of conditions when fire function
-        action: function
-            action what to do with tag
-        Returns
-        -------
-        NoReturn
-            Body Tag with processed certain tags
-
-        """
-        for rule in rules:
-            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
-            if rule["condition"]:
-                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
-                    if condition_on_tag[0] == "parent_tags":
-                        for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
-                                                              for tag in tags])):
-                            tag.parent.attrs.update(tag.attrs)
-                            action(body_tag=body_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "child_tags":
-                        for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
-                                                              for tag in tags])):
-                            action(body_tag=body_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "attrs":
-                        for attr in rule["condition"]["attrs"]:
-                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
-                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
-                                action(body_tag=body_tag, tag=tag, rule=rule)
-                    # attr replacer
-                    elif condition_on_tag[0] == "tags":
-                        attr = rule["attr"]
-                        for tag in body_tag.find_all([re.compile(tag) for tag in tags],
-                                                     {attr['name']: re.compile(fr"{attr['value']}")}):
-                            action(body_tag=body_tag, tag=tag, rule=rule)
-            else:
-                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
-                    action(body_tag=body_tag, tag=tag, rule=rule)
-
    def _process_quotes(self):
        """
            Function to process block quotes.
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:

                    table.replaceWith(new_div)

-    @staticmethod
-    def convert_pt_to_px(value: float) -> float:
-        value = float(value)
-        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
-            return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
-        else:
-            return value
-
    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
                        size = match.group(1)
                        units = match.group(2)
                        if units == "pt":
-                            size = self.convert_pt_to_px(size)
+                            value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
+                                if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
+                                else float(size)
+                            size = value
                        sizes.append(float(size))
                width = td.get("width")
                td.attrs = {}
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()

-        for rule in self.preset:
-            self.logger.log(rule["preset_name"].title() + " process.")
-            action = self.name2action[rule["preset_name"]]
-            self._process_tags(self.body_tag, rule["rules"], action)
+        _preprocess_html(html_preprocessor=self.html_preprocessor,
+                         html_soup=self.html_soup)

        # CSS after html processing cause of <fonts> that aren't supported by html
        self.logger.log("CSS inline style preprocessing.")
-        self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
+        self.style_preprocessor.process_inline_styles_in_html_soup(
+            self.body_tag)

        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.body_tag)
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -13,7 +13,7 @@ from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.style_preprocessor import StylePreprocessor
-from src.epub_converter.html_epub_processor import HtmlEpubProcessor
+from src.epub_converter.html_epub_processor import HTMLEpubProcessor
 from src.epub_converter.image_processing import update_images_src_links
 from src.epub_converter.footnotes_processing import preprocess_footnotes
 from src.tag_inline_style_processor import modify_html_soup_with_css_styles
@@ -21,7 +21,7 @@ from src.tag_inline_style_processor import modify_html_soup_with_css_styles

 class EpubConverter:
    def __init__(self, book_path, access=None, logger: BookLogger = None,
-                 style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
+                 style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None):
        self.book_path = book_path
        self.access = access
        self.logger: BookLogger = logger
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -30,13 +30,16 @@ class EpubBook(BookSolver):
            json for LiveCarta platform

        """
-        style_processor = StylePreprocessor()
-        html_processor = HtmlEpubProcessor(
-            logger=self.logger_object)
+        html_preprocessor = HtmlPreprocessor(
+            logger=self.logger_object, preset_path="presets/epub_presets.json")
+        style_preprocessor = StylePreprocessor()
+        html_processor = HTMLEpubProcessor(logger=self.logger_object,
+                                           html_preprocessor=html_preprocessor)
        json_converter = EpubConverter(
            self.book_path, access=self.access, logger=self.logger_object,
-            style_processor=style_processor, html_processor=html_processor)
+            style_processor=style_preprocessor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
+
        return content_dict


--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,58 +1,16 @@
 import re
-import json
-from typing import List, Dict, Union
+from typing import Union
 from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment

 from src.util.helpers import BookLogger
+from src.html_preprocessor import _preprocess_html


-class HtmlEpubProcessor:
-    def __init__(self, preset_path: str = "presets/epub_presets.json", logger: BookLogger = None):
-        self.preset = json.load(open(preset_path))
+class HTMLEpubProcessor:
+    def __init__(self, logger: BookLogger = None, html_preprocessor=None):
        self.logger = logger
-        self.name2action = {
-            "table_wrapper": self._process_tag_using_table,
-            "replacer": self._replace_tag,
-            "attr_replacer": self._replace_attr,
-            "unwrapper": self._unwrap_tag,
-            "inserter": self._insert_tag
-        }
-
-    @staticmethod
-    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
-                                        chapter_tag: BeautifulSoup):
-        """
-        Function adds span with id from tag_to_be_removed
-        because this tag will be removed(unwrapped/extract)
-        Parameters
-        ----------
-        tag_to_be_removed: Union[PageElement, BeautifulSoup]
-
-        chapter_tag: BeautifulSoup
-
-        Returns
-        -------
-        NoReturn
-            updated body tag
-
-        """
-        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
-                                               tag_to_be_removed: Tag,
-                                               id_: str,
-                                               class_: Union[List[str], str]):
-            """Function inserts span before tag aren't supported by LiveCarta"""
-            new_tag: Tag = chapter_tag.new_tag("span")
-            new_tag.attrs["id"] = id_ or ""
-            new_tag.attrs["class"] = class_ or ""
-            new_tag.string = "\xa0"
-            tag_to_be_removed.insert_before(new_tag)
-
-        if tag_to_be_removed.attrs.get("id"):
-            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
-                                               tag_to_be_removed=tag_to_be_removed,
-                                               id_=tag_to_be_removed.attrs["id"],
-                                               class_=tag_to_be_removed.attrs.get("class"))
+        self.html_preprocessor = html_preprocessor

    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
@@ -116,111 +74,6 @@ class HtmlEpubProcessor:
                    p_tag.append(str(node))
                    node.replace_with(p_tag)

-    def _process_tag_using_table(self, **kwargs):
-        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
-            table = kwargs["chapter_tag"].new_tag("table")
-            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
-                = border, "center", f"width:{width}%;"
-            tbody, tr, td = \
-                kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
-                    "tr"), kwargs["chapter_tag"].new_tag("td")
-            td.attrs["bgcolor"] = bg_color
-            kwargs["tag"].wrap(td)
-            td.wrap(tr)
-            tr.wrap(tbody)
-            tbody.wrap(table)
-            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
-            return table
-        _wrap_tag_with_table(
-            width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
-                "width") else "100",
-            border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
-                "border") else None,
-            bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
-        self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
-        kwargs["tag"].unwrap()
-
-    @staticmethod
-    def _replace_tag(**kwargs):
-        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
-        kwargs["tag"].name = tag_to_replace
-
-    @staticmethod
-    def _replace_attr(**kwargs):
-        attr, attr_value =\
-            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
-        attr_to_replace, attr_value_to_replace =\
-            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
-        if attr_to_replace:
-            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
-            if attr_value_to_replace:
-                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
-            del kwargs["tag"][attr]
-        elif attr_value_to_replace:
-            kwargs["tag"].attrs[attr] = attr_value_to_replace
-
-    @staticmethod
-    def _unwrap_tag(**kwargs):
-        kwargs["tag"].unwrap()
-
-    @staticmethod
-    def _insert_tag(**kwargs):
-        tag_to_insert = \
-            kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
-        # insert all items that was in tag to subtag and remove from tag
-        for content in reversed(kwargs["tag"].contents):
-            tag_to_insert.insert(0, content.extract())
-        # wrap subtag with items
-        kwargs["tag"].append(tag_to_insert)
-
-    @staticmethod
-    def _process_tags(chapter_tag: BeautifulSoup,
-                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
-                      action):
-        """
-        Function do action with tags
-        Parameters
-        ----------
-        chapter_tag: BeautifulSoup
-            Tag & contents of the chapter tag
-        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
-            list of conditions when fire function
-        action: function
-            action what to do with tag
-        Returns
-        -------
-        NoReturn
-            Body Tag with processed certain tags
-
-        """
-        for rule in rules:
-            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
-            if rule["condition"]:
-                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
-                    if condition_on_tag[0] == "parent_tags":
-                        for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
-                                                                 for tag in tags])):
-                            tag.parent.attrs.update(tag.attrs)
-                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "child_tags":
-                        for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
-                                                                 for tag in tags])):
-                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-                    elif condition_on_tag[0] == "attrs":
-                        for attr in rule["condition"]["attrs"]:
-                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
-                                action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-                    # attr replacer
-                    elif condition_on_tag[0] == "tags":
-                        attr = rule["attr"]
-                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
-                                                        {attr['name']: re.compile(fr"{attr['value']}")}):
-                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-            else:
-                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
-                    action(chapter_tag=chapter_tag, tag=tag, rule=rule)
-
    def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
        """
        Function
@@ -250,7 +103,8 @@ class HtmlEpubProcessor:
                    if title_of_chapter == text or \
                            (title_of_chapter in text and
                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
-                        self._add_span_to_save_ids_for_links(tag, chapter_tag)
+                        self.html_preprocessor._add_span_to_save_ids_for_links(
+                            tag, chapter_tag)
                        tag.extract()
                        return
                    elif not self._remove_headings_content(tag, title_of_chapter):
@@ -350,9 +204,8 @@ class HtmlEpubProcessor:
        # 2.
        self._wrap_strings_with_p(chapter_tag)
        # 3-6.
-        for rule in self.preset:
-            action = self.name2action[rule["preset_name"]]
-            self._process_tags(chapter_tag, rule["rules"], action)
+        _preprocess_html(
+            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(chapter_tag, title_str)
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -0,0 +1,179 @@
+import re
+import json
+from bs4 import BeautifulSoup, Tag
+from bs4.element import PageElement
+from typing import List, Dict, Union
+
+from src.util.helpers import BookLogger
+
+
+class HtmlPreprocessor:
+    def __init__(self, logger: BookLogger, preset_path):
+        """Load the JSON preset at preset_path and map each preset name to its handler."""
+        # the context manager closes the preset file; a bare open() left the handle dangling
+        with open(preset_path, encoding="utf-8") as preset_file:
+            self.preset = json.load(preset_file)
+        self.logger = logger
+        self.name2action = {
+            "wrapper": self._wrap_tag, "table_wrapper": self._process_tag_using_table,
+            "decomposer": self._decompose_tag, "replacer": self._replace_tag,
+            "attr_replacer": self._replace_attr, "unwrapper": self._unwrap_tag,
+            "inserter": self._insert_tag,
+        }
+
+    @staticmethod
+    def _wrap_tag(**kwargs):  # wraps kwargs["tag"] in a new element named by rule["tag_to_wrap"]
+        kwargs["tag"].wrap(kwargs["body_tag"].new_tag(
+            kwargs["rule"]["tag_to_wrap"]))
+
+    @staticmethod
+    def _decompose_tag(**kwargs):
+        kwargs["tag"].decompose()  # removes the tag and everything inside it from the tree
+
+    @staticmethod
+    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
+                                        chapter_tag: BeautifulSoup):
+        """
+        Insert a placeholder span before a tag that is about to be removed
+        (unwrapped/extracted) so its id stays in the tree (per the name: for links)
+        Parameters
+        ----------
+        tag_to_be_removed: Union[PageElement, BeautifulSoup]
+            tag whose "id" and "class" are copied to the span; nothing happens if it has no id
+        chapter_tag: BeautifulSoup
+            soup used only to create the new span via new_tag
+        Returns
+        -------
+        None
+            the span is inserted into the tree in place
+
+        """
+        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
+                                               tag_to_be_removed: Tag,
+                                               id_: str,
+                                               class_: Union[List[str], str]):
+            """Insert a span with the given id and class right before tag_to_be_removed"""
+            new_tag: Tag = chapter_tag.new_tag("span")
+            new_tag.attrs["id"] = id_ or ""
+            new_tag.attrs["class"] = class_ or ""
+            new_tag.string = "\xa0"  # non-breaking space as the span's only content
+            tag_to_be_removed.insert_before(new_tag)
+
+        if tag_to_be_removed.attrs.get("id"):
+            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
+                                               tag_to_be_removed=tag_to_be_removed,
+                                               id_=tag_to_be_removed.attrs["id"],
+                                               class_=tag_to_be_removed.attrs.get("class"))
+
+    def _process_tag_using_table(self, **kwargs):  # replaces kwargs["tag"] with a one-cell table holding its children
+        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
+            table = kwargs["body_tag"].new_tag("table")
+            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
+                = border, "center", f"width:{width}%;"
+            tbody, tr, td = \
+                kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag(
+                    "tr"), kwargs["body_tag"].new_tag("td")
+            td.attrs["bgcolor"] = bg_color
+            kwargs["tag"].wrap(td)  # build table > tbody > tr > td around the tag, innermost first
+            td.wrap(tr)
+            tr.wrap(tbody)
+            tbody.wrap(table)
+            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))  # a <br> follows the table
+            return table
+        _wrap_tag_with_table(
+            width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
+                "width") else "100",
+            border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
+                "border") else None,  # NOTE(review): None is passed although border is annotated str with default ""
+            bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
+        self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["body_tag"])  # keep the id before the tag goes away
+        kwargs["tag"].unwrap()  # the tag itself is dropped; its children stay inside the td
+
+    @staticmethod
+    def _replace_tag(**kwargs):
+        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
+        kwargs["tag"].name = tag_to_replace  # rename in place; only the tag name is changed
+
+    @staticmethod
+    def _replace_attr(**kwargs):  # renames and/or re-values one attribute per rule["attr"] / rule["attr_to_replace"]
+        attr, attr_value =\
+            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]  # attr_value is read but not used below
+        attr_to_replace, attr_value_to_replace =\
+            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
+        if attr_to_replace:  # new name given: copy the value over, optionally override it, drop the old attribute
+            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
+            if attr_value_to_replace:
+                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
+            del kwargs["tag"][attr]  # NOTE(review): if both names are equal this deletes the attribute just written
+        elif attr_value_to_replace:  # same attribute name, new value only
+            kwargs["tag"].attrs[attr] = attr_value_to_replace
+
+    @staticmethod
+    def _unwrap_tag(**kwargs):
+        kwargs["tag"].unwrap()  # drops the tag itself but keeps its children in place
+
+    @staticmethod
+    def _insert_tag(**kwargs):
+        """Move the tag's children into a new rule["tag_to_insert"] child; expects body_tag, tag, rule."""
+        # _process_tags passes the soup as body_tag=; reading "chapter_tag" here raised KeyError
+        tag_to_insert = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
+        # move every existing child into the new subtag, preserving their order
+        for content in reversed(kwargs["tag"].contents):
+            tag_to_insert.insert(0, content.extract())
+        kwargs["tag"].append(tag_to_insert)
+
+    @staticmethod
+    def _process_tags(body_tag: BeautifulSoup,
+                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
+                      action):
+        """
+        Apply action to every tag in body_tag that matches one of the rules
+        Parameters
+        ----------
+        body_tag: BeautifulSoup
+            soup that is searched; the actions modify it in place
+        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
+            rules of one preset; each needs a "condition" key and tag patterns in "tags" or condition["tags"]
+        action: function
+            handler called as action(body_tag=..., tag=..., rule=...) for each matched tag
+        Returns
+        -------
+        None
+            nothing is returned
+
+        """
+        for rule in rules:
+            tags: List[str] = rule["tags"] if rule.get(
+                "tags") else rule["condition"]["tags"]
+            if rule["condition"]:
+                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):  # truthy entries only
+                    if condition_on_tag[0] == "parent_tags":
+                        # "^" and "$" are stripped because the tag patterns are reused inside a CSS selector
+                        for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
+                                                              for tag in tags])):
+                            tag.parent.attrs.update(tag.attrs)  # parent takes over the tag's attributes first
+                            action(body_tag=body_tag, tag=tag, rule=rule)
+                    elif condition_on_tag[0] == "child_tags":
+                        # the condition value is appended as-is; presumably it carries its own selector syntax — verify
+                        for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
+                                                              for tag in tags])):
+                            action(body_tag=body_tag, tag=tag, rule=rule)
+                    elif condition_on_tag[0] == "attrs":
+                        for attr in rule["condition"]["attrs"]:
+                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
+                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
+                                action(body_tag=body_tag, tag=tag, rule=rule)
+                    # attr replacer: the attribute filter comes from rule["attr"], not from the condition
+                    elif condition_on_tag[0] == "tags":
+                        attr = rule["attr"]
+                        for tag in body_tag.find_all([re.compile(tag) for tag in tags],
+                                                     {attr['name']: re.compile(fr"{attr['value']}")}):
+                            action(body_tag=body_tag, tag=tag, rule=rule)
+            else:  # empty condition: match on the tag name patterns alone
+                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
+                    action(body_tag=body_tag, tag=tag, rule=rule)
+
+
+def _preprocess_html(html_preprocessor: HtmlPreprocessor, html_soup: BeautifulSoup):
+    """Run every preset entry of html_preprocessor against html_soup, in preset order."""
+    for rule in html_preprocessor.preset:
+        action = html_preprocessor.name2action[rule["preset_name"]]  # KeyError on an unknown preset_name
+        html_preprocessor._process_tags(html_soup, rule["rules"], action)