Merge processing tags[Docx, Epub]

2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -5,6 +5,7 @@ from threading import Event
 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
 from src.html_preprocessor import HtmlPreprocessor
 from src.style_preprocessor import StylePreprocessor
 from src.docx_converter.docx2libre_html import Docx2LibreHTML
 from src.docx_converter.html_docx_processor import HTMLDocxProcessor
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):
        # 2. Parses and cleans html, gets list of tags, gets footnotes
        try:
-            style_processor = StylePreprocessor()
+            html_preprocessor = HtmlPreprocessor(
-            parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
+                logger=self.logger_object, preset_path="presets/docx_presets.json")
-                                       logger=self.logger_object, style_processor=style_processor)
+            style_preprocessor = StylePreprocessor()
-            bs_tags, footnotes, top_level_headers = parser.process_html(
+            html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
                                               logger=self.logger_object,
                                               html_preprocessor=html_preprocessor,
                                               style_preprocessor=style_preprocessor)
            bs_tags, footnotes, top_level_headers = html_processor.process_html(
                self.access, html_converter.html_path, self.book_id)
        except Exception as exc:
            self.logger_object.log(
@@ -84,10 +89,12 @@ if __name__ == "__main__":
    html_converter = Docx2LibreHTML(file_path=docx_file_path,
                                    logger=logger_object, libre_locker=locker)
-    css_processor = StylePreprocessor()
+    html_preprocessor = HtmlPreprocessor(
-    parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
+        logger=logger_object, preset_path="../../presets/docx_presets.json")
-                               style_processor=css_processor, preset_path="../../presets/docx_presets.json")
+    style_preprocessor = StylePreprocessor()
-    content, footnotes, top_level_headers = parser.process_html(
+    html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
                                       html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
    content, footnotes, top_level_headers = html_processor.process_html(
        html_path=html_converter.html_path, book_id=html_converter.book_id)
    json_converter = LibreHTML2JSONConverter(
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -1,32 +1,23 @@
 import re
 import json
 import pathlib
 from typing import List, Tuple, Dict, Union
 from bs4 import BeautifulSoup, Tag, NavigableString
 from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.html_preprocessor import _preprocess_html
 from src.docx_converter.image_processing import process_images
 from src.docx_converter.footnotes_processing import process_footnotes
 from src.tag_inline_style_processor import modify_html_soup_with_css_styles
 class HTMLDocxProcessor:
-
+    def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
    def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
                 style_processor, preset_path: str = "presets/docx_presets.json"):
        self.html_soup = html_soup
        self.body_tag = html_soup.body
        self.logger = logger
-        self.preset = json.load(open(preset_path))
+        self.html_soup = html_soup
-        self.style_processor = style_processor
+        self.body_tag = self.html_soup.body
-        self.name2action = {
+        self.html_preprocessor = html_preprocessor
-            "wrapper": self._wrap_tag,
+        self.style_preprocessor = style_preprocessor
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag
        }
    def _process_toc_links(self):
        """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")
    def _wrap_tag(self, **kwargs):
        kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"]))
    @staticmethod
    def _decompose_tag(**kwargs):
        kwargs["tag"].decompose()
    @staticmethod
    def _replace_tag(**kwargs):
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace
    @staticmethod
    def _replace_attr(**kwargs):
        attr, attr_value =\
            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
        attr_to_replace, attr_value_to_replace =\
            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
        if attr_to_replace:
            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
            if attr_value_to_replace:
                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
            del kwargs["tag"][attr]
        elif attr_value_to_replace:
            kwargs["tag"].attrs[attr] = attr_value_to_replace
    @staticmethod
    def _unwrap_tag(**kwargs):
        kwargs["tag"].unwrap()
    @staticmethod
    def _process_tags(body_tag: Tag,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function do action with tags
        Parameters
        ----------
        body_tag: Tag
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag
        Returns
        -------
        NoReturn
            Body Tag with processed certain tags
        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == "parent_tags":
                        for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
                                                              for tag in tags])):
                            tag.parent.attrs.update(tag.attrs)
                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "child_tags":
                        for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
                                                              for tag in tags])):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in body_tag.find_all([re.compile(tag) for tag in tags],
                                                         {attr["name"]: re.compile(fr"{attr['value']}")}):
                                action(body_tag=body_tag, tag=tag, rule=rule)
                    # attr replacer
                    elif condition_on_tag[0] == "tags":
                        attr = rule["attr"]
                        for tag in body_tag.find_all([re.compile(tag) for tag in tags],
                                                     {attr['name']: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
            else:
                for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
                    action(body_tag=body_tag, tag=tag, rule=rule)
    def _process_quotes(self):
        """
            Function to process block quotes.
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:
                    table.replaceWith(new_div)
    @staticmethod
    def convert_pt_to_px(value: float) -> float:
        value = float(value)
        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
            return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
        else:
            return value
    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
                        size = match.group(1)
                        units = match.group(2)
                        if units == "pt":
-                            size = self.convert_pt_to_px(size)
+                            value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
                                if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
                                else float(size)
                            size = value
                        sizes.append(float(size))
                width = td.get("width")
                td.attrs = {}
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()
-        for rule in self.preset:
+        _preprocess_html(html_preprocessor=self.html_preprocessor,
-            self.logger.log(rule["preset_name"].title() + " process.")
+                         html_soup=self.html_soup)
            action = self.name2action[rule["preset_name"]]
            self._process_tags(self.body_tag, rule["rules"], action)
        # CSS after html processing cause of <fonts> that aren't supported by html
        self.logger.log("CSS inline style preprocessing.")
-        self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
+        self.style_preprocessor.process_inline_styles_in_html_soup(
            self.body_tag)
        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.body_tag)
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -13,7 +13,7 @@ from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.style_preprocessor import StylePreprocessor
-from src.epub_converter.html_epub_processor import HtmlEpubProcessor
+from src.epub_converter.html_epub_processor import HTMLEpubProcessor
 from src.epub_converter.image_processing import update_images_src_links
 from src.epub_converter.footnotes_processing import preprocess_footnotes
 from src.tag_inline_style_processor import modify_html_soup_with_css_styles
@@ -21,7 +21,7 @@ from src.tag_inline_style_processor import modify_html_soup_with_css_styles
 class EpubConverter:
    def __init__(self, book_path, access=None, logger: BookLogger = None,
-                 style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
+                 style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None):
        self.book_path = book_path
        self.access = access
        self.logger: BookLogger = logger
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -30,13 +30,16 @@ class EpubBook(BookSolver):
            json for LiveCarta platform
        """
-        style_processor = StylePreprocessor()
+        html_preprocessor = HtmlPreprocessor(
-        html_processor = HtmlEpubProcessor(
+            logger=self.logger_object, preset_path="presets/epub_presets.json")
-            logger=self.logger_object)
+        style_preprocessor = StylePreprocessor()
        html_processor = HTMLEpubProcessor(logger=self.logger_object,
                                           html_preprocessor=html_preprocessor)
        json_converter = EpubConverter(
            self.book_path, access=self.access, logger=self.logger_object,
-            style_processor=style_processor, html_processor=html_processor)
+            style_processor=style_preprocessor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
        return content_dict
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -1,58 +1,16 @@
 import re
-import json
+from typing import Union
 from typing import List, Dict, Union
 from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment
 from src.util.helpers import BookLogger
 from src.html_preprocessor import _preprocess_html
-class HtmlEpubProcessor:
+class HTMLEpubProcessor:
-    def __init__(self, preset_path: str = "presets/epub_presets.json", logger: BookLogger = None):
+    def __init__(self, logger: BookLogger = None, html_preprocessor=None):
        self.preset = json.load(open(preset_path))
        self.logger = logger
-        self.name2action = {
+        self.html_preprocessor = html_preprocessor
            "table_wrapper": self._process_tag_using_table,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag
        }
    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
        chapter_tag: BeautifulSoup
        Returns
        -------
        NoReturn
            updated body tag
        """
        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
                                               tag_to_be_removed: Tag,
                                               id_: str,
                                               class_: Union[List[str], str]):
            """Function inserts span before tag aren't supported by LiveCarta"""
            new_tag: Tag = chapter_tag.new_tag("span")
            new_tag.attrs["id"] = id_ or ""
            new_tag.attrs["class"] = class_ or ""
            new_tag.string = "\xa0"
            tag_to_be_removed.insert_before(new_tag)
        if tag_to_be_removed.attrs.get("id"):
            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
                                               tag_to_be_removed=tag_to_be_removed,
                                               id_=tag_to_be_removed.attrs["id"],
                                               class_=tag_to_be_removed.attrs.get("class"))
    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
@@ -116,111 +74,6 @@ class HtmlEpubProcessor:
                    p_tag.append(str(node))
                    node.replace_with(p_tag)
    def _process_tag_using_table(self, **kwargs):
        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
            table = kwargs["chapter_tag"].new_tag("table")
            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
                = border, "center", f"width:{width}%;"
            tbody, tr, td = \
                kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
                    "tr"), kwargs["chapter_tag"].new_tag("td")
            td.attrs["bgcolor"] = bg_color
            kwargs["tag"].wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table
        _wrap_tag_with_table(
            width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
                "width") else "100",
            border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
                "border") else None,
            bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
        self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
        kwargs["tag"].unwrap()
    @staticmethod
    def _replace_tag(**kwargs):
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace
    @staticmethod
    def _replace_attr(**kwargs):
        attr, attr_value =\
            kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
        attr_to_replace, attr_value_to_replace =\
            kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
        if attr_to_replace:
            kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
            if attr_value_to_replace:
                kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
            del kwargs["tag"][attr]
        elif attr_value_to_replace:
            kwargs["tag"].attrs[attr] = attr_value_to_replace
    @staticmethod
    def _unwrap_tag(**kwargs):
        kwargs["tag"].unwrap()
    @staticmethod
    def _insert_tag(**kwargs):
        tag_to_insert = \
            kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
        # insert all items that was in tag to subtag and remove from tag
        for content in reversed(kwargs["tag"].contents):
            tag_to_insert.insert(0, content.extract())
        # wrap subtag with items
        kwargs["tag"].append(tag_to_insert)
    @staticmethod
    def _process_tags(chapter_tag: BeautifulSoup,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function do action with tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag
        Returns
        -------
        NoReturn
            Body Tag with processed certain tags
        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == "parent_tags":
                        for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
                                                                 for tag in tags])):
                            tag.parent.attrs.update(tag.attrs)
                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "child_tags":
                        for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
                                                                 for tag in tags])):
                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
                                action(chapter_tag=chapter_tag, tag=tag, rule=rule)
                    # attr replacer
                    elif condition_on_tag[0] == "tags":
                        attr = rule["attr"]
                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                        {attr['name']: re.compile(fr"{attr['value']}")}):
                            action(chapter_tag=chapter_tag, tag=tag, rule=rule)
            else:
                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                    action(chapter_tag=chapter_tag, tag=tag, rule=rule)
    def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
        """
        Function
@@ -250,7 +103,8 @@ class HtmlEpubProcessor:
                    if title_of_chapter == text or \
                            (title_of_chapter in text and
                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
-                        self._add_span_to_save_ids_for_links(tag, chapter_tag)
+                        self.html_preprocessor._add_span_to_save_ids_for_links(
                            tag, chapter_tag)
                        tag.extract()
                        return
                    elif not self._remove_headings_content(tag, title_of_chapter):
@@ -350,9 +204,8 @@ class HtmlEpubProcessor:
        # 2.
        self._wrap_strings_with_p(chapter_tag)
        # 3-6.
-        for rule in self.preset:
+        _preprocess_html(
-            action = self.name2action[rule["preset_name"]]
+            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
            self._process_tags(chapter_tag, rule["rules"], action)
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(chapter_tag, title_str)
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -0,0 +1,179 @@
 import re
 import json
 from bs4 import BeautifulSoup, Tag
 from bs4.element import PageElement
 from typing import List, Dict, Union
 from src.util.helpers import BookLogger
 class HtmlPreprocessor:
    """
    Preset-driven rewriter of HTML tags.
    A preset file is a JSON list; every entry has a "preset_name" (a key of
    ``name2action``) and a list of "rules" that select the tags to act on.
    Each action is invoked by ``_process_tags`` with the keyword arguments
    ``body_tag`` (the soup being processed), ``tag`` (the matched tag) and
    ``rule`` (the rule that matched).
    """

    def __init__(self, logger: "BookLogger", preset_path):
        """
        Load presets and build the preset-name -> action dispatch table
        Parameters
        ----------
        logger: BookLogger
            logger object, stored as-is
        preset_path: str
            path to the JSON file with presets
        """
        # The context manager closes the file handle deterministically;
        # JSON documents are read as UTF-8 regardless of the locale.
        with open(preset_path, encoding="utf-8") as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        self.name2action = {
            "wrapper": self._wrap_tag,
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag
        }

    @staticmethod
    def _resolve_soup(kwargs):
        """Function returns the soup passed to an action: "body_tag", or the legacy "chapter_tag" keyword"""
        if "body_tag" in kwargs:
            return kwargs["body_tag"]
        return kwargs["chapter_tag"]

    @staticmethod
    def _wrap_tag(**kwargs):
        """Function wraps the matched tag into a new rule["tag_to_wrap"] tag"""
        soup = HtmlPreprocessor._resolve_soup(kwargs)
        kwargs["tag"].wrap(soup.new_tag(kwargs["rule"]["tag_to_wrap"]))

    @staticmethod
    def _decompose_tag(**kwargs):
        """Function removes the matched tag together with its contents"""
        kwargs["tag"].decompose()

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union["PageElement", "BeautifulSoup"],
                                        chapter_tag: "BeautifulSoup"):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose id (and class) has to survive the removal
        chapter_tag: BeautifulSoup
            soup used to create the new span
        Returns
        -------
        NoReturn
            updated body tag
        """
        tag_id = tag_to_be_removed.attrs.get("id")
        if not tag_id:
            # nothing can link to a tag without id
            return
        anchor_span = chapter_tag.new_tag("span")
        anchor_span.attrs["id"] = tag_id
        anchor_span.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        anchor_span.string = "\xa0"
        tag_to_be_removed.insert_before(anchor_span)

    def _process_tag_using_table(self, **kwargs):
        """
        Function wraps the matched tag into table > tbody > tr > td,
        keeps its id as a span and then unwraps the tag itself
        """
        soup = self._resolve_soup(kwargs)
        tag = kwargs["tag"]
        # missing or empty attributes fall back to the defaults
        width = tag.attrs.get("width") or "100"
        border = tag.attrs.get("border") or None
        bg_color = tag.attrs.get("bgcolor") or None

        table = soup.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody = soup.new_tag("tbody")
        tr = soup.new_tag("tr")
        td = soup.new_tag("td")
        td.attrs["bgcolor"] = bg_color
        # build the nesting from the inside out
        tag.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))

        self._add_span_to_save_ids_for_links(tag, soup)
        tag.unwrap()

    @staticmethod
    def _replace_tag(**kwargs):
        """Function renames the matched tag to rule["tag_to_replace"]"""
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        """
        Function renames an attribute and/or overrides its value
        rule["attr"]["name"] is the attribute to change;
        rule["attr_to_replace"] holds the new "name" and/or the new "value"
        """
        tag, rule = kwargs["tag"], kwargs["rule"]
        attr = rule["attr"]["name"]
        attr_to_replace = rule["attr_to_replace"]["name"]
        attr_value_to_replace = rule["attr_to_replace"]["value"]
        if attr_to_replace:
            # move the value under the new name, optionally overriding it
            tag[attr_to_replace] = tag[attr]
            if attr_value_to_replace:
                tag.attrs[attr_to_replace] = attr_value_to_replace
            del tag[attr]
        elif attr_value_to_replace:
            tag.attrs[attr] = attr_value_to_replace

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Function replaces the matched tag with its contents"""
        kwargs["tag"].unwrap()

    @staticmethod
    def _insert_tag(**kwargs):
        """Function moves all contents of the matched tag into a new rule["tag_to_insert"] subtag"""
        # _process_tags passes the soup as "body_tag"; reading only
        # "chapter_tag" here raised KeyError for every "inserter" rule
        soup = HtmlPreprocessor._resolve_soup(kwargs)
        tag = kwargs["tag"]
        tag_to_insert = soup.new_tag(kwargs["rule"]["tag_to_insert"])
        # insert all items that was in tag to subtag and remove from tag
        # (iterate over a snapshot: extract() mutates tag.contents)
        for content in reversed(list(tag.contents)):
            tag_to_insert.insert(0, content.extract())
        # wrap subtag with items
        tag.append(tag_to_insert)

    @staticmethod
    def _process_tags(body_tag: "BeautifulSoup",
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag, called as
            action(body_tag=..., tag=..., rule=...)
        Returns
        -------
        NoReturn
            Body Tag with processed certain tags
        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get(
                "tags") else rule["condition"]["tags"]
            condition = rule["condition"]
            if not condition:
                # unconditional rule: every tag whose name matches
                for tag in body_tag.find_all([re.compile(name) for name in tags]):
                    action(body_tag=body_tag, tag=tag, rule=rule)
                continue
            for condition_name, condition_value in condition.items():
                if not condition_value:
                    continue
                if condition_name == "parent_tags":
                    # tag names are regexes; anchors are dropped for CSS selectors
                    selector = ', '.join([condition_value + " > " + re.sub(r"[\^$]", "", name)
                                          for name in tags])
                    for tag in body_tag.select(selector):
                        tag.parent.attrs.update(tag.attrs)
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "child_tags":
                    selector = ', '.join([re.sub(r"[\^$]", "", name) + condition_value
                                          for name in tags])
                    for tag in body_tag.select(selector):
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "attrs":
                    for attr in condition["attrs"]:
                        for tag in body_tag.find_all([re.compile(name) for name in tags],
                                                     {attr["name"]: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                # attr replacer
                elif condition_name == "tags":
                    attr = rule["attr"]
                    for tag in body_tag.find_all([re.compile(name) for name in tags],
                                                 {attr['name']: re.compile(fr"{attr['value']}")}):
                        action(body_tag=body_tag, tag=tag, rule=rule)
 def _preprocess_html(html_preprocessor: HtmlPreprocessor, html_soup: BeautifulSoup):
    """
    Function runs every preset of html_preprocessor over html_soup
    Parameters
    ----------
    html_preprocessor: HtmlPreprocessor
        provides the presets, the name -> action table and _process_tags
    html_soup: BeautifulSoup
        soup that is modified in place
    Returns
    -------
    NoReturn
        html_soup with all presets applied
    """
    # NOTE: per-preset logging is intentionally not performed here
    for preset in html_preprocessor.preset:
        handler = html_preprocessor.name2action[preset["preset_name"]]
        html_preprocessor._process_tags(html_soup, preset["rules"], handler)