Merge processing tags[Docx, Epub]

This commit is contained in:
Kiryl
2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions

View File

@@ -5,6 +5,7 @@ from threading import Event
from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.html_preprocessor import HtmlPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.docx_converter.docx2libre_html import Docx2LibreHTML
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):
# 2. Parses and cleans html, gets list of tags, gets footnotes
try:
style_processor = StylePreprocessor()
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
logger=self.logger_object, style_processor=style_processor)
bs_tags, footnotes, top_level_headers = parser.process_html(
html_preprocessor = HtmlPreprocessor(
logger=self.logger_object, preset_path="presets/docx_presets.json")
style_preprocessor = StylePreprocessor()
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
logger=self.logger_object,
html_preprocessor=html_preprocessor,
style_preprocessor=style_preprocessor)
bs_tags, footnotes, top_level_headers = html_processor.process_html(
self.access, html_converter.html_path, self.book_id)
except Exception as exc:
self.logger_object.log(
@@ -84,10 +89,12 @@ if __name__ == "__main__":
html_converter = Docx2LibreHTML(file_path=docx_file_path,
logger=logger_object, libre_locker=locker)
css_processor = StylePreprocessor()
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
style_processor=css_processor, preset_path="../../presets/docx_presets.json")
content, footnotes, top_level_headers = parser.process_html(
html_preprocessor = HtmlPreprocessor(
logger=logger_object, preset_path="../../presets/docx_presets.json")
style_preprocessor = StylePreprocessor()
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
content, footnotes, top_level_headers = html_processor.process_html(
html_path=html_converter.html_path, book_id=html_converter.book_id)
json_converter = LibreHTML2JSONConverter(

View File

@@ -1,32 +1,23 @@
import re
import json
import pathlib
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.html_preprocessor import _preprocess_html
from src.docx_converter.image_processing import process_images
from src.docx_converter.footnotes_processing import process_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
class HTMLDocxProcessor:
def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
style_processor, preset_path: str = "presets/docx_presets.json"):
self.html_soup = html_soup
self.body_tag = html_soup.body
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
self.logger = logger
self.preset = json.load(open(preset_path))
self.style_processor = style_processor
self.name2action = {
"wrapper": self._wrap_tag,
"decomposer": self._decompose_tag,
"replacer": self._replace_tag,
"attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag
}
self.html_soup = html_soup
self.body_tag = self.html_soup.body
self.html_preprocessor = html_preprocessor
self.style_preprocessor = style_preprocessor
def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
f"Check the structure of the file."
f"Tag name: {tag.name}")
def _wrap_tag(self, **kwargs):
kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"]))
@staticmethod
def _decompose_tag(**kwargs):
kwargs["tag"].decompose()
@staticmethod
def _replace_tag(**kwargs):
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
kwargs["tag"].name = tag_to_replace
@staticmethod
def _replace_attr(**kwargs):
attr, attr_value =\
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
attr_to_replace, attr_value_to_replace =\
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
if attr_to_replace:
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
if attr_value_to_replace:
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["tag"][attr]
elif attr_value_to_replace:
kwargs["tag"].attrs[attr] = attr_value_to_replace
@staticmethod
def _unwrap_tag(**kwargs):
kwargs["tag"].unwrap()
@staticmethod
def _process_tags(body_tag: Tag,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action):
"""
Function do action with tags
Parameters
----------
body_tag: Tag
Tag & contents of the chapter tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
list of conditions when fire function
action: function
action what to do with tag
Returns
-------
NoReturn
Body Tag with processed certain tags
"""
for rule in rules:
tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == "parent_tags":
for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
for tag in tags])):
tag.parent.attrs.update(tag.attrs)
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
for tag in tags])):
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
action(body_tag=body_tag, tag=tag, rule=rule)
# attr replacer
elif condition_on_tag[0] == "tags":
attr = rule["attr"]
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr['name']: re.compile(fr"{attr['value']}")}):
action(body_tag=body_tag, tag=tag, rule=rule)
else:
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
action(body_tag=body_tag, tag=tag, rule=rule)
def _process_quotes(self):
"""
Function to process block quotes.
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:
table.replaceWith(new_div)
@staticmethod
def convert_pt_to_px(value: float) -> float:
value = float(value)
if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
else:
return value
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
size = match.group(1)
units = match.group(2)
if units == "pt":
size = self.convert_pt_to_px(size)
value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
else float(size)
size = value
sizes.append(float(size))
width = td.get("width")
td.attrs = {}
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
self.logger.log(f"Processing TOC and headers.")
self._process_toc_links()
for rule in self.preset:
self.logger.log(rule["preset_name"].title() + " process.")
action = self.name2action[rule["preset_name"]]
self._process_tags(self.body_tag, rule["rules"], action)
_preprocess_html(html_preprocessor=self.html_preprocessor,
html_soup=self.html_soup)
# CSS after html processing cause of <fonts> that aren't supported by html
self.logger.log("CSS inline style preprocessing.")
self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
self.style_preprocessor.process_inline_styles_in_html_soup(
self.body_tag)
self.logger.log("CSS inline style processing.")
modify_html_soup_with_css_styles(self.body_tag)

View File

@@ -13,7 +13,7 @@ from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.style_preprocessor import StylePreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.html_epub_processor import HTMLEpubProcessor
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
@@ -21,7 +21,7 @@ from src.tag_inline_style_processor import modify_html_soup_with_css_styles
class EpubConverter:
def __init__(self, book_path, access=None, logger: BookLogger = None,
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None):
self.book_path = book_path
self.access = access
self.logger: BookLogger = logger

View File

@@ -30,13 +30,16 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
style_processor = StylePreprocessor()
html_processor = HtmlEpubProcessor(
logger=self.logger_object)
html_preprocessor = HtmlPreprocessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
style_preprocessor = StylePreprocessor()
html_processor = HTMLEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor)
json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object,
style_processor=style_processor, html_processor=html_processor)
style_processor=style_preprocessor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
return content_dict

View File

@@ -1,58 +1,16 @@
import re
import json
from typing import List, Dict, Union
from typing import Union
from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from src.util.helpers import BookLogger
from src.html_preprocessor import _preprocess_html
class HtmlEpubProcessor:
def __init__(self, preset_path: str = "presets/epub_presets.json", logger: BookLogger = None):
self.preset = json.load(open(preset_path))
class HTMLEpubProcessor:
def __init__(self, logger: BookLogger = None, html_preprocessor=None):
self.logger = logger
self.name2action = {
"table_wrapper": self._process_tag_using_table,
"replacer": self._replace_tag,
"attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag,
"inserter": self._insert_tag
}
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Union[PageElement, BeautifulSoup]
chapter_tag: BeautifulSoup
Returns
-------
NoReturn
updated body tag
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
tag_to_be_removed: Tag,
id_: str,
class_: Union[List[str], str]):
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag: Tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
self.html_preprocessor = html_preprocessor
@staticmethod
def prepare_title(title_of_chapter: str) -> str:
@@ -116,111 +74,6 @@ class HtmlEpubProcessor:
p_tag.append(str(node))
node.replace_with(p_tag)
def _process_tag_using_table(self, **kwargs):
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
table = kwargs["chapter_tag"].new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
"tr"), kwargs["chapter_tag"].new_tag("td")
td.attrs["bgcolor"] = bg_color
kwargs["tag"].wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
_wrap_tag_with_table(
width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
"width") else "100",
border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
"border") else None,
bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
kwargs["tag"].unwrap()
@staticmethod
def _replace_tag(**kwargs):
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
kwargs["tag"].name = tag_to_replace
@staticmethod
def _replace_attr(**kwargs):
attr, attr_value =\
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
attr_to_replace, attr_value_to_replace =\
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
if attr_to_replace:
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
if attr_value_to_replace:
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["tag"][attr]
elif attr_value_to_replace:
kwargs["tag"].attrs[attr] = attr_value_to_replace
@staticmethod
def _unwrap_tag(**kwargs):
kwargs["tag"].unwrap()
@staticmethod
def _insert_tag(**kwargs):
tag_to_insert = \
kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
# insert all items that was in tag to subtag and remove from tag
for content in reversed(kwargs["tag"].contents):
tag_to_insert.insert(0, content.extract())
# wrap subtag with items
kwargs["tag"].append(tag_to_insert)
@staticmethod
def _process_tags(chapter_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action):
"""
Function do action with tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
list of conditions when fire function
action: function
action what to do with tag
Returns
-------
NoReturn
Body Tag with processed certain tags
"""
for rule in rules:
tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == "parent_tags":
for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
for tag in tags])):
tag.parent.attrs.update(tag.attrs)
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
for tag in tags])):
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
# attr replacer
elif condition_on_tag[0] == "tags":
attr = rule["attr"]
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr['name']: re.compile(fr"{attr['value']}")}):
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
else:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
"""
Function
@@ -250,7 +103,8 @@ class HtmlEpubProcessor:
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
self._add_span_to_save_ids_for_links(tag, chapter_tag)
self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag)
tag.extract()
return
elif not self._remove_headings_content(tag, title_of_chapter):
@@ -350,9 +204,8 @@ class HtmlEpubProcessor:
# 2.
self._wrap_strings_with_p(chapter_tag)
# 3-6.
for rule in self.preset:
action = self.name2action[rule["preset_name"]]
self._process_tags(chapter_tag, rule["rules"], action)
_preprocess_html(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
# 7.
if remove_title_from_chapter:
self._remove_headings_content(chapter_tag, title_str)

179
src/html_preprocessor.py Normal file
View File

@@ -0,0 +1,179 @@
import re
import json
from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement
from typing import List, Dict, Union
from src.util.helpers import BookLogger
class HtmlPreprocessor:
    """Apply preset-driven transformations to tags of a parsed HTML document.

    The preset is loaded from a JSON file. As consumed by this module, it is
    an iterable of entries, each with a ``"preset_name"`` (a key of
    ``name2action``) and a ``"rules"`` list that is handed to
    ``_process_tags`` together with the matching action.

    Every action is invoked with the keyword arguments ``body_tag`` (the
    object the search ran on), ``tag`` (the matched tag) and ``rule`` (the
    rule that matched).
    """

    def __init__(self, logger: BookLogger, preset_path):
        """
        Parameters
        ----------
        logger: BookLogger
            stored on the instance as ``self.logger``
        preset_path
            path of the JSON preset file to load

        """
        # Context manager: the preset file handle is closed as soon as the
        # JSON is parsed instead of being left to the garbage collector.
        with open(preset_path) as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        # Maps the "preset_name" of a preset entry to the action run on
        # every tag matched by that entry's rules.
        self.name2action = {
            "wrapper": self._wrap_tag,
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag
        }

    @staticmethod
    def _wrap_tag(**kwargs):
        """Wrap the matched tag in a new ``rule["tag_to_wrap"]`` tag."""
        wrapper = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_wrap"])
        kwargs["tag"].wrap(wrapper)

    @staticmethod
    def _decompose_tag(**kwargs):
        """Remove the matched tag from the tree via ``decompose()``."""
        kwargs["tag"].decompose()

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose id (and class) are copied onto the new span
        chapter_tag: BeautifulSoup
            object used to create the span through ``new_tag``

        Returns
        -------
        NoReturn
            a span is inserted before tag_to_be_removed; nothing is inserted
            when the tag has no truthy "id" attribute

        """
        tag_id = tag_to_be_removed.attrs.get("id")
        if not tag_id:
            return
        anchor: Tag = chapter_tag.new_tag("span")
        anchor.attrs["id"] = tag_id
        anchor.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        # The span gets a non-breaking space as its only content.
        anchor.string = "\xa0"
        tag_to_be_removed.insert_before(anchor)

    def _process_tag_using_table(self, **kwargs):
        """
        Replace the matched tag with a single-cell table holding its contents.

        The tag is wrapped in td > tr > tbody > table, a <br> is inserted
        after the table, a span keeping the tag's id is added before the tag,
        and finally the tag itself is unwrapped. The table takes its width
        (default "100", rendered as a percentage), border and bgcolor from
        the tag's attributes; missing border/bgcolor are passed as None.
        """
        tag, soup = kwargs["tag"], kwargs["body_tag"]
        width = tag.attrs.get("width") or "100"
        border = tag.attrs.get("border") or None
        bg_color = tag.attrs.get("bgcolor") or None

        table = soup.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody = soup.new_tag("tbody")
        tr = soup.new_tag("tr")
        td = soup.new_tag("td")
        td.attrs["bgcolor"] = bg_color

        # Build the nesting from the inside out around the matched tag.
        tag.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))

        self._add_span_to_save_ids_for_links(tag, soup)
        tag.unwrap()

    @staticmethod
    def _replace_tag(**kwargs):
        """Rename the matched tag to ``rule["tag_to_replace"]``."""
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        """
        Rename and/or re-value an attribute of the matched tag.

        ``rule["attr"]["name"]`` is the attribute to change and
        ``rule["attr_to_replace"]`` holds the new "name" and "value".
        With a new name: the old value is copied to the new name, overridden
        by the new value when one is given, and the old attribute is deleted.
        With only a new value: the existing attribute is overwritten.
        """
        tag, rule = kwargs["tag"], kwargs["rule"]
        attr = rule["attr"]["name"]
        attr_to_replace = rule["attr_to_replace"]["name"]
        attr_value_to_replace = rule["attr_to_replace"]["value"]
        if attr_to_replace:
            tag[attr_to_replace] = tag[attr]
            if attr_value_to_replace:
                tag.attrs[attr_to_replace] = attr_value_to_replace
            del tag[attr]
        elif attr_value_to_replace:
            tag.attrs[attr] = attr_value_to_replace

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Replace the matched tag with its own contents via ``unwrap()``."""
        kwargs["tag"].unwrap()

    @staticmethod
    def _insert_tag(**kwargs):
        """
        Move all contents of the matched tag into a new child tag.

        The child is a new ``rule["tag_to_insert"]`` tag; after the call it
        is the only content of the matched tag.
        """
        # _process_tags passes the soup as "body_tag"; "chapter_tag" is still
        # accepted as a fallback for callers using the previous keyword.
        soup = kwargs["body_tag"] if "body_tag" in kwargs else kwargs["chapter_tag"]
        tag = kwargs["tag"]
        tag_to_insert = soup.new_tag(kwargs["rule"]["tag_to_insert"])
        # insert all items that was in tag to subtag and remove from tag
        # (iterate over a snapshot because extract() mutates tag.contents)
        for content in reversed(list(tag.contents)):
            tag_to_insert.insert(0, content.extract())
        # wrap subtag with items
        tag.append(tag_to_insert)

    @staticmethod
    def _process_tags(body_tag: BeautifulSoup,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag; called as
            action(body_tag=..., tag=..., rule=...)

        Returns
        -------
        NoReturn
            Body Tag with processed certain tags

        """
        for rule in rules:
            # Tag name patterns come from the rule itself or, failing that,
            # from its condition.
            tags: List[str] = rule["tags"] if rule.get(
                "tags") else rule["condition"]["tags"]

            if not rule["condition"]:
                # Unconditional rule: every tag whose name matches a pattern.
                for tag in body_tag.find_all([re.compile(name) for name in tags]):
                    action(body_tag=body_tag, tag=tag, rule=rule)
                continue

            # Only condition entries with a truthy value are evaluated.
            for cond_name, cond_value in rule["condition"].items():
                if not cond_value:
                    continue
                if cond_name == "parent_tags":
                    # CSS selector "<parent> > <tag>"; the regex anchors ^/$
                    # are stripped from the tag patterns for use in select().
                    selector = ', '.join(
                        cond_value + " > " + re.sub(r"[\^$]", "", name)
                        for name in tags)
                    for tag in body_tag.select(selector):
                        # The parent receives the matched tag's attributes
                        # before the action runs.
                        tag.parent.attrs.update(tag.attrs)
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif cond_name == "child_tags":
                    selector = ', '.join(
                        re.sub(r"[\^$]", "", name) + cond_value
                        for name in tags)
                    for tag in body_tag.select(selector):
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif cond_name == "attrs":
                    for attr in rule["condition"]["attrs"]:
                        for tag in body_tag.find_all([re.compile(name) for name in tags],
                                                     {attr["name"]: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                # attr replacer
                elif cond_name == "tags":
                    attr = rule["attr"]
                    for tag in body_tag.find_all([re.compile(name) for name in tags],
                                                 {attr['name']: re.compile(fr"{attr['value']}")}):
                        action(body_tag=body_tag, tag=tag, rule=rule)
def _preprocess_html(html_preprocessor: HtmlPreprocessor, html_soup: BeautifulSoup):
for rule in html_preprocessor.preset:
# html_preprocessor.logger.log(rule["preset_name"].title() + " process.")
action = html_preprocessor.name2action[rule["preset_name"]]
html_preprocessor._process_tags(html_soup, rule["rules"], action)