Add ability to have several conditions on tags

This commit is contained in:
Kiryl
2022-10-19 18:44:34 +03:00
parent 1363940fa2
commit 658f206f5d
2 changed files with 114 additions and 72 deletions

View File

@@ -107,12 +107,10 @@ class HtmlEpubProcessor:
len(text_preparing(tag)) != 0 and len(text_preparing(tag)) != 0 and
re.findall(r"^h[1-5]$", tag.name or chapter_tag.name)) re.findall(r"^h[1-5]$", tag.name or chapter_tag.name))
if title_in_text: if title_in_text:
self.html_preprocessor._add_span_to_save_ids_for_links( self.html_preprocessor.add_span_to_save_ids_for_links(title_in_text[-1], chapter_tag)
title_in_text[-1], chapter_tag)
title_in_text[-1].extract() title_in_text[-1].extract()
elif text_in_title: elif text_in_title:
[self.html_preprocessor._add_span_to_save_ids_for_links( [self.html_preprocessor.add_span_to_save_ids_for_links(tag, chapter_tag) for tag in text_in_title]
tag, chapter_tag) for tag in text_in_title]
[tag.extract() for tag in text_in_title] [tag.extract() for tag in text_in_title]
@staticmethod @staticmethod
@@ -135,12 +133,12 @@ class HtmlEpubProcessor:
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"] del tag.attrs["class"]
def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: def prepare_content(self, title: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
""" """
Function finalise processing/cleaning content Function finalise processing/cleaning content
Parameters Parameters
---------- ----------
title_str: str title: str
chapter_tag: BeautifulSoup, soup object chapter_tag: BeautifulSoup, soup object
@@ -170,7 +168,7 @@ class HtmlEpubProcessor:
self._wrap_strings_with_p(chapter_tag) self._wrap_strings_with_p(chapter_tag)
# 3. # 3.
if remove_title_from_chapter: if remove_title_from_chapter:
self._remove_headings_content(chapter_tag, title_str) self._remove_headings_content(chapter_tag, title)
# 4. # 4.
_process_presets( _process_presets(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag) html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)

View File

@@ -2,7 +2,7 @@ import re
import json import json
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement from bs4.element import PageElement
from typing import List, Dict, Union from typing import List, Set, Dict, Union
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
@@ -16,15 +16,60 @@ class HtmlPresetsProcessor:
"table_wrapper": self._process_tag_using_table, "table_wrapper": self._process_tag_using_table,
"decomposer": self._decompose_tag, "decomposer": self._decompose_tag,
"replacer": self._replace_tag, "replacer": self._replace_tag,
"attrs_remover": self._remove_attrs,
"attr_replacer": self._replace_attr, "attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag, "unwrapper": self._unwrap_tag,
"inserter": self._insert_tag, "inserter": self._insert_tag,
"text_replacer": self._replace_text "text_replacer": self._replace_text
} }
self.conditions = {
"parent_tags": self._tags_with_parent_condition,
"child_tags": self._tags_with_child_condition,
"attrs": self._tags_with_attrs_condition,
"text": self._tags_with_text_condition
}
@staticmethod
def _tags_with_parent_condition(**kwargs):
    """
    Function finds tags that are located inside parents matching a selector
    Parameters
    ----------
    kwargs: dict
        body_tag: soup object, its select() is called with family_condition
        family_condition: selector string describing the parent tags
        tags: regex strings for the names of the tags to look for

    Returns
    -------
    Tuple[bool, List[Tag]]
        flag whether at least one tag was found and the found tags
        (without duplicates, in first-seen order)

    """
    # compile the name patterns once instead of once per matched parent
    name_patterns = [re.compile(tag_name) for tag_name in kwargs["tags"]]
    # dict keys de-duplicate by hash/equality exactly like a set does, but keep
    # insertion order, so the caller applies actions in a reproducible order
    # NOTE(review): bs4 compares tags by markup, so distinct tags with identical
    # markup collapse into one entry here - confirm that is intended
    found_tags: Dict[Tag, None] = {}
    for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]):
        for tag in parent_tag.find_all(name_patterns):
            # setdefault keeps the first-seen object and its position
            found_tags.setdefault(tag, None)
    return bool(found_tags), list(found_tags)
@staticmethod
def _tags_with_child_condition(**kwargs):
    """
    Function finds tags that contain descendants matching a selector
    Parameters
    ----------
    kwargs: dict
        body_tag: soup object that is searched with find_all()
        family_condition: selector string passed to select() of every candidate
        tags: regex strings for the names of the tags to look for

    Returns
    -------
    Tuple[bool, List[Tag]]
        flag whether at least one tag was found and the found tags
        (without duplicates, in first-seen order)

    """
    name_patterns = [re.compile(tag_name) for tag_name in kwargs["tags"]]
    # dict keys de-duplicate by hash/equality exactly like a set does, but keep
    # insertion order, so the caller applies actions in a reproducible order
    # NOTE(review): bs4 compares tags by markup, so distinct tags with identical
    # markup collapse into one entry here - confirm that is intended
    found_tags: Dict[Tag, None] = {}
    for tag in kwargs["body_tag"].find_all(name_patterns):
        # a non-empty select() result means the tag has a matching descendant
        if tag.select(kwargs["family_condition"]):
            found_tags.setdefault(tag, None)
    return bool(found_tags), list(found_tags)
@staticmethod
def _tags_with_attrs_condition(**kwargs):
    """
    Function finds tags by name and by attribute values
    Parameters
    ----------
    kwargs: dict
        body_tag: soup object that is searched with find_all()
        tags: regex strings for the names of the tags to look for
        rule: preset rule, rule["condition"]["attrs"] is a list of dicts
              with "name" (attribute name) and "value" (regex for its value)

    Returns
    -------
    Tuple[bool, List[Tag]]
        flag whether at least one tag was found and the found tags
        (without duplicates, in first-seen order)

    """
    # attribute name -> compiled pattern for its value;
    # when a name is listed more than once the last entry wins
    attr_conditions: Dict[str, re.Pattern] = {
        attr["name"]: re.compile(attr["value"])
        for attr in kwargs["rule"]["condition"]["attrs"]
    }
    name_patterns = [re.compile(tag_name) for tag_name in kwargs["tags"]]
    # dict keys de-duplicate by hash/equality exactly like a set does, but keep
    # insertion order, so the caller applies actions in a reproducible order
    # NOTE(review): bs4 compares tags by markup, so distinct tags with identical
    # markup collapse into one entry here - confirm that is intended
    found_tags: Dict[Tag, None] = dict.fromkeys(
        kwargs["body_tag"].find_all(name_patterns, attrs=attr_conditions))
    return bool(found_tags), list(found_tags)
@staticmethod
def _tags_with_text_condition(**kwargs):
    """
    Function finds tags by name and by the text they contain
    Parameters
    ----------
    kwargs: dict
        body_tag: soup object that is searched with find_all()
        tags: regex strings for the names of the tags to look for
        rule: preset rule, rule["condition"]["text"] is a regex
              that is searched in the text of a tag

    Returns
    -------
    Tuple[bool, List[Tag]]
        flag whether at least one tag was found and the found tags
        (without duplicates, in first-seen order)

    """
    # both patterns are built once instead of once per visited tag;
    # the lookahead succeeds when any of the name patterns matches
    # at some position of the tag name
    name_pattern = re.compile(r"(?=(" + '|'.join(kwargs["tags"]) + r"))")
    text_pattern = re.compile(kwargs["rule"]["condition"]["text"])

    def has_required_name_and_text(candidate):
        return (name_pattern.search(candidate.name)
                and text_pattern.search(candidate.text))

    # dict keys de-duplicate by hash/equality exactly like a set does, but keep
    # insertion order, so the caller applies actions in a reproducible order
    # NOTE(review): bs4 compares tags by markup, so distinct tags with identical
    # markup collapse into one entry here - confirm that is intended
    found_tags: Dict[Tag, None] = dict.fromkeys(
        kwargs["body_tag"].find_all(has_required_name_and_text))
    return bool(found_tags), list(found_tags)
@staticmethod @staticmethod
def _wrap_tag(**kwargs): def _wrap_tag(**kwargs):
kwargs["tag"].wrap(kwargs["body_tag"].new_tag( kwargs["found_tag"].wrap(kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_wrap"]["name"])) kwargs["rule"]["tag_to_wrap"]["name"]))
@staticmethod @staticmethod
@@ -34,13 +79,13 @@ class HtmlPresetsProcessor:
parent_tag.attrs[key] = tag.attrs[key] parent_tag.attrs[key] = tag.attrs[key]
def _decompose_tag(self, **kwargs): def _decompose_tag(self, **kwargs):
if kwargs["tag"].parent: if kwargs["found_tag"].parent:
self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent) self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
kwargs["tag"].decompose() kwargs["found_tag"].decompose()
@staticmethod @staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup], def add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup): chapter_tag: BeautifulSoup):
""" """
Function adds span with id from tag_to_be_removed Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract) because this tag will be removed(unwrapped/extract)
@@ -82,29 +127,33 @@ class HtmlPresetsProcessor:
kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag( kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag(
"tr"), kwargs["body_tag"].new_tag("td") "tr"), kwargs["body_tag"].new_tag("td")
td.attrs["bgcolor"] = bg_color td.attrs["bgcolor"] = bg_color
kwargs["tag"].wrap(td) kwargs["found_tag"].wrap(td)
td.wrap(tr) td.wrap(tr)
tr.wrap(tbody) tr.wrap(tbody)
tbody.wrap(table) tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table return table
_wrap_tag_with_table( _wrap_tag_with_table(
width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get( width=kwargs["found_tag"].attrs["width"] if kwargs["found_tag"].attrs.get(
"width") else "100", "width") else "100",
border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get( border=kwargs["found_tag"].attrs["border"] if kwargs["found_tag"].attrs.get(
"border") else None, "border") else None,
bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None) bg_color=kwargs["found_tag"].attrs["bgcolor"] if kwargs["found_tag"].attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["body_tag"]) self.add_span_to_save_ids_for_links(kwargs["found_tag"], kwargs["body_tag"])
kwargs["tag"].unwrap() kwargs["found_tag"].unwrap()
@staticmethod @staticmethod
def _replace_tag(**kwargs): def _replace_tag(**kwargs):
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"] tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"]
kwargs["tag"].name = tag_to_replace kwargs["found_tag"].name = tag_to_replace
if kwargs["rule"]["tag_to_replace"].get("attrs"): if kwargs["rule"]["tag_to_replace"].get("attrs"):
dict_attributes = {attr["name"]: attr["value"] dict_attributes = {attr["name"]: attr["value"]
for attr in kwargs["rule"]["tag_to_replace"]["attrs"]} for attr in kwargs["rule"]["tag_to_replace"]["attrs"]}
kwargs["tag"].attrs = dict_attributes kwargs["found_tag"].attrs = dict_attributes
@staticmethod
def _remove_attrs(**kwargs):
    """
    Function removes all attributes of the found tag
    Parameters
    ----------
    kwargs: dict
        found_tag: tag whose attrs are replaced with an empty dict

    Returns
    -------
    None

    """
    target_tag = kwargs["found_tag"]
    # rebind attrs to a brand-new empty dict, the previous dict object is left untouched
    target_tag.attrs = dict()
@staticmethod @staticmethod
def _replace_attr(**kwargs): def _replace_attr(**kwargs):
@@ -114,21 +163,21 @@ class HtmlPresetsProcessor:
attr_to_replace, attr_value_to_replace =\ attr_to_replace, attr_value_to_replace =\
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
if attr_to_replace: if attr_to_replace:
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \ kwargs["found_tag"][attr_to_replace] = kwargs["found_tag"][attr_name] \
if kwargs["tag"].get(attr_name)\ if kwargs["found_tag"].get(attr_name)\
else "" else ""
if attr_value_to_replace: if attr_value_to_replace:
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace kwargs["found_tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["tag"][attr_name] del kwargs["found_tag"][attr_name]
elif attr_value_to_replace: elif attr_value_to_replace:
kwargs["tag"].attrs[attr_name] = attr_value_to_replace kwargs["found_tag"].attrs[attr_name] = attr_value_to_replace
elif attr_name: elif attr_name:
del kwargs["tag"][attr_name] del kwargs["found_tag"][attr_name]
def _unwrap_tag(self, **kwargs): def _unwrap_tag(self, **kwargs):
if kwargs["tag"].parent: if kwargs["found_tag"].parent:
self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent) self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
kwargs["tag"].unwrap() kwargs["found_tag"].unwrap()
@staticmethod @staticmethod
def _insert_tag(**kwargs): def _insert_tag(**kwargs):
@@ -138,29 +187,29 @@ class HtmlPresetsProcessor:
kwargs["body_tag"].new_tag( kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes) kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes)
# insert all items that was in tag to subtag and remove from tag # insert all items that was in tag to subtag and remove from tag
for content in reversed(kwargs["tag"].contents): for content in reversed(kwargs["found_tag"].contents):
tag_to_insert.insert(0, content.extract()) tag_to_insert.insert(0, content.extract())
# wrap subtag with items # wrap subtag with items
kwargs["tag"].append(tag_to_insert) kwargs["found_tag"].append(tag_to_insert)
@staticmethod @staticmethod
def _replace_text(**kwargs): def _replace_text(**kwargs):
if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string): if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["found_tag"].string):
new_text = re.sub(re.compile( new_text = re.sub(re.compile(
kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string) kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["found_tag"].string)
kwargs["tag"].string.replace_with(new_text) kwargs["found_tag"].string.replace_with(new_text)
@staticmethod def process_tags(self,
def _process_tags(body_tag: BeautifulSoup, body_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action): action):
""" """
Function does action with tags Function does action with tags
Parameters Parameters
---------- ----------
body_tag: BeautifulSoup body_tag: BeautifulSoup
Tag & contents of the body tag Tag & contents of the body tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
list of conditions when fire function list of conditions when fire function
action: function action: function
action what to do with tag action what to do with tag
@@ -170,39 +219,34 @@ class HtmlPresetsProcessor:
Body Tag with processed certain tags Body Tag with processed certain tags
""" """
for rule in rules: for preset_rule in preset_rules:
tags: List[str] = rule["tags"] if rule.get( tags: List[str] = preset_rule["tags"] if preset_rule.get(
"tags") else rule["condition"]["tags"] "tags") else preset_rule["condition"]["tags"]
if rule["condition"]: found_tags: List[Tag] = []
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): if preset_rule["condition"]:
if condition_on_tag[0] == "parent_tags": conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v)
for parent_tag in body_tag.select(condition_on_tag[1]): for condition_on_tag in conditions_on_tag:
for tag in parent_tag.find_all([re.compile(tag) for tag in tags]): condition_func = self.conditions[condition_on_tag[0]]
# parent_tag != tag.parent was_found, f_tags = condition_func(body_tag=body_tag,
action(body_tag=body_tag, tag=tag, rule=rule) tags=tags,
elif condition_on_tag[0] == "child_tags": rule=preset_rule,
for tag in body_tag.find_all([re.compile(tag) for tag in tags]): family_condition=condition_on_tag[1])
if tag.select(condition_on_tag[1]): found_tags = found_tags + f_tags if was_found else []
action(body_tag=body_tag, tag=tag, rule=rule) if not was_found:
elif condition_on_tag[0] == "attrs": break
for attr in rule["condition"]["attrs"]: # if there are several conditions on tags and found_tags isn't empty
for tag in body_tag.find_all([re.compile(tag) for tag in tags], if len(conditions_on_tag) > 1 and found_tags:
{attr["name"]: re.compile(fr"{attr['value']}")}): # tags satisfying all conditions(>1)
action(body_tag=body_tag, tag=tag, rule=rule) found_tags = [tag for tag in found_tags if found_tags.count(tag) > 1]
elif condition_on_tag[0] == "text": for found_tag in found_tags:
# find all tags that are in List of tags and tags that contains required text action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
for tag in body_tag.find_all(
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))",
t.name) and re.search(re.compile(rule["condition"]["text"]),
t.text)):
action(body_tag=body_tag, tag=tag, rule=rule)
else: else:
for tag in body_tag.find_all([re.compile(tag) for tag in tags]): for found_tag in body_tag.find_all([re.compile(tag) for tag in tags]):
action(body_tag=body_tag, tag=tag, rule=rule) action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup): def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup):
for rule in html_preprocessor.preset: for preset in html_preprocessor.preset:
# html_preprocessor.logger.log(rule["preset_name"].title() + " process.") # html_preprocessor.logger.log(rule["preset_name"].title() + " process.")
action = html_preprocessor.name2action[rule["preset_name"]] action = html_preprocessor.name2action[preset["preset_name"]]
html_preprocessor._process_tags(html_soup, rule["rules"], action) html_preprocessor.process_tags(html_soup, preset["rules"], action)