import json
import re
from typing import Dict, List, Union

from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement

from src.util.helpers import BookLogger

# Shape of the "rules" list of one preset entry, as read by ``_process_tags``.
_Rules = List[Dict[str, Union[List[str], str,
                              Dict[str, Union[List[Dict[str, str]], int, str]]]]]


class HtmlPresetsProcessor:
    """Apply preset rules loaded from a JSON file to a BeautifulSoup tree.

    Each preset entry names an action (``preset_name``) and carries a list of
    rules; ``name2action`` maps the action name to the method that performs it.
    Every action is called with the keyword arguments ``body_tag`` (the soup
    used to create new tags), ``tag`` (the matched tag) and ``rule`` (the rule
    that matched).
    """

    def __init__(self, logger: BookLogger, preset_path):
        """Load the preset file and build the action dispatch table.

        Parameters
        ----------
        logger: BookLogger
            logger kept on the instance
        preset_path
            path of the JSON file with presets

        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(preset_path) as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        self.name2action = {
            "wrapper": self._wrap_tag,
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag,
        }

    @staticmethod
    def _wrap_tag(**kwargs):
        """Wrap the matched tag in a new ``rule["tag_to_wrap"]`` tag."""
        wrapper = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_wrap"])
        kwargs["tag"].wrap(wrapper)

    @staticmethod
    def _decompose_tag(**kwargs):
        """Copy the tag's attributes onto its parent, then destroy the tag."""
        tag = kwargs["tag"]
        tag.parent.attrs.update(tag.attrs)
        tag.decompose()

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose id has to survive its removal
        chapter_tag: BeautifulSoup
            soup used to create the new span

        Returns
        -------
        None
            a span is inserted before the tag when the tag has a non-empty id

        """
        tag_id = tag_to_be_removed.attrs.get("id")
        if not tag_id:
            return
        anchor: Tag = chapter_tag.new_tag("span")
        anchor.attrs["id"] = tag_id
        anchor.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        # The span holds a single non-breaking space as its text.
        anchor.string = "\xa0"
        tag_to_be_removed.insert_before(anchor)

    def _process_tag_using_table(self, **kwargs):
        """Wrap the matched tag in a one-cell table, then unwrap the tag itself.

        The table takes ``width``, ``border`` and ``bgcolor`` from the matched
        tag's attributes; a ``<br>`` is inserted after the table and a span
        carrying the tag's id (if any) is inserted before the tag is unwrapped.
        """
        tag = kwargs["tag"]
        body_tag = kwargs["body_tag"]

        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
            table = body_tag.new_tag("table")
            table.attrs["border"] = border
            table.attrs["align"] = "center"
            table.attrs["style"] = f"width:{width}%;"
            tbody = body_tag.new_tag("tbody")
            tr = body_tag.new_tag("tr")
            td = body_tag.new_tag("td")
            td.attrs["bgcolor"] = bg_color
            # Build table > tbody > tr > td > tag from the inside out.
            tag.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table

        # NOTE(review): border/bgcolor are passed as None when the source tag
        # lacks them, so the attributes are still set (to None) on the table
        # and cell - confirm the serialized output is what is intended.
        _wrap_tag_with_table(
            width=tag.attrs.get("width") or "100",
            border=tag.attrs.get("border") or None,
            bg_color=tag.attrs.get("bgcolor") or None)
        self._add_span_to_save_ids_for_links(tag, body_tag)
        tag.unwrap()

    @staticmethod
    def _replace_tag(**kwargs):
        """Rename the matched tag to ``rule["tag_to_replace"]``."""
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        """Rename, re-value or delete an attribute of the matched tag.

        Reads ``rule["attr"]["name"]`` and ``rule["attr_to_replace"]``
        (``name`` / ``value``):

        * new name given: the value is copied to the new attribute (overridden
          by the new value when one is given) and the old attribute is deleted;
        * only a new value given: the existing attribute gets the new value;
        * neither given: the attribute is deleted.
        """
        tag = kwargs["tag"]
        rule = kwargs["rule"]
        attr = rule["attr"]["name"]
        attr_to_replace = rule["attr_to_replace"]["name"]
        attr_value_to_replace = rule["attr_to_replace"]["value"]

        if attr_to_replace:
            tag[attr_to_replace] = tag[attr]
            if attr_value_to_replace:
                tag.attrs[attr_to_replace] = attr_value_to_replace
            # NOTE(review): the old attribute is dropped whenever it is
            # renamed, with or without a new value - confirm against presets.
            del tag[attr]
        elif attr_value_to_replace:
            tag.attrs[attr] = attr_value_to_replace
        elif attr:
            del tag[attr]

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Copy the tag's attributes onto its parent, then unwrap the tag."""
        tag = kwargs["tag"]
        tag.parent.attrs.update(tag.attrs)
        tag.unwrap()

    @staticmethod
    def _insert_tag(**kwargs):
        """Move all contents of the matched tag into a new child tag.

        The child is a new ``rule["tag_to_insert"]`` tag appended to the
        matched tag after the contents have been moved into it.
        """
        tag = kwargs["tag"]
        tag_to_insert = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
        # Walk the contents backwards and always insert at position 0, so the
        # original order is kept while each item is extracted from the tag.
        for content in reversed(tag.contents):
            tag_to_insert.insert(0, content.extract())
        # The matched tag is empty now; the new subtag becomes its only child.
        tag.append(tag_to_insert)

    @staticmethod
    def _process_tags(body_tag: BeautifulSoup,
                      rules: _Rules,
                      action):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function; each rule takes its tag
            name patterns from ``rule["tags"]`` or, when that is empty, from
            ``rule["condition"]["tags"]``
        action: function
            action what to do with tag; called as
            ``action(body_tag=..., tag=..., rule=...)``

        Returns
        -------
        None
            Body Tag with processed certain tags

        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get("tags") \
                else rule["condition"]["tags"]
            # Tag names are regular expressions; compile them once per rule
            # instead of once per lookup.
            tag_patterns = [re.compile(tag_name) for tag_name in tags]

            # .get(): a rule that only supplies top-level "tags" has no
            # "condition" key and must reach the unconditional branch below.
            condition = rule.get("condition")
            if not condition:
                for tag in body_tag.find_all(tag_patterns):
                    action(body_tag=body_tag, tag=tag, rule=rule)
                continue

            for condition_name, condition_value in condition.items():
                if not condition_value:
                    continue
                if condition_name == "parent_tags":
                    # Matching tags anywhere below a selected parent
                    # (not only its direct children).
                    for parent_tag in body_tag.select(condition_value):
                        for tag in parent_tag.find_all(tag_patterns):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "child_tags":
                    for tag in body_tag.find_all(tag_patterns):
                        if tag.select(condition_value):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "attrs":
                    for attr in condition["attrs"]:
                        attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                        for tag in body_tag.find_all(tag_patterns, attr_filter):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                # attr replacer
                elif condition_name == "tags":
                    attr = rule["attr"]
                    attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                    for tag in body_tag.find_all(tag_patterns, attr_filter):
                        action(body_tag=body_tag, tag=tag, rule=rule)


def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup):
    """Run every preset of the processor against the soup, in file order.

    Parameters
    ----------
    html_preprocessor: HtmlPresetsProcessor
        processor holding the loaded presets and the action table
    html_soup: BeautifulSoup
        tree that is modified in place

    """
    for rule in html_preprocessor.preset:
        # html_preprocessor.logger.log(rule["preset_name"].title() + " process.")
        action = html_preprocessor.name2action[rule["preset_name"]]
        html_preprocessor._process_tags(html_soup, rule["rules"], action)