# Forked from LiveCarta/BookConverter (Python source, 184 lines, 7.9 KiB).
import re
|
|
import json
|
|
from bs4 import BeautifulSoup, Tag
|
|
from bs4.element import PageElement
|
|
from typing import List, Dict, Union
|
|
|
|
from src.util.helpers import BookLogger
|
|
|
|
|
|
class HtmlPresetsProcessor:
    """
    Applies preset rules, loaded from a JSON file, to tags of a parsed HTML document.

    Every preset entry names an action (a key of ``name2action``) and carries
    the rules selecting the tags that action is applied to. All actions modify
    the parsed tree in place.
    """

    def __init__(self, logger: BookLogger, preset_path):
        """
        Function loads presets and builds the action dispatch table
        Parameters
        ----------
        logger: BookLogger
            logger stored on the instance
        preset_path
            path to the JSON file with presets

        """
        # The context manager closes the file handle deterministically;
        # the encoding is explicit so loading does not depend on the locale.
        with open(preset_path, encoding="utf-8") as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        # Maps the "preset_name" of a preset entry to the method implementing it.
        self.name2action = {
            "wrapper": self._wrap_tag,
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag,
        }

    @staticmethod
    def _wrap_tag(**kwargs):
        """Function wraps the tag into a new tag named by rule["tag_to_wrap"]"""
        wrapper = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_wrap"])
        kwargs["tag"].wrap(wrapper)

    @staticmethod
    def _decompose_tag(**kwargs):
        """Function copies the tag's attributes onto its parent, then destroys the tag"""
        tag = kwargs["tag"]
        tag.parent.attrs.update(tag.attrs)
        tag.decompose()

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose id (and class) is copied onto the inserted span
        chapter_tag: BeautifulSoup
            soup object used to create the new span

        Returns
        -------
        None
            the tree is updated in place; nothing is inserted when the tag has no id

        """
        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
                                               tag_to_be_removed: Tag,
                                               id_: str,
                                               class_: Union[List[str], str]):
            """Function inserts a span carrying id/class right before the tag"""
            new_tag: Tag = chapter_tag.new_tag("span")
            new_tag.attrs["id"] = id_ or ""
            new_tag.attrs["class"] = class_ or ""
            # Non-breaking space as the span's only content.
            new_tag.string = "\xa0"
            tag_to_be_removed.insert_before(new_tag)

        if tag_to_be_removed.attrs.get("id"):
            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
                                               tag_to_be_removed=tag_to_be_removed,
                                               id_=tag_to_be_removed.attrs["id"],
                                               class_=tag_to_be_removed.attrs.get("class"))

    def _process_tag_using_table(self, **kwargs):
        """
        Function replaces the tag with a one-cell table holding the tag's content

        The tag is wrapped into table > tbody > tr > td, a <br> is placed after
        the table, a span keeping the tag's id is inserted before the tag, and
        finally the tag itself is unwrapped.
        """
        tag = kwargs["tag"]
        body_tag = kwargs["body_tag"]

        def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
            """Function builds the table around the tag and returns the table"""
            table = body_tag.new_tag("table")
            table.attrs["border"] = border
            table.attrs["align"] = "center"
            table.attrs["style"] = f"width:{width}%;"
            tbody = body_tag.new_tag("tbody")
            tr = body_tag.new_tag("tr")
            td = body_tag.new_tag("td")
            td.attrs["bgcolor"] = bg_color
            # Wrap from the inside out: tag -> td -> tr -> tbody -> table.
            tag.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table

        # Missing or empty attributes fall back to "100" (width) and None.
        _wrap_tag_with_table(
            width=tag.attrs.get("width") or "100",
            border=tag.attrs.get("border") or None,
            bg_color=tag.attrs.get("bgcolor") or None)
        self._add_span_to_save_ids_for_links(tag, body_tag)
        tag.unwrap()

    @staticmethod
    def _replace_tag(**kwargs):
        """Function renames the tag to rule["tag_to_replace"]"""
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        """
        Function renames, revalues or removes an attribute of the tag

        - rule["attr_to_replace"]["name"] is set: the attribute is renamed,
          taking rule["attr_to_replace"]["value"] as the value when given
        - only rule["attr_to_replace"]["value"] is set: the value is replaced
        - neither is set: the attribute rule["attr"]["name"] is removed
        """
        tag, rule = kwargs["tag"], kwargs["rule"]
        attr = rule["attr"]["name"]
        attr_to_replace = rule["attr_to_replace"]["name"]
        attr_value_to_replace = rule["attr_to_replace"]["value"]
        if attr_to_replace:
            tag[attr_to_replace] = tag[attr]
            if attr_value_to_replace:
                tag.attrs[attr_to_replace] = attr_value_to_replace
            # When both names are equal, deleting would drop the attribute
            # that was just written, so only delete on a real rename.
            if attr_to_replace != attr:
                del tag[attr]
        elif attr_value_to_replace:
            tag.attrs[attr] = attr_value_to_replace
        elif attr:
            del tag[attr]

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Function copies the tag's attributes onto its parent, then unwraps the tag"""
        tag = kwargs["tag"]
        tag.parent.attrs.update(tag.attrs)
        tag.unwrap()

    @staticmethod
    def _insert_tag(**kwargs):
        """Function moves the tag's content into a new rule["tag_to_insert"] subtag"""
        tag = kwargs["tag"]
        tag_to_insert = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
        # Iterate over a snapshot: extract() removes items from tag.contents.
        for content in list(tag.contents):
            tag_to_insert.append(content.extract())
        # The subtag becomes the only child of the tag.
        tag.append(tag_to_insert)

    @staticmethod
    def _process_tags(body_tag: BeautifulSoup,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag; called as
            action(body_tag=..., tag=..., rule=...)

        Returns
        -------
        None
            the body tag is processed in place

        """
        for rule in rules:
            condition = rule.get("condition")
            tags: List[str] = rule["tags"] if rule.get("tags") else condition["tags"]
            # Compiled once per rule instead of once per find_all call.
            tag_patterns = [re.compile(tag) for tag in tags]

            if not condition:
                # No condition: the action fires on every tag matching by name.
                for tag in body_tag.find_all(tag_patterns):
                    action(body_tag=body_tag, tag=tag, rule=rule)
                continue

            # Every non-empty condition entry is processed independently.
            for condition_name, condition_value in condition.items():
                if not condition_value:
                    continue
                if condition_name == "parent_tags":
                    # Matching tags anywhere below parents found by CSS selector.
                    for parent_tag in body_tag.select(condition_value):
                        for tag in parent_tag.find_all(tag_patterns):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "child_tags":
                    # Matching tags having a descendant found by CSS selector.
                    for tag in body_tag.find_all(tag_patterns):
                        if tag.select(condition_value):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "attrs":
                    # Matching tags whose attribute value matches the regex.
                    for attr in condition_value:
                        for tag in body_tag.find_all(
                                tag_patterns,
                                {attr["name"]: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                elif condition_name == "tags":
                    # attr replacer: the attribute filter comes from rule["attr"]
                    attr = rule["attr"]
                    for tag in body_tag.find_all(
                            tag_patterns,
                            {attr["name"]: re.compile(fr"{attr['value']}")}):
                        action(body_tag=body_tag, tag=tag, rule=rule)
|
|
|
|
|
|
def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup):
    """
    Function runs every loaded preset against the parsed document
    Parameters
    ----------
    html_preprocessor: HtmlPresetsProcessor
        processor holding the presets and the name -> action table
    html_soup: BeautifulSoup
        parsed document handed to the processor's _process_tags

    Returns
    -------
    None

    """
    for preset in html_preprocessor.preset:
        # An unknown preset name raises KeyError from the table lookup.
        preset_name = preset["preset_name"]
        handler = html_preprocessor.name2action[preset_name]
        preset_rules = preset["rules"]
        html_preprocessor._process_tags(html_soup, preset_rules, handler)
|