forked from LiveCarta/BookConverter
Merge all preset functions in 1 [Epub]
This commit is contained in:
@@ -7,16 +7,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
|
|
||||||
|
|
||||||
class HtmlEpubPreprocessor:
|
class HtmlEpubProcessor:
|
||||||
def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
|
def __init__(self, preset_path: str = "presets/presets.json", logger: BookLogger = None):
|
||||||
self.preset = json.load(open(preset_path))
|
self.preset = json.load(open(preset_path))
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.name2function = {
|
self.name2action = {
|
||||||
"table_wrapper": self._wrap_tags_with_table,
|
"table_wrapper": self._process_tag_using_table,
|
||||||
"replacer": self._tags_to_correspond_livecarta_tag,
|
"replacer": self._replace_tag,
|
||||||
"attr_replacer": self._replace_attrs_in_tags,
|
"attr_replacer": self._replace_attr,
|
||||||
"unwrapper": self._unwrap_tags,
|
"unwrapper": self._unwrap_tag,
|
||||||
"inserter": self._insert_tags_into_correspond_tags
|
"inserter": self._insert_tag
|
||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -116,208 +116,103 @@ class HtmlEpubPreprocessor:
|
|||||||
p_tag.append(str(node))
|
p_tag.append(str(node))
|
||||||
node.replace_with(p_tag)
|
node.replace_with(p_tag)
|
||||||
|
|
||||||
def _wrap_tags_with_table(self,
|
def _process_tag_using_table(self, **kwargs):
|
||||||
chapter_tag: BeautifulSoup,
|
|
||||||
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
|
|
||||||
"""
|
|
||||||
Function wraps <tag> with <table>
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, List[str, Dict[str, str]]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Chapter Tag with wrapped certain tags with <table>
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
|
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
|
||||||
table = chapter_tag.new_tag("table")
|
table = kwargs["chapter_tag"].new_tag("table")
|
||||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||||
= border, "center", f"width:{width}%;"
|
= border, "center", f"width:{width}%;"
|
||||||
tbody, tr, td = \
|
tbody, tr, td = \
|
||||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
|
kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
|
||||||
"tr"), chapter_tag.new_tag("td")
|
"tr"), kwargs["chapter_tag"].new_tag("td")
|
||||||
td.attrs["bgcolor"] = bg_color
|
td.attrs["bgcolor"] = bg_color
|
||||||
tag_to_wrap.wrap(td)
|
kwargs["tag"].wrap(td)
|
||||||
td.wrap(tr)
|
td.wrap(tr)
|
||||||
tr.wrap(tbody)
|
tr.wrap(tbody)
|
||||||
tbody.wrap(table)
|
tbody.wrap(table)
|
||||||
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def process_tag_using_table():
|
|
||||||
_wrap_tag_with_table(
|
_wrap_tag_with_table(
|
||||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
|
width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
|
||||||
"width") else "100",
|
"width") else "100",
|
||||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
|
border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
|
||||||
"border") else None,
|
"border") else None,
|
||||||
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
|
||||||
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
|
||||||
tag_to_wrap.unwrap()
|
kwargs["tag"].unwrap()
|
||||||
|
|
||||||
for rule in rules:
|
|
||||||
tags = rule["tags"]
|
|
||||||
for attr in rule["attrs"]:
|
|
||||||
for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
|
||||||
process_tag_using_table()
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
|
def _replace_tag(**kwargs):
|
||||||
rules: List[Dict[str,
|
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
|
||||||
Union[List[str], str, Dict[str,
|
kwargs["tag"].name = tag_to_replace
|
||||||
Union[str, List[Dict[str, str]]]]]]]):
|
|
||||||
"""
|
|
||||||
Function to replace all tags to correspond LiveCarta tags
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Chapter Tag with all tags replaced with LiveCarta tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
for rule in rules:
|
|
||||||
tags: List[str] = rule["tags"]
|
|
||||||
tag_to_replace: str = rule["tag_to_replace"]
|
|
||||||
if rule["condition"]:
|
|
||||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
|
||||||
if condition_on_tag[0] == "parent_tags":
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
|
||||||
if tag.parent.select(condition_on_tag[1]):
|
|
||||||
tag.name = tag_to_replace
|
|
||||||
elif condition_on_tag[0] == "child_tags":
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
|
||||||
if "not" in condition_on_tag[1]:
|
|
||||||
if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
|
|
||||||
tag.name = tag_to_replace
|
|
||||||
else:
|
|
||||||
if tag.select(condition_on_tag[1]):
|
|
||||||
tag.name = tag_to_replace
|
|
||||||
elif condition_on_tag[0] == "attrs":
|
|
||||||
for attr in rule["condition"]["attrs"]:
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
|
||||||
tag.name = tag_to_replace
|
|
||||||
else:
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
|
||||||
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
|
|
||||||
tag.name = tag_to_replace
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
|
def _replace_attr(**kwargs):
|
||||||
"""
|
attr = kwargs["rule"]["attr"]
|
||||||
Function to replace all tags to correspond LiveCarta tags
|
attr_to_replace = kwargs["rule"]["attr_to_replace"]
|
||||||
Parameters
|
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
|
||||||
----------
|
del kwargs["tag"][attr]
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Chapter Tag with all tags replaced with LiveCarta tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
for rule in rules:
|
|
||||||
attr = rule["attr"]
|
|
||||||
tags: List[str] = rule["condition"]["tags"]
|
|
||||||
attr_to_replace = rule["attr_to_replace"]
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr: re.compile(r".*")}):
|
|
||||||
tag[attr_to_replace] = tag[attr]
|
|
||||||
del tag[attr]
|
|
||||||
|
|
||||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: List[Dict[str, List[str]]]):
|
|
||||||
"""
|
|
||||||
Function unwrap tags and moves id to span
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, List[str]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Chapter Tag with unwrapped certain tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
for rule in rules:
|
|
||||||
for tag_name in rule["tags"]:
|
|
||||||
for tag in chapter_tag.select(tag_name):
|
|
||||||
# if tag is a subtag
|
|
||||||
if ">" in tag_name:
|
|
||||||
tag.parent.attrs.update(tag.attrs)
|
|
||||||
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
|
||||||
tag.unwrap()
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
|
def _unwrap_tag(**kwargs):
|
||||||
rules: List[Dict[str,
|
kwargs["tag"].unwrap()
|
||||||
Union[List[str], str, Dict[str,
|
|
||||||
Union[str, List[Dict[str, str]]]]]]]):
|
|
||||||
"""
|
|
||||||
Function inserts tags into correspond tags
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
|
|
||||||
Returns
|
@staticmethod
|
||||||
-------
|
def _insert_tag(**kwargs):
|
||||||
NoReturn
|
|
||||||
Chapter Tag with inserted tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
def insert(tag: Tag):
|
|
||||||
tag_to_insert = \
|
tag_to_insert = \
|
||||||
chapter_tag.new_tag(rule["tag_to_insert"])
|
kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
|
||||||
# insert all items that was in tag to subtag and remove from tag
|
# insert all items that was in tag to subtag and remove from tag
|
||||||
for content in reversed(tag.contents):
|
for content in reversed(kwargs["tag"].contents):
|
||||||
tag_to_insert.insert(0, content.extract())
|
tag_to_insert.insert(0, content.extract())
|
||||||
# wrap subtag with items
|
# wrap subtag with items
|
||||||
tag.append(tag_to_insert)
|
kwargs["tag"].append(tag_to_insert)
|
||||||
|
|
||||||
|
def _process_tags(self,
|
||||||
|
chapter_tag: BeautifulSoup,
|
||||||
|
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
|
||||||
|
action):
|
||||||
|
"""
|
||||||
|
Function do action with tags
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
|
||||||
|
list of conditions when fire function
|
||||||
|
action: function
|
||||||
|
action what to do with tag
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
NoReturn
|
||||||
|
Body Tag with processed certain tags
|
||||||
|
|
||||||
|
"""
|
||||||
for rule in rules:
|
for rule in rules:
|
||||||
tags: List[str] = rule["tags"]
|
tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
|
||||||
if rule["condition"]:
|
if rule["condition"]:
|
||||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||||
if condition_on_tag[0] == "parent_tags":
|
if condition_on_tag[0] == "parent_tags":
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
|
||||||
if tag.parent.select(condition_on_tag[1]):
|
for tag in tags])):
|
||||||
insert(tag)
|
tag.parent.attrs.update(tag.attrs)
|
||||||
|
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
||||||
elif condition_on_tag[0] == "child_tags":
|
elif condition_on_tag[0] == "child_tags":
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
|
||||||
if "not" in condition_on_tag[1]:
|
for tag in tags])):
|
||||||
if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
|
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
||||||
tag.unwrap()
|
|
||||||
else:
|
|
||||||
if tag.select(condition_on_tag[1]):
|
|
||||||
tag.unwrap()
|
|
||||||
elif condition_on_tag[0] == "attrs":
|
elif condition_on_tag[0] == "attrs":
|
||||||
for attr in rule["condition"]["attrs"]:
|
for attr in rule["condition"]["attrs"]:
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||||
insert(tag)
|
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
||||||
|
# attr replacer
|
||||||
|
elif condition_on_tag[0] == "tags":
|
||||||
|
attr = rule["attr"]
|
||||||
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
|
{attr: re.compile(r".*")}):
|
||||||
|
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
||||||
else:
|
else:
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
insert(tag)
|
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
||||||
|
|
||||||
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
||||||
"""
|
"""
|
||||||
@@ -414,14 +309,14 @@ class HtmlEpubPreprocessor:
|
|||||||
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
||||||
del tag.attrs["class"]
|
del tag.attrs["class"]
|
||||||
|
|
||||||
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
|
def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
|
||||||
"""
|
"""
|
||||||
Function finalise processing/cleaning content
|
Function finalise processing/cleaning content
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
title_str: str
|
title_str: str
|
||||||
|
|
||||||
content_tag: Tag, soup object
|
chapter_tag: Tag, soup object
|
||||||
|
|
||||||
remove_title_from_chapter: bool
|
remove_title_from_chapter: bool
|
||||||
|
|
||||||
@@ -444,18 +339,18 @@ class HtmlEpubPreprocessor:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
# 1. remove comments
|
# 1. remove comments
|
||||||
self._remove_comments(content_tag)
|
self._remove_comments(chapter_tag)
|
||||||
# 2.
|
# 2.
|
||||||
self._wrap_strings_with_p(content_tag)
|
self._wrap_strings_with_p(chapter_tag)
|
||||||
# 3-6.
|
# 3-6.
|
||||||
for rule in self.preset:
|
for rule in self.preset:
|
||||||
func = self.name2function[rule["preset_name"]]
|
action = self.name2action[rule["preset_name"]]
|
||||||
func(content_tag, rule["rules"])
|
self._process_tags(chapter_tag, rule["rules"], action)
|
||||||
# 7.
|
# 7.
|
||||||
if remove_title_from_chapter:
|
if remove_title_from_chapter:
|
||||||
self._remove_headings_content(content_tag, title_str)
|
self._remove_headings_content(chapter_tag, title_str)
|
||||||
# 8.
|
# 8.
|
||||||
self._process_tables(content_tag)
|
self._process_tables(chapter_tag)
|
||||||
# 9. remove classes that weren't created by converter
|
# 9. remove classes that weren't created by converter
|
||||||
self._class_removing(content_tag)
|
self._class_removing(chapter_tag)
|
||||||
return content_tag
|
return chapter_tag
|
||||||
|
|||||||
Reference in New Issue
Block a user