diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py
index 914b683..2947e9d 100644
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -7,16 +7,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from src.util.helpers import BookLogger
-class HtmlEpubPreprocessor:
- def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
+class HtmlEpubProcessor:
+ def __init__(self, preset_path: str = "presets/presets.json", logger: BookLogger = None):
self.preset = json.load(open(preset_path))
self.logger = logger
- self.name2function = {
- "table_wrapper": self._wrap_tags_with_table,
- "replacer": self._tags_to_correspond_livecarta_tag,
- "attr_replacer": self._replace_attrs_in_tags,
- "unwrapper": self._unwrap_tags,
- "inserter": self._insert_tags_into_correspond_tags
+ self.name2action = {
+ "table_wrapper": self._process_tag_using_table,
+ "replacer": self._replace_tag,
+ "attr_replacer": self._replace_attr,
+ "unwrapper": self._unwrap_tag,
+ "inserter": self._insert_tag
}
@staticmethod
@@ -116,208 +116,103 @@ class HtmlEpubPreprocessor:
p_tag.append(str(node))
node.replace_with(p_tag)
- def _wrap_tags_with_table(self,
- chapter_tag: BeautifulSoup,
- rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
- """
- Function wraps with
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- rules: List[Dict[str, List[str, Dict[str, str]]]]
- list of conditions when fire function
-
- Returns
- -------
- NoReturn
- Chapter Tag with wrapped certain tags with
-
- """
-
+ def _process_tag_using_table(self, **kwargs):
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
- table = chapter_tag.new_tag("table")
+ table = kwargs["chapter_tag"].new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
- chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
- "tr"), chapter_tag.new_tag("td")
+ kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
+ "tr"), kwargs["chapter_tag"].new_tag("td")
td.attrs["bgcolor"] = bg_color
- tag_to_wrap.wrap(td)
+ kwargs["tag"].wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
-
- def process_tag_using_table():
- _wrap_tag_with_table(
- width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
- "width") else "100",
- border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
- "border") else None,
- bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
- self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
- tag_to_wrap.unwrap()
-
- for rule in rules:
- tags = rule["tags"]
- for attr in rule["attrs"]:
- for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
- {attr["name"]: re.compile(fr"{attr['value']}")}):
- process_tag_using_table()
+ _wrap_tag_with_table(
+ width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
+ "width") else "100",
+ border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
+ "border") else None,
+ bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
+ self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
+ kwargs["tag"].unwrap()
@staticmethod
- def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
- rules: List[Dict[str,
- Union[List[str], str, Dict[str,
- Union[str, List[Dict[str, str]]]]]]]):
+ def _replace_tag(**kwargs):
+ tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
+ kwargs["tag"].name = tag_to_replace
+
+ @staticmethod
+ def _replace_attr(**kwargs):
+ attr = kwargs["rule"]["attr"]
+ attr_to_replace = kwargs["rule"]["attr_to_replace"]
+ kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
+ del kwargs["tag"][attr]
+
+ @staticmethod
+ def _unwrap_tag(**kwargs):
+ kwargs["tag"].unwrap()
+
+ @staticmethod
+ def _insert_tag(**kwargs):
+ tag_to_insert = \
+ kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
+ # move every item that was in tag into the subtag, removing it from tag
+ for content in reversed(kwargs["tag"].contents):
+ tag_to_insert.insert(0, content.extract())
+ # append the filled subtag to tag, so it wraps the former contents
+ kwargs["tag"].append(tag_to_insert)
+
+ def _process_tags(self,
+ chapter_tag: BeautifulSoup,
+ rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
+ action):
"""
- Function to replace all tags to correspond LiveCarta tags
+ Function applies the given action to every tag that matches the rules
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
- rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
+ rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
list of conditions when fire function
-
+ action: Callable
+ called as action(chapter_tag=..., tag=..., rule=...) for each matched tag
Returns
-------
NoReturn
- Chapter Tag with all tags replaced with LiveCarta tags
+ Chapter tag with the matched tags processed
"""
for rule in rules:
- tags: List[str] = rule["tags"]
- tag_to_replace: str = rule["tag_to_replace"]
+ tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == "parent_tags":
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
- if tag.parent.select(condition_on_tag[1]):
- tag.name = tag_to_replace
+ for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
+ for tag in tags])):
+ tag.parent.attrs.update(tag.attrs)
+ action(chapter_tag=chapter_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
- if "not" in condition_on_tag[1]:
- if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
- tag.name = tag_to_replace
- else:
- if tag.select(condition_on_tag[1]):
- tag.name = tag_to_replace
+ for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
+ for tag in tags])):
+ action(chapter_tag=chapter_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
- {attr["name"]: re.compile(fr"{attr['value']}")}):
- tag.name = tag_to_replace
+ {attr["name"]: re.compile(fr"{attr['value']}")}):
+ action(chapter_tag=chapter_tag, tag=tag, rule=rule)
+ # attr replacer: match tags that carry rule["attr"], whatever its value
+ elif condition_on_tag[0] == "tags":
+ attr = rule["attr"]
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+ {attr: re.compile(r".*")}):
+ action(chapter_tag=chapter_tag, tag=tag, rule=rule)
else:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
- # todo can cause appearance of \n ...
-> \n
...
\n
(section)
- tag.name = tag_to_replace
-
- @staticmethod
- def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
- """
- Function to replace all tags to correspond LiveCarta tags
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
- list of conditions when fire function
-
- Returns
- -------
- NoReturn
- Chapter Tag with all tags replaced with LiveCarta tags
-
- """
- for rule in rules:
- attr = rule["attr"]
- tags: List[str] = rule["condition"]["tags"]
- attr_to_replace = rule["attr_to_replace"]
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
- {attr: re.compile(r".*")}):
- tag[attr_to_replace] = tag[attr]
- del tag[attr]
-
- def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: List[Dict[str, List[str]]]):
- """
- Function unwrap tags and moves id to span
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- rules: List[Dict[str, List[str]]]
- list of conditions when fire function
-
- Returns
- -------
- NoReturn
- Chapter Tag with unwrapped certain tags
-
- """
- for rule in rules:
- for tag_name in rule["tags"]:
- for tag in chapter_tag.select(tag_name):
- # if tag is a subtag
- if ">" in tag_name:
- tag.parent.attrs.update(tag.attrs)
- self._add_span_to_save_ids_for_links(tag, chapter_tag)
- tag.unwrap()
-
- @staticmethod
- def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
- rules: List[Dict[str,
- Union[List[str], str, Dict[str,
- Union[str, List[Dict[str, str]]]]]]]):
- """
- Function inserts tags into correspond tags
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
- list of conditions when fire function
-
- Returns
- -------
- NoReturn
- Chapter Tag with inserted tags
-
- """
- def insert(tag: Tag):
- tag_to_insert = \
- chapter_tag.new_tag(rule["tag_to_insert"])
- # insert all items that was in tag to subtag and remove from tag
- for content in reversed(tag.contents):
- tag_to_insert.insert(0, content.extract())
- # wrap subtag with items
- tag.append(tag_to_insert)
-
- for rule in rules:
- tags: List[str] = rule["tags"]
- if rule["condition"]:
- for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
- if condition_on_tag[0] == "parent_tags":
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
- if tag.parent.select(condition_on_tag[1]):
- insert(tag)
- elif condition_on_tag[0] == "child_tags":
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
- if "not" in condition_on_tag[1]:
- if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
- tag.unwrap()
- else:
- if tag.select(condition_on_tag[1]):
- tag.unwrap()
- elif condition_on_tag[0] == "attrs":
- for attr in rule["condition"]["attrs"]:
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
- {attr["name"]: re.compile(fr"{attr['value']}")}):
- insert(tag)
- else:
- for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
- insert(tag)
+ action(chapter_tag=chapter_tag, tag=tag, rule=rule)
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
"""
@@ -414,14 +309,14 @@ class HtmlEpubPreprocessor:
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
- def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
+ def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
"""
Function finalise processing/cleaning content
Parameters
----------
title_str: str
- content_tag: Tag, soup object
+ chapter_tag: Tag, soup object
remove_title_from_chapter: bool
@@ -444,18 +339,18 @@ class HtmlEpubPreprocessor:
"""
# 1. remove comments
- self._remove_comments(content_tag)
+ self._remove_comments(chapter_tag)
# 2.
- self._wrap_strings_with_p(content_tag)
+ self._wrap_strings_with_p(chapter_tag)
# 3-6.
for rule in self.preset:
- func = self.name2function[rule["preset_name"]]
- func(content_tag, rule["rules"])
+ action = self.name2action[rule["preset_name"]]
+ self._process_tags(chapter_tag, rule["rules"], action)
# 7.
if remove_title_from_chapter:
- self._remove_headings_content(content_tag, title_str)
+ self._remove_headings_content(chapter_tag, title_str)
# 8.
- self._process_tables(content_tag)
+ self._process_tables(chapter_tag)
# 9. remove classes that weren't created by converter
- self._class_removing(content_tag)
- return content_tag
+ self._class_removing(chapter_tag)
+ return chapter_tag