forked from LiveCarta/BookConverter
453 lines
17 KiB
Python
453 lines
17 KiB
Python
import re
|
|
import json
|
|
from typing import List, Dict, Union
|
|
from bs4.element import PageElement
|
|
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
|
|
|
from src.util.helpers import BookLogger
|
|
|
|
|
|
class HtmlEpubPreprocessor:
|
|
def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
    """
    Function loads presets and registers the handlers the presets refer to
    Parameters
    ----------
    preset_path: str
        path to the JSON file with presets, its parsed content is stored in self.preset
    logger: BookLogger
        logger object, stored in self.logger

    """
    # the context manager closes the file handle (a bare open() call leaves it open)
    with open(preset_path, encoding="utf-8") as preset_file:
        self.preset = json.load(preset_file)
    self.logger = logger
    # "preset_name" value from the preset file -> method which applies the rules of that preset
    self.name2function = {
        "table_wrapper": self._wrap_tags_with_table,
        "replacer": self._tags_to_correspond_livecarta_tag,
        "attr_replacer": self._replace_attrs_in_tags,
        "unwrapper": self._unwrap_tags,
        "inserter": self._insert_tags_into_correspond_tags,
    }
|
|
|
|
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                    chapter_tag: BeautifulSoup):
    """
    Function puts a <span> with the id of tag_to_be_removed right before that tag,
    because the tag itself is going to be removed (unwrapped/extracted)
    Parameters
    ----------
    tag_to_be_removed: Union[PageElement, BeautifulSoup]
        tag whose id (and class) is copied to the new span
    chapter_tag: BeautifulSoup
        soup object used to create the new span

    Returns
    -------
    None
        tree is updated in place; nothing is inserted when the tag has no id

    """
    anchor_id = tag_to_be_removed.attrs.get("id")
    if not anchor_id:
        return

    anchor: Tag = chapter_tag.new_tag("span")
    anchor.attrs["id"] = anchor_id
    # class is copied as is, an absent class becomes an empty string
    anchor.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
    # non-breaking space is the only content of the span
    anchor.string = "\xa0"
    tag_to_be_removed.insert_before(anchor)
|
|
|
|
@staticmethod
def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title
    Parameters
    ----------
    title_of_chapter: str
        raw title, may contain html markup

    Returns
    -------
    title: str
        cleaned title: markup is dropped, every whitespace character is replaced
        with a space, leading/trailing spaces are stripped

    """
    soup = BeautifulSoup(title_of_chapter, features="lxml")
    title = soup.string
    if title is None:
        # .string is None for an empty title and for a title with several text nodes
        # (e.g. "Chapter <i>One</i>"); get_text() joins all of them instead of crashing
        title = soup.get_text()
    # replace whitespace characters ([\r\n\t\f\v ] and \xa0) with a plain space
    return re.sub(r"[\s\xa0]", " ", title).strip()
|
|
|
|
@staticmethod
def _remove_comments(chapter_tag: BeautifulSoup):
    """
    Function removes html comments
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag without comments (modified in place)

    """
    # one search over the whole tree: it also reaches comments that are direct children
    # of chapter_tag and does not scan every nested subtree again for each tag
    for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
|
|
|
|
@staticmethod
def _wrap_strings_with_p(chapter_tag: BeautifulSoup):
    """
    Function wraps text nodes which are direct children of the chapter tag with <p>
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag with wrapped NavigableStrings

    """
    for node in chapter_tag:
        if not isinstance(node, NavigableString):
            continue
        # strings that consist only of whitespace/\xa0 are left untouched
        visible_text = re.sub(r"([\s\xa0])", " ", str(node)).strip()
        if not visible_text:
            continue
        paragraph = chapter_tag.new_tag("p")
        # the original (not the cleaned) text goes into the paragraph
        paragraph.append(str(node))
        node.replace_with(paragraph)
|
|
|
|
def _wrap_tags_with_table(self,
                          chapter_tag: BeautifulSoup,
                          rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
    """
    Function wraps <tag> with <table>
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: List[Dict[str, List[str, Dict[str, str]]]]
        list of conditions when fire function: every rule has "tags" (regexes for tag names)
        and "attrs" (dicts with attribute "name" and regex "value")

    Returns
    -------
    None
        Chapter Tag with wrapped certain tags with <table>

    """

    def _wrap_tag_with_table(tag_to_wrap: Tag, width: str = "100", border: str = "",
                             bg_color: str = None) -> Tag:
        """Function builds table > tbody > tr > td around tag_to_wrap and adds <br> after the table"""
        table = chapter_tag.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        # NOTE(review): width is always treated as a percentage, values like "300px" or "50%"
        # would produce an invalid style - confirm which values may come from books
        table.attrs["style"] = f"width:{width}%;"
        tbody = chapter_tag.new_tag("tbody")
        tr = chapter_tag.new_tag("tr")
        td = chapter_tag.new_tag("td")
        if bg_color:
            # attribute is written only when there is a value, a None value
            # would be rendered as a valueless "bgcolor" attribute
            td.attrs["bgcolor"] = bg_color
        tag_to_wrap.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        # <br> is created by a separate lxml soup, the same way as before
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def process_tag_using_table(tag_to_wrap: Tag):
        """Function moves look of the tag to the table, keeps its id in span and unwraps the tag"""
        _wrap_tag_with_table(
            tag_to_wrap,
            width=tag_to_wrap.attrs.get("width") or "100",
            # empty string (default of the helper) instead of None when there is no border
            border=tag_to_wrap.attrs.get("border") or "",
            bg_color=tag_to_wrap.attrs.get("bgcolor") or None)
        self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
        tag_to_wrap.unwrap()

    for rule in rules:
        # patterns of tag names are the same for all attrs of the rule
        name_patterns = [re.compile(tag) for tag in rule["tags"]]
        for attr in rule["attrs"]:
            for tag_to_wrap in chapter_tag.find_all(name_patterns,
                                                    {attr["name"]: re.compile(fr"{attr['value']}")}):
                process_tag_using_table(tag_to_wrap)
|
|
|
|
@staticmethod
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
                                      rules: List[Dict[str,
                                                       Union[List[str], str, Dict[str,
                                                                                  Union[str, List[Dict[str, str]]]]]]]):
    """
    Function to replace all tags to correspond LiveCarta tags
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
        list of conditions when fire function: every rule has "tags" (regexes for tag names),
        "tag_to_replace" (new tag name) and optional "condition"
        with keys "parent_tags", "child_tags", "attrs"

    Returns
    -------
    None
        Chapter Tag with all tags replaced with LiveCarta tags

    """

    def matching_tags(rule):
        """Function lazily yields tags selected by the rule (tag may be yielded more than once)"""
        name_patterns = [re.compile(tag_name) for tag_name in rule["tags"]]
        condition = rule.get("condition")
        if not condition:
            # rule without condition is applied to every tag with matching name
            yield from chapter_tag.find_all(name_patterns)
            return
        for condition_name, condition_value in condition.items():
            if not condition_value:
                continue
            if condition_name == "parent_tags":
                for tag in chapter_tag.find_all(name_patterns):
                    if tag.parent.select(condition_value):
                        yield tag
            elif condition_name == "child_tags":
                # only the ":not(...)" wrapper is removed: ":not(code, kbd)" -> "code, kbd";
                # names which contain "not" (e.g. "footnote") stay intact
                positive_selector = re.sub(r":not\(([^()]*)\)", r"\1", condition_value)
                for tag in chapter_tag.find_all(name_patterns):
                    if not tag.select(positive_selector):
                        yield tag
            elif condition_name == "attrs":
                for attr in condition_value:
                    yield from chapter_tag.find_all(name_patterns,
                                                    {attr["name"]: re.compile(fr"{attr['value']}")})

    for rule in rules:
        tag_to_replace: str = rule["tag_to_replace"]
        for tag in matching_tags(rule):
            # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
            tag.name = tag_to_replace
|
|
|
|
@staticmethod
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
    """
    Function renames attributes of tags: value of the old attribute is moved to the new one
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
        list of conditions when fire function: every rule has "attr" (attribute to rename),
        "attr_to_replace" (new attribute name) and "condition" with "tags" (regexes for tag names)

    Returns
    -------
    None
        Chapter Tag with renamed attributes

    """
    for rule in rules:
        old_name = rule["attr"]
        name_patterns = [re.compile(tag_name) for tag_name in rule["condition"]["tags"]]
        new_name = rule["attr_to_replace"]
        # any value of the old attribute is accepted
        tags_with_attr = chapter_tag.find_all(name_patterns, {old_name: re.compile(r".*")})
        for tag in tags_with_attr:
            tag[new_name] = tag[old_name]
            del tag[old_name]
|
|
|
|
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
    """
    Function unwraps tags and moves their ids to spans
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: Dict[str, List[str]]
        dict of tags to unwrap: "tags" is a list of CSS selectors

    Returns
    -------
    None
        Chapter Tag with unwrapped certain tags

    """
    for selector in rules["tags"]:
        # "parent > child" selector: attributes of the child are copied to its parent
        selects_subtag = ">" in selector
        for tag in chapter_tag.select(selector):
            if selects_subtag:
                tag.parent.attrs.update(tag.attrs)
            self._add_span_to_save_ids_for_links(tag, chapter_tag)
            tag.unwrap()
|
|
|
|
@staticmethod
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
                                      rules: List[Dict[str,
                                                       Union[List[str], str, Dict[str,
                                                                                  Union[str, List[Dict[str, str]]]]]]]):
    """
    Function inserts tags into correspond tags
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
        list of conditions when fire function: every rule has "tags" (regexes for tag names),
        "tag_to_insert" (name of the new inner tag) and optional "condition"
        with keys "parent_tags", "child_tags", "attrs"

    Returns
    -------
    None
        Chapter Tag with inserted tags

    """

    def insert(tag: Tag, tag_to_insert_name: str):
        """Function moves all contents of the tag into a new subtag, which becomes its only child"""
        tag_to_insert = chapter_tag.new_tag(tag_to_insert_name)
        # copy of contents is iterated because extract() changes tag.contents
        for content in list(tag.contents):
            tag_to_insert.append(content.extract())
        tag.append(tag_to_insert)

    def matching_tags(rule):
        """Function lazily yields tags selected by the rule (tag may be yielded more than once)"""
        name_patterns = [re.compile(tag_name) for tag_name in rule["tags"]]
        condition = rule.get("condition")
        if not condition:
            # rule without condition is applied to every tag with matching name
            yield from chapter_tag.find_all(name_patterns)
            return
        for condition_name, condition_value in condition.items():
            if not condition_value:
                continue
            if condition_name == "parent_tags":
                for tag in chapter_tag.find_all(name_patterns):
                    if tag.parent.select(condition_value):
                        yield tag
            elif condition_name == "child_tags":
                # only the ":not(...)" wrapper is removed: ":not(code, kbd)" -> "code, kbd";
                # names which contain "not" (e.g. "footnote") stay intact
                positive_selector = re.sub(r":not\(([^()]*)\)", r"\1", condition_value)
                for tag in chapter_tag.find_all(name_patterns):
                    if not tag.select(positive_selector):
                        yield tag
            elif condition_name == "attrs":
                for attr in condition_value:
                    yield from chapter_tag.find_all(name_patterns,
                                                    {attr["name"]: re.compile(fr"{attr['value']}")})

    for rule in rules:
        for tag in matching_tags(rule):
            # name of the tag is passed explicitly instead of reading the loop variable in closure
            insert(tag, rule["tag_to_insert"])
|
|
|
|
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
|
"""
|
|
Function
|
|
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
|
- adds span with id (via _add_span_to_save_ids_for_links) before the heading is extracted
|
|
Parameters
|
|
----------
|
|
chapter_tag: Union[BeautifulSoup, PageElement]
|
|
Tag of the page
|
|
title_of_chapter: str
|
|
Chapter title
|
|
|
|
Returns
|
|
-------
|
|
None
|
|
clean/remove headings & add span with id
|
|
|
|
"""
|
|
# title and text of every node are lower-cased, so the comparison is case-insensitive
title_of_chapter = title_of_chapter.lower()
|
|
for tag in chapter_tag.contents:
|
|
tag: PageElement
|
|
text: str = tag if isinstance(tag, NavigableString) else tag.text
|
|
# true only when the node has something besides whitespace/\xa0
if re.sub(r"[\s\xa0]", "", text):
|
|
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
|
text = text.strip() # delete extra spaces
|
|
if not isinstance(tag, NavigableString):
|
|
# tag is removed when its text equals the title,
# or when the title is a part of the text and the tag is h1-h3
if title_of_chapter == text or \
|
|
(title_of_chapter in text and
|
|
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
|
|
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
|
tag.extract()
|
|
return
|
|
# NOTE(review): every return in this function is bare, so the recursive call gives None,
# "not None" is always True and the loop always breaks after the recursion
elif not self._remove_headings_content(tag, title_of_chapter):
|
|
break
|
|
# NOTE(review): indentation is lost here, so it is not visible which statement this else
# belongs to; presumably "if not isinstance(tag, NavigableString)" - TODO confirm
else:
|
|
tag.extract()
|
|
return
|
|
|
|
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup):
    """
    Function preprocesses tables and tags(td|th|tr)
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag with processed tables: td/th/tr get "width" attribute
        (own value, value from style or empty string), "border:0;" is removed from their style,
        table with absent or zero border gets border="1"

    """
    # (?:^|[^-]) skips "max-width"/"min-width", but matches style which starts with "width"
    width_pattern = re.compile(r"(?:^|[^-])width: ?(\d+\.?\d*)(p[tx])")
    for table in chapter_tag.find_all("table"):
        # list of exact names: regex "td|th|tr" also matched names like "strong" or "thead"
        for t_tag in table.find_all(["td", "th", "tr"]):
            width = ""
            style = t_tag.get("style")
            if style:
                width_match = width_pattern.search(style)
                if width_match:
                    # NOTE(review): number is taken as is, pt values are labelled as px without conversion
                    width = width_match.group(1) + "px"

            # own width attribute of the tag has priority over the width from style
            t_tag.attrs["width"] = t_tag.get("width") or width

            if style:
                style = style.replace("border:0;", "")
                if re.sub(r"[\s\xa0]", "", style) == "":
                    # nothing is left in style
                    del t_tag.attrs["style"]
                else:
                    t_tag.attrs["style"] = style

        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
|
|
|
|
@staticmethod
def _class_removing(chapter_tag: BeautifulSoup):
    """
    Function removes classes that aren't created by converter
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag without original classes of the book

    """
    # kept as a list: class value may be a list itself and a list can't be looked up in a set
    classes_to_keep = ["link-anchor", "footnote-element"]
    for tag in chapter_tag.find_all(recursive=True):
        class_value = tag.attrs.get("class")
        if not class_value:
            continue
        # NOTE(review): class value is compared as a whole, so a list value like
        # ["link-anchor"] is not equal to "link-anchor" and is removed - confirm it is intended
        if class_value not in classes_to_keep:
            del tag.attrs["class"]
|
|
|
|
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
    """
    Function finalise processing/cleaning content
    Parameters
    ----------
    title_str: str
        title of the chapter, used for heading removal
    content_tag: Tag, soup object
        content of the chapter, it is modified in place
    remove_title_from_chapter: bool
        whether heading removal (step 7) is done

    Steps
    ----------
    1. comments removal
    2. wrap NavigableString with tag <p>
    3-6. presets from self.preset in the order of the preset file:
        wrap tags with <table>
        replace tags with correspond LiveCarta tags
        replace attributes
        unwrap tags
        insert tags into correspond tags
    7. heading removal
    8. process_tables
    9. class removal

    Returns
    -------
    content_tag: Tag
        prepared content

    """
    # 1-2. comments go first, then bare text is wrapped with <p>
    self._remove_comments(content_tag)
    self._wrap_strings_with_p(content_tag)

    # 3-6. every preset names its handler with "preset_name"
    for preset in self.preset:
        handler = self.name2function[preset["preset_name"]]
        handler(content_tag, preset["rules"])

    # 7. only on request of the caller
    if remove_title_from_chapter:
        self._remove_headings_content(content_tag, title_str)

    # 8-9. tables, then classes that weren't created by converter
    self._process_tables(content_tag)
    self._class_removing(content_tag)
    return content_tag
|