This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/epub_converter/html_epub_processor.py
2022-08-03 16:49:17 +03:00

453 lines
17 KiB
Python

import re
import json
from typing import List, Dict, Union
from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from src.util.helpers import BookLogger
class HtmlEpubPreprocessor:
    """
    Preprocessor that cleans the HTML of a chapter using rules loaded from a JSON preset.

    Every preset entry names one handler of ``name2function`` in ``preset_name``
    and carries the ``rules`` that are passed to that handler.
    """

    # class names that _class_removing leaves on a tag
    _CONVERTER_CLASSES = ("link-anchor", "footnote-element")

    def __init__(self, preset_path: str = "../../presets/presets.json", logger: "BookLogger" = None):
        """
        Function loads the preset and builds the handler dispatch table
        Parameters
        ----------
        preset_path: str
            path to the JSON file with presets
        logger: BookLogger
            stored as ``self.logger``

        """
        # the context manager closes the file even when the JSON is malformed
        with open(preset_path, encoding="utf-8") as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        self.name2function = {
            "table_wrapper": self._wrap_tags_with_table,
            "replacer": self._tags_to_correspond_livecarta_tag,
            "attr_replacer": self._replace_attrs_in_tags,
            "unwrapper": self._unwrap_tags,
            "inserter": self._insert_tags_into_correspond_tags
        }

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union["PageElement", "BeautifulSoup"],
                                        chapter_tag: "BeautifulSoup"):
        """
        Function inserts a span carrying the id (and class) of tag_to_be_removed
        right before it, because the tag itself is about to be unwrapped/extracted.
        Nothing happens when the tag has no id.
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose id has to stay reachable for links
        chapter_tag: BeautifulSoup
            object used to create the new span

        Returns
        -------
        NoReturn
            span is inserted into the tree in place

        """
        id_ = tag_to_be_removed.attrs.get("id")
        if not id_:
            return
        # only a BeautifulSoup object has new_tag(); _remove_headings_content
        # recurses with a plain Tag, where ``.new_tag`` resolves to find("new_tag")
        factory = chapter_tag if isinstance(chapter_tag, BeautifulSoup) \
            else BeautifulSoup(features="lxml")
        span = factory.new_tag("span")
        span.attrs["id"] = id_
        span.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        # non-breaking space keeps the span from being empty
        span.string = "\xa0"
        tag_to_be_removed.insert_before(span)

    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
        """
        Function finalise processing/cleaning title
        Parameters
        ----------
        title_of_chapter: str
            title, possibly containing markup

        Returns
        -------
        title: str
            title text with every whitespace character replaced by a space
            and both ends stripped

        """
        soup = BeautifulSoup(title_of_chapter, features="lxml")
        title = soup.string
        if title is None:
            # .string is None for an empty title or a title with nested markup
            title = soup.get_text()
        # replace each whitespace character ([\r\n\t\f\v ] and nbsp) with a space
        return re.sub(r"[\s\xa0]", " ", title).strip()

    @staticmethod
    def _remove_comments(chapter_tag: "BeautifulSoup"):
        """
        Function removes comments
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag without comments

        """
        # one pass over all strings; also reaches comments that are direct children
        for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

    @staticmethod
    def _wrap_strings_with_p(chapter_tag: "BeautifulSoup"):
        """
        Function wraps non-blank strings that are direct children of the chapter tag with <p>
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag with wrapped NavigableStrings

        """
        # iterate over a snapshot: replace_with() edits chapter_tag.contents
        for node in list(chapter_tag.contents):
            if not isinstance(node, NavigableString):
                continue
            if re.sub(r"([\s\xa0])", " ", str(node)).strip():
                p_tag = chapter_tag.new_tag("p")
                p_tag.append(str(node))
                node.replace_with(p_tag)

    def _wrap_tags_with_table(self,
                              chapter_tag: "BeautifulSoup",
                              rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
        """
        Function wraps <tag> with <table> and then unwraps the tag itself
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, List[str, Dict[str, str]]]]
            every rule has "tags" (name patterns) and "attrs"
            (list of {"name", "value"} where value is a regular expression)

        Returns
        -------
        NoReturn
            Chapter Tag with wrapped certain tags with <table>

        """
        def _wrap_tag_with_table(tag_to_wrap, width="100", border="", bg_color=None):
            """Function builds table > tbody > tr > td around tag_to_wrap and adds <br> after the table"""
            table = chapter_tag.new_tag("table")
            table.attrs["border"] = border
            table.attrs["align"] = "center"
            # NOTE(review): width is always emitted as a percentage, whatever
            # unit the source attribute used - confirm with preset authors
            table.attrs["style"] = f"width:{width}%;"
            tbody = chapter_tag.new_tag("tbody")
            tr = chapter_tag.new_tag("tr")
            td = chapter_tag.new_tag("td")
            td.attrs["bgcolor"] = bg_color
            tag_to_wrap.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table

        for rule in rules:
            name_patterns = [re.compile(name) for name in rule["tags"]]
            for attr in rule["attrs"]:
                attr_filter = {attr["name"]: re.compile(fr"{attr['value']}")}
                for tag_to_wrap in chapter_tag.find_all(name_patterns, attr_filter):
                    # missing border/bgcolor are passed as None;
                    # _process_tables later sets a missing border to "1"
                    _wrap_tag_with_table(
                        tag_to_wrap,
                        width=tag_to_wrap.attrs.get("width") or "100",
                        border=tag_to_wrap.attrs.get("border") or None,
                        bg_color=tag_to_wrap.attrs.get("bgcolor") or None)
                    self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
                    tag_to_wrap.unwrap()

    @staticmethod
    def _iter_tags_matching_rule(chapter_tag: "BeautifulSoup", rule: Dict):
        """
        Function lazily yields the tags selected by rule["tags"] and rule["condition"].
        Every search runs only when the generator reaches it, so changes made by
        the consumer between two yields are visible to the following checks.
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rule: Dict
            rule with "tags" (name patterns) and "condition"
            (falsy, or dict with optional "parent_tags", "child_tags", "attrs")

        Returns
        -------
        Generator of matching tags

        """
        name_patterns = [re.compile(name) for name in rule["tags"]]
        condition = rule["condition"]
        if not condition:
            # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
            yield from chapter_tag.find_all(name_patterns)
            return
        for key, value in condition.items():
            if not value:
                continue
            if key == 'parent_tags':
                # value is a CSS selector evaluated on the parent of the tag
                for tag in chapter_tag.find_all(name_patterns):
                    if tag.parent.select(value):
                        yield tag
            elif key == 'child_tags':
                # "(", ")", ":" and "not" are stripped from the selector and
                # the tag matches when it has NO descendant for the rest
                selector = re.sub('[():]|not', '', value)
                for tag in chapter_tag.find_all(name_patterns):
                    if not tag.select(selector):
                        yield tag
            elif key == "attrs":
                for attr in value:
                    yield from chapter_tag.find_all(
                        name_patterns, {attr["name"]: re.compile(fr"{attr['value']}")})

    @staticmethod
    def _tags_to_correspond_livecarta_tag(chapter_tag: "BeautifulSoup",
                                          rules: List[Dict[str,
                                                           Union[List[str], str, Dict[str,
                                                                                      Union[str, List[Dict[str, str]]]]]]]):
        """
        Function to replace all tags to correspond LiveCarta tags
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
            every rule has "tags", "tag_to_replace" (new tag name) and "condition"

        Returns
        -------
        NoReturn
            Chapter Tag with all tags replaced with LiveCarta tags

        """
        for rule in rules:
            tag_to_replace: str = rule["tag_to_replace"]
            for tag in HtmlEpubPreprocessor._iter_tags_matching_rule(chapter_tag, rule):
                tag.name = tag_to_replace

    @staticmethod
    def _replace_attrs_in_tags(chapter_tag: "BeautifulSoup", rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
        """
        Function renames an attribute of matching tags, keeping its value
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
            every rule has "attr" (old name), "attr_to_replace" (new name)
            and "condition" with "tags" (name patterns)

        Returns
        -------
        NoReturn
            Chapter Tag with renamed attributes

        """
        for rule in rules:
            attr = rule["attr"]
            attr_to_replace = rule["attr_to_replace"]
            name_patterns = [re.compile(name) for name in rule["condition"]["tags"]]
            for tag in chapter_tag.find_all(name_patterns, {attr: re.compile(r".*")}):
                # pop first, then assign: the value survives even when both names are equal
                tag[attr_to_replace] = tag.attrs.pop(attr)

    def _unwrap_tags(self, chapter_tag: "BeautifulSoup", rules: Dict[str, List[str]]):
        """
        Function unwrap tags and moves id to span
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: Dict[str, List[str]]
            dict with "tags": list of CSS selectors of tags to unwrap

        Returns
        -------
        NoReturn
            Chapter Tag with unwrapped certain tags

        """
        for selector in rules["tags"]:
            # a child combinator means the tag is a subtag:
            # its attributes are copied to the parent before unwrapping
            is_subtag = ">" in selector
            for tag in chapter_tag.select(selector):
                if is_subtag:
                    tag.parent.attrs.update(tag.attrs)
                self._add_span_to_save_ids_for_links(tag, chapter_tag)
                tag.unwrap()

    @staticmethod
    def _insert_tags_into_correspond_tags(chapter_tag: "BeautifulSoup",
                                          rules: List[Dict[str,
                                                           Union[List[str], str, Dict[str,
                                                                                      Union[str, List[Dict[str, str]]]]]]]):
        """
        Function inserts tags into correspond tags:
        all contents of a matching tag are moved into a new child tag
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
            every rule has "tags", "tag_to_insert" (name of the new child) and "condition"

        Returns
        -------
        NoReturn
            Chapter Tag with inserted tags

        """
        for rule in rules:
            for tag in HtmlEpubPreprocessor._iter_tags_matching_rule(chapter_tag, rule):
                tag_to_insert = chapter_tag.new_tag(rule["tag_to_insert"])
                # move all items of the tag into the subtag, keeping their order
                for content in list(tag.contents):
                    tag_to_insert.append(content.extract())
                tag.append(tag_to_insert)

    def _remove_headings_content(self, chapter_tag: Union["BeautifulSoup", "PageElement"], title_of_chapter: str):
        """
        Function
        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
        - adds span with id of the removed tag in order to keep links to it working
        Only the first child with non-blank text is examined; blank children are skipped.
        Parameters
        ----------
        chapter_tag: Union[BeautifulSoup, PageElement]
            Tag of the page
        title_of_chapter: str
            Chapter title

        Returns
        -------
        NoReturn
            clean/remove headings & add span with id

        """
        title_of_chapter = title_of_chapter.lower()
        for tag in chapter_tag.contents:
            text: str = tag if isinstance(tag, NavigableString) else tag.text
            if not re.sub(r"[\s\xa0]", "", text):
                continue
            if isinstance(tag, NavigableString):
                # NOTE(review): the first non-blank string is removed without being
                # compared to the title - confirm this is intended
                tag.extract()
                return
            text = re.sub(r"[\s\xa0]", " ", text).lower()
            text = text.strip()  # delete extra spaces
            if title_of_chapter == text or \
                    (title_of_chapter in text and
                     re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
                self._add_span_to_save_ids_for_links(tag, chapter_tag)
                tag.extract()
                return
            # the recursive call never returns a value, so the search always
            # stops after descending into the first non-blank tag
            self._remove_headings_content(tag, title_of_chapter)
            return

    @staticmethod
    def _process_tables(chapter_tag: "BeautifulSoup"):
        """
        Function preprocesses tables and tags(td|th|tr):
        - copies a pt/px width from style to the width attribute when the attribute is empty
        - removes "border:0;" from style and drops a style that became blank
        - sets border="1" on tables without border or with border 0
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag with processed tables

        """
        for table in chapter_tag.find_all("table"):
            # exact names: a regex filter is applied with search() and
            # "td|th|tr" would also hit e.g. <thead> or <strong>
            for t_tag in table.find_all(["td", "th", "tr"]):
                width = ""
                if t_tag.get("style"):
                    # lookbehind skips min-width/max-width/border-width
                    # but still matches a style that starts with "width:"
                    width_match = re.search(
                        r"(?<!-)width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
                    if width_match:
                        # the number is reused as px without converting pt
                        width = width_match.group(1) + "px"
                # width attribute is always set; it is "" when nothing was found
                t_tag.attrs["width"] = t_tag.get("width") or width
                if t_tag.attrs.get("style"):
                    t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                        "border:0;", "")
                    if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
                        del t_tag.attrs["style"]
            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
                table.attrs["border"] = "1"

    @staticmethod
    def _class_removing(chapter_tag: "BeautifulSoup"):
        """
        Function removes classes that aren't created by converter
        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        NoReturn
            Chapter Tag without original classes of the book

        """
        keep = HtmlEpubPreprocessor._CONVERTER_CLASSES
        for tag in chapter_tag.find_all(recursive=True):
            class_value = tag.attrs.get("class")
            if not class_value:
                continue
            # class is a list on parsed tags and a str when it was set in code
            is_str = isinstance(class_value, str)
            names = class_value.split() if is_str else list(class_value)
            kept = [name for name in names if name in keep]
            if kept == names:
                continue
            if kept:
                tag.attrs["class"] = " ".join(kept) if is_str else kept
            else:
                del tag.attrs["class"]

    def prepare_content(self, title_str: str, content_tag: "BeautifulSoup", remove_title_from_chapter: bool) -> "Tag":
        """
        Function finalise processing/cleaning content
        Parameters
        ----------
        title_str: str
            chapter title, used when the heading has to be removed
        content_tag: Tag, soup object
            modified in place
        remove_title_from_chapter: bool
            whether step 7 runs

        Steps
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
        3-6. preset handlers in the order of the preset file:
            wrap tags with <table>
            replace tags with correspond LiveCarta tags
            replace attributes
            unwrap tags
            insert tags into correspond tags
        7. heading removal
        8. process_tables
        9. class removal

        Returns
        -------
        content_tag: Tag
            prepared content (the same object that was passed in)

        """
        # 1. remove comments
        self._remove_comments(content_tag)
        # 2. wrap top-level strings with <p>
        self._wrap_strings_with_p(content_tag)
        # 3-6. apply preset rules
        for rule in self.preset:
            handler = self.name2function[rule["preset_name"]]
            handler(content_tag, rule["rules"])
        # 7. remove heading that duplicates the title
        if remove_title_from_chapter:
            self._remove_headings_content(content_tag, title_str)
        # 8. tables
        self._process_tables(content_tag)
        # 9. remove classes that weren't created by converter
        self._class_removing(content_tag)
        return content_tag