Annotations in Epub converter

This commit is contained in:
Kiryl
2022-08-03 14:39:13 +03:00
parent 7453029295
commit 78e3ad8911
16 changed files with 259 additions and 192 deletions

View File

@@ -1,14 +1,16 @@
import re
import json
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from typing import List, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import PageElement
from src.util.helpers import BookLogger
class HtmlEpubPreprocessor:
def __init__(self, preset_path="../../presets/presets.json", logger=None):
def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
self.preset = json.load(open(preset_path))
self.logger: BookLogger = logger
self.logger = logger
self.name2function = {
"table_wrapper": self._wrap_tags_with_table,
"replacer": self._tags_to_correspond_livecarta_tag,
@@ -18,33 +20,37 @@ class HtmlEpubPreprocessor:
}
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed (unwrapped/extracted)
Parameters
----------
tag_to_be_removed: Soup object
tag_to_be_removed: Union[PageElement, BeautifulSoup]
chapter_tag: BeautifulSoup
Returns
-------
None
NoReturn
updated body tag
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
class_: list):
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
tag_to_be_removed: Tag,
id_: str,
class_: Union[List[str], str]):
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag: Tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
@@ -78,7 +84,7 @@ class HtmlEpubPreprocessor:
Returns
-------
None
NoReturn
Chapter Tag without comments
"""
@@ -110,27 +116,32 @@ class HtmlEpubPreprocessor:
p_tag.append(str(node))
node.replace_with(p_tag)
def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
def _wrap_tags_with_table(self,
chapter_tag: BeautifulSoup,
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]
list of conditions under which the function fires
Returns
-------
None
NoReturn
Chapter Tag with certain tags wrapped with <table>
"""
def _wrap_tag_with_table(width="100", border="", bg_color=None):
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
"tr"), chapter_tag.new_tag("td")
td.attrs["bgcolor"] = bg_color
tag_to_wrap.wrap(td)
td.wrap(tr)
@@ -141,8 +152,10 @@ class HtmlEpubPreprocessor:
def process_tag_using_table():
_wrap_tag_with_table(
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
"width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
"border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap()
@@ -155,23 +168,26 @@ class HtmlEpubPreprocessor:
process_tag_using_table()
@staticmethod
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]):
"""
Function to replace all tags with the corresponding LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
list of conditions under which the function fires
Returns
-------
None
NoReturn
Chapter Tag with all tags replaced with LiveCarta tags
"""
for rule in rules:
tags = rule["tags"]
tag_to_replace = rule["tag_to_replace"]
tags: List[str] = rule["tags"]
tag_to_replace: str = rule["tag_to_replace"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
@@ -193,40 +209,44 @@ class HtmlEpubPreprocessor:
tag.name = tag_to_replace
@staticmethod
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
"""
Function to rename attributes in tags: the value of attr is moved to attr_to_replace and attr is deleted
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
list of conditions under which the function fires
Returns
-------
None
NoReturn
Chapter Tag with attributes replaced in the matching tags
"""
for rule in rules:
attr = rule["attr"]
tags = rule["condition"]["tags"]
tags: List[str] = rule["condition"]["tags"]
attr_to_replace = rule["attr_to_replace"]
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr: re.compile(r".*")}):
{attr: re.compile(r".*")}):
tag[attr_to_replace] = tag[attr]
del tag[attr]
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
"""
Function unwraps tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: Dict[str, List[str]]
dict of tags to unwrap
Returns
-------
None
NoReturn
Chapter Tag with certain tags unwrapped
"""
@@ -239,21 +259,23 @@ class HtmlEpubPreprocessor:
tag.unwrap()
@staticmethod
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]):
"""
Function inserts tags into the corresponding tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
list of conditions under which the function fires
Returns
-------
None
NoReturn
Chapter Tag with inserted tags
"""
def insert(tag):
def insert(tag: Tag):
tag_to_insert = \
chapter_tag.new_tag(rule["tag_to_insert"])
# insert all items that were in the tag into the subtag and remove them from the tag
@@ -263,7 +285,7 @@ class HtmlEpubPreprocessor:
tag.append(tag_to_insert)
for rule in rules:
tags = rule["tags"]
tags: List[str] = rule["tags"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
@@ -283,29 +305,28 @@ class HtmlEpubPreprocessor:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
insert(tag)
def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
"""
Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
chapter_tag: soup object
chapter_tag: Union[BeautifulSoup, PageElement]
Tag of the page
title_of_chapter: str
Chapter title
Returns
-------
None
NoReturn
clean/remove headings & add span with id
"""
title_of_chapter = title_of_chapter.lower()
if title_of_chapter == "chapter 1":
pass
for tag in chapter_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
tag: PageElement
text: str = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
@@ -333,7 +354,7 @@ class HtmlEpubPreprocessor:
Returns
-------
None
NoReturn
Chapter Tag with processed tables
"""
@@ -370,7 +391,7 @@ class HtmlEpubPreprocessor:
Returns
-------
None
NoReturn
Chapter Tag without original classes of the book
"""
@@ -413,9 +434,9 @@ class HtmlEpubPreprocessor:
# 2.
self._wrap_strings_with_p(content_tag)
# 3-6.
for dict in self.preset:
func = self.name2function[dict["preset_name"]]
func(content_tag, dict['rules'])
for rule in self.preset:
func = self.name2function[rule["preset_name"]]
func(content_tag, rule['rules'])
# 7.
if remove_title_from_chapter:
self._remove_headings_content(content_tag, title_str)