forked from LiveCarta/BookConverter
Annotations in Epub converter
This commit is contained in:
@@ -1,14 +1,16 @@
|
||||
import re
|
||||
import json
|
||||
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
|
||||
from typing import List, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||
from bs4.element import PageElement
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
|
||||
|
||||
class HtmlEpubPreprocessor:
|
||||
def __init__(self, preset_path="../../presets/presets.json", logger=None):
|
||||
def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
|
||||
self.preset = json.load(open(preset_path))
|
||||
self.logger: BookLogger = logger
|
||||
self.logger = logger
|
||||
self.name2function = {
|
||||
"table_wrapper": self._wrap_tags_with_table,
|
||||
"replacer": self._tags_to_correspond_livecarta_tag,
|
||||
@@ -18,33 +20,37 @@ class HtmlEpubPreprocessor:
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
|
||||
chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function adds span with id from tag_to_be_removed
|
||||
because this tag will be removed(unwrapped/extract)
|
||||
Parameters
|
||||
----------
|
||||
tag_to_be_removed: Soup object
|
||||
tag_to_be_removed: Union[PageElement, BeautifulSoup]
|
||||
|
||||
chapter_tag: BeautifulSoup
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
updated body tag
|
||||
|
||||
"""
|
||||
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
|
||||
class_: list):
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
|
||||
tag_to_be_removed: Tag,
|
||||
id_: str,
|
||||
class_: Union[List[str], str]):
|
||||
"""Function inserts span before tag aren't supported by LiveCarta"""
|
||||
new_tag = chapter_tag.new_tag("span")
|
||||
new_tag: Tag = chapter_tag.new_tag("span")
|
||||
new_tag.attrs["id"] = id_ or ""
|
||||
new_tag.attrs["class"] = class_ or ""
|
||||
new_tag.string = "\xa0"
|
||||
tag_to_be_removed.insert_before(new_tag)
|
||||
|
||||
if tag_to_be_removed.attrs.get("id"):
|
||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
|
||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
|
||||
tag_to_be_removed=tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs["id"],
|
||||
class_=tag_to_be_removed.attrs.get("class"))
|
||||
|
||||
@@ -78,7 +84,7 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag without comments
|
||||
|
||||
"""
|
||||
@@ -110,27 +116,32 @@ class HtmlEpubPreprocessor:
|
||||
p_tag.append(str(node))
|
||||
node.replace_with(p_tag)
|
||||
|
||||
def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
|
||||
def _wrap_tags_with_table(self,
|
||||
chapter_tag: BeautifulSoup,
|
||||
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
|
||||
"""
|
||||
Function wraps <tag> with <table>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, List[str, Dict[str, str]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with wrapped certain tags with <table>
|
||||
|
||||
"""
|
||||
|
||||
def _wrap_tag_with_table(width="100", border="", bg_color=None):
|
||||
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
|
||||
table = chapter_tag.new_tag("table")
|
||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||
= border, "center", f"width:{width}%;"
|
||||
tbody, tr, td = \
|
||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
|
||||
"tr"), chapter_tag.new_tag("td")
|
||||
td.attrs["bgcolor"] = bg_color
|
||||
tag_to_wrap.wrap(td)
|
||||
td.wrap(tr)
|
||||
@@ -141,8 +152,10 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
def process_tag_using_table():
|
||||
_wrap_tag_with_table(
|
||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
|
||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
|
||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
|
||||
"width") else "100",
|
||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
|
||||
"border") else None,
|
||||
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
||||
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
||||
tag_to_wrap.unwrap()
|
||||
@@ -155,23 +168,26 @@ class HtmlEpubPreprocessor:
|
||||
process_tag_using_table()
|
||||
|
||||
@staticmethod
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
|
||||
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]):
|
||||
"""
|
||||
Function to replace all tags to correspond LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
"""
|
||||
for rule in rules:
|
||||
tags = rule["tags"]
|
||||
tag_to_replace = rule["tag_to_replace"]
|
||||
tags: List[str] = rule["tags"]
|
||||
tag_to_replace: str = rule["tag_to_replace"]
|
||||
if rule["condition"]:
|
||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||
if condition_on_tag[0] == 'parent_tags':
|
||||
@@ -193,40 +209,44 @@ class HtmlEpubPreprocessor:
|
||||
tag.name = tag_to_replace
|
||||
|
||||
@staticmethod
|
||||
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
|
||||
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
|
||||
"""
|
||||
Function to replace all tags to correspond LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
"""
|
||||
for rule in rules:
|
||||
attr = rule["attr"]
|
||||
tags = rule["condition"]["tags"]
|
||||
tags: List[str] = rule["condition"]["tags"]
|
||||
attr_to_replace = rule["attr_to_replace"]
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||
{attr: re.compile(r".*")}):
|
||||
{attr: re.compile(r".*")}):
|
||||
tag[attr_to_replace] = tag[attr]
|
||||
del tag[attr]
|
||||
|
||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
|
||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
|
||||
"""
|
||||
Function unwrap tags and moves id to span
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: Dict[str, List[str]]
|
||||
dict of tags to unwrap
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with unwrapped certain tags
|
||||
|
||||
"""
|
||||
@@ -239,21 +259,23 @@ class HtmlEpubPreprocessor:
|
||||
tag.unwrap()
|
||||
|
||||
@staticmethod
|
||||
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
|
||||
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]):
|
||||
"""
|
||||
Function inserts tags into correspond tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with inserted tags
|
||||
|
||||
"""
|
||||
def insert(tag):
|
||||
def insert(tag: Tag):
|
||||
tag_to_insert = \
|
||||
chapter_tag.new_tag(rule["tag_to_insert"])
|
||||
# insert all items that was in tag to subtag and remove from tag
|
||||
@@ -263,7 +285,7 @@ class HtmlEpubPreprocessor:
|
||||
tag.append(tag_to_insert)
|
||||
|
||||
for rule in rules:
|
||||
tags = rule["tags"]
|
||||
tags: List[str] = rule["tags"]
|
||||
if rule["condition"]:
|
||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||
if condition_on_tag[0] == 'parent_tags':
|
||||
@@ -283,29 +305,28 @@ class HtmlEpubPreprocessor:
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
insert(tag)
|
||||
|
||||
def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
|
||||
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
||||
"""
|
||||
Function
|
||||
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
||||
- adds span with id in order to
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: soup object
|
||||
chapter_tag: Union[BeautifulSoup, PageElement]
|
||||
Tag of the page
|
||||
title_of_chapter: str
|
||||
Chapter title
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
clean/remove headings & add span with id
|
||||
|
||||
"""
|
||||
title_of_chapter = title_of_chapter.lower()
|
||||
if title_of_chapter == "chapter 1":
|
||||
pass
|
||||
for tag in chapter_tag.contents:
|
||||
text = tag if isinstance(tag, NavigableString) else tag.text
|
||||
tag: PageElement
|
||||
text: str = tag if isinstance(tag, NavigableString) else tag.text
|
||||
if re.sub(r"[\s\xa0]", "", text):
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
@@ -333,7 +354,7 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with processed tables
|
||||
|
||||
"""
|
||||
@@ -370,7 +391,7 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag without original classes of the book
|
||||
|
||||
"""
|
||||
@@ -413,9 +434,9 @@ class HtmlEpubPreprocessor:
|
||||
# 2.
|
||||
self._wrap_strings_with_p(content_tag)
|
||||
# 3-6.
|
||||
for dict in self.preset:
|
||||
func = self.name2function[dict["preset_name"]]
|
||||
func(content_tag, dict['rules'])
|
||||
for rule in self.preset:
|
||||
func = self.name2function[rule["preset_name"]]
|
||||
func(content_tag, rule['rules'])
|
||||
# 7.
|
||||
if remove_title_from_chapter:
|
||||
self._remove_headings_content(content_tag, title_str)
|
||||
|
||||
Reference in New Issue
Block a user