forked from LiveCarta/BookConverter
Annotations in Epub converter
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import re
|
||||
import cssutils
|
||||
from typing import Tuple, Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from os.path import dirname, normpath, join
|
||||
|
||||
@@ -41,13 +42,13 @@ class CSSPreprocessor:
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def get_text_color(x):
|
||||
def get_text_color(x: str) -> str:
|
||||
color = str2hex(x)
|
||||
color = color if color not in ["#000000", "#000", "black"] else ""
|
||||
return color
|
||||
|
||||
@staticmethod
|
||||
def get_bg_color(x):
|
||||
def get_bg_color(x: str) -> str:
|
||||
color = str2hex(x)
|
||||
color = color if color not in ["#ffffff", "#fff", "white"] else ""
|
||||
return color
|
||||
@@ -114,7 +115,7 @@ class CSSPreprocessor:
|
||||
return cleaned_value
|
||||
|
||||
@staticmethod
|
||||
def style_conditions(style_value: str, style_name: str) -> tuple[bool, bool]:
|
||||
def style_conditions(style_value: str, style_name: str) -> Tuple[bool, bool]:
|
||||
constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
|
||||
style_name)
|
||||
value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[
|
||||
@@ -156,7 +157,7 @@ class CSSPreprocessor:
|
||||
style = "; ".join(split_style)
|
||||
return style
|
||||
|
||||
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: dict):
|
||||
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: Dict[str, BeautifulSoup]):
|
||||
"""This function is designed to convert inline html styles"""
|
||||
for html_href in html_href2html_body_soup:
|
||||
html_content: BeautifulSoup = html_href2html_body_soup[html_href]
|
||||
@@ -169,7 +170,7 @@ class CSSPreprocessor:
|
||||
self.build_inline_style_content(inline_style)
|
||||
|
||||
@staticmethod
|
||||
def get_css_content(css_href, html_href, ebooklib_book):
|
||||
def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str:
|
||||
path_to_css_from_html = css_href
|
||||
html_folder = dirname(html_href)
|
||||
path_to_css_from_root = normpath(
|
||||
|
||||
@@ -9,8 +9,8 @@ from pathlib import Path
|
||||
from itertools import chain
|
||||
from premailer import transform
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Union, List
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from typing import List, Tuple, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.epub_converter.css_processor import CSSPreprocessor
|
||||
@@ -39,7 +39,8 @@ class EpubConverter:
|
||||
|
||||
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||
# key = -1 for top level NavPoints
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
||||
self.adjacency_list: Dict[Union[NavPoint, -1],
|
||||
Union[List[NavPoint], None]] = {}
|
||||
|
||||
# list to offset Chapter_i on 1st level
|
||||
self.offset_sub_nodes = []
|
||||
@@ -70,7 +71,8 @@ class EpubConverter:
|
||||
BeautifulSoup] = self.build_href2soup_content()
|
||||
|
||||
self.logger.log("CSS inline style processing.")
|
||||
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
|
||||
self.css_processor.process_inline_styles_in_html_soup(
|
||||
self.html_href2html_body_soup)
|
||||
self.logger.log("CSS files processing.")
|
||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||
self.logger.log("CSS styles fusion(inline+file).")
|
||||
@@ -107,7 +109,6 @@ class EpubConverter:
|
||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||
# using EpubElements
|
||||
# for now just for HTML objects, as it is the simplest chapter
|
||||
|
||||
nodes = dict()
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
html_body_text = item.get_body_content()
|
||||
@@ -116,7 +117,7 @@ class EpubConverter:
|
||||
nodes[item.file_name] = soup
|
||||
return nodes
|
||||
|
||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||
def build_html_and_css_relations(self) -> Tuple[Dict[str, List[str]], Dict[str, str]]:
|
||||
"""
|
||||
Function is designed to get 2 dictionaries:
|
||||
The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them
|
||||
@@ -130,8 +131,8 @@ class EpubConverter:
|
||||
|
||||
"""
|
||||
# dictionary: href of html to related css files
|
||||
html_href2css_href: defaultdict = defaultdict(list)
|
||||
css_href2css_content: dict = {}
|
||||
html_href2css_href: Dict[str, List[str]] = defaultdict(list)
|
||||
css_href2css_content: Dict[str, str] = {}
|
||||
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
html_content = item.content
|
||||
@@ -213,7 +214,9 @@ class EpubConverter:
|
||||
html_content, css)
|
||||
self.html_href2html_body_soup[html_href] = html_content
|
||||
|
||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||
def build_adjacency_list_from_toc(self,
|
||||
element: Union[Link, Tuple[Section, List], List[Union[Link, Tuple]]],
|
||||
lvl: int = 0) -> NavPoint:
|
||||
"""
|
||||
Function
|
||||
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
|
||||
@@ -304,7 +307,7 @@ class EpubConverter:
|
||||
self.adjacency_list[-1].append(nav_point)
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
|
||||
def add_not_added_files_to_adjacency_list(self, not_added: list):
|
||||
def add_not_added_files_to_adjacency_list(self, not_added: List[str]):
|
||||
"""Function add files that not added to adjacency list"""
|
||||
for i, file in enumerate(not_added):
|
||||
nav_point = NavPoint(
|
||||
@@ -315,7 +318,7 @@ class EpubConverter:
|
||||
def label_subchapters_with_lc_tag(self):
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
ids, soup = self.html_href2subchapters_ids[html_href], \
|
||||
self.html_href2html_body_soup[html_href]
|
||||
self.html_href2html_body_soup[html_href]
|
||||
for i in ids:
|
||||
tag = soup.find(id=i)
|
||||
tmp_tag = soup.new_tag("lc_tmp")
|
||||
@@ -345,10 +348,13 @@ class EpubConverter:
|
||||
mark.parent.unwrap()
|
||||
|
||||
@staticmethod
|
||||
def create_unique_id(href, id_):
|
||||
def create_unique_id(href: str, id_: str) -> str:
|
||||
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
|
||||
|
||||
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
|
||||
def match_href_to_path_from_toc(self,
|
||||
cur_file_path: str,
|
||||
href_in_link: str,
|
||||
internal_link_tag: Tag) -> Union[None, str]:
|
||||
"""
|
||||
Function used to find full path to file that is parsed from tag link
|
||||
TOC: a/b/c.xhtml
|
||||
@@ -387,7 +393,7 @@ class EpubConverter:
|
||||
return full_path[0]
|
||||
|
||||
@staticmethod
|
||||
def create_new_anchor_span(soup, id_):
|
||||
def create_new_anchor_span(soup: BeautifulSoup, id_: str) -> Tag:
|
||||
new_anchor_span = soup.new_tag("span")
|
||||
new_anchor_span.attrs["id"] = id_
|
||||
new_anchor_span.attrs["class"] = "link-anchor"
|
||||
@@ -415,7 +421,8 @@ class EpubConverter:
|
||||
for toc_href in self.hrefs_added_to_toc:
|
||||
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
|
||||
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
|
||||
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
|
||||
new_id = self.create_unique_id(
|
||||
toc_href, tag.attrs["id"])
|
||||
tag.attrs["id"] = new_id
|
||||
|
||||
def process_file_anchor():
|
||||
@@ -427,11 +434,13 @@ class EpubConverter:
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
||||
toc_href, a_tag_href, internal_link_tag)
|
||||
if a_tag_href_matched_to_toc:
|
||||
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
|
||||
new_id = self.create_unique_id(
|
||||
a_tag_href_matched_to_toc, "")
|
||||
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
||||
if new_id not in self.internal_anchors:
|
||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||
new_anchor_span = self.create_new_anchor_span(
|
||||
soup, new_id)
|
||||
# insert a new span to the beginning of the file
|
||||
anchor_soup.insert(0, new_anchor_span)
|
||||
self.internal_anchors.add(new_id)
|
||||
@@ -442,7 +451,8 @@ class EpubConverter:
|
||||
soup = self.html_href2html_body_soup[toc_href]
|
||||
# process_file_element_anchor
|
||||
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
|
||||
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
|
||||
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
|
||||
"#")
|
||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
||||
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
|
||||
else path.normpath(toc_href).replace("\\", "/")
|
||||
@@ -452,7 +462,8 @@ class EpubConverter:
|
||||
|
||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
|
||||
anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
|
||||
anchor_soup.find_all(
|
||||
attrs={"id": a_tag_id}) # if link is a footnote
|
||||
if anchor_tags:
|
||||
if len(anchor_tags) > 1:
|
||||
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
|
||||
@@ -487,7 +498,9 @@ class EpubConverter:
|
||||
process_file_element_anchor()
|
||||
|
||||
@staticmethod
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
def get_tags_between_chapter_marks(first_id: str,
|
||||
href: str,
|
||||
html_soup: BeautifulSoup) -> List[Union[Tag, NavigableString]]:
|
||||
"""
|
||||
Get tags between LiveCarta chapter marks
|
||||
Parameters
|
||||
@@ -568,7 +581,7 @@ class EpubConverter:
|
||||
for tl_nav_point in top_level_nav_points:
|
||||
self.detect_one_chapter(tl_nav_point)
|
||||
|
||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl: int = 1) -> ChapterItem:
|
||||
"""
|
||||
Function prepare style, tags to json structure
|
||||
Parameters
|
||||
@@ -584,18 +597,18 @@ class EpubConverter:
|
||||
built chapter
|
||||
|
||||
"""
|
||||
title = nav_point.title
|
||||
title: str = nav_point.title
|
||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
|
||||
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
|
||||
|
||||
indent = " " * lvl
|
||||
indent: str = " " * lvl
|
||||
self.logger.log(indent + f"Chapter: {title} is processing.")
|
||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
self.logger.log(indent + "Process title.")
|
||||
title_preprocessed = self.html_processor.prepare_title(title)
|
||||
title_preprocessed: str = self.html_processor.prepare_title(title)
|
||||
self.logger.log(indent + "Process content.")
|
||||
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content(
|
||||
title_preprocessed, content, remove_title_from_chapter=is_chapter)
|
||||
|
||||
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
|
||||
self.img_href2img_bytes,
|
||||
@@ -613,7 +626,7 @@ class EpubConverter:
|
||||
sub_nodes.append(sub_chapter_item)
|
||||
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
|
||||
|
||||
def convert_to_dict(self) -> dict:
|
||||
def convert_to_dict(self) -> Dict[str, List[Dict[str, Union[List, str]]]]:
|
||||
"""Function which convert list of html nodes to appropriate json structure"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
top_level_chapters = []
|
||||
@@ -633,7 +646,7 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../books/epub/9780763774134.epub"
|
||||
epub_file_path = "../../books/epub/9781119646044.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from src.epub_converter.epub_converter import EpubConverter
|
||||
class EpubBook(BookSolver):
|
||||
"""Class of .epub type book - child of BookSolver"""
|
||||
|
||||
def __init__(self, book_id=0, access=None, main_logger=None):
|
||||
def __init__(self, book_id: int = 0, access=None, main_logger=None):
|
||||
super().__init__(book_id, access, main_logger)
|
||||
self.book_type = "epub"
|
||||
|
||||
@@ -28,7 +28,8 @@ class EpubBook(BookSolver):
|
||||
|
||||
"""
|
||||
css_processor = CSSPreprocessor()
|
||||
html_processor = HtmlEpubPreprocessor(self.preset_path, logger=self.logger_object)
|
||||
html_processor = HtmlEpubPreprocessor(
|
||||
self.preset_path, logger=self.logger_object)
|
||||
json_converter = EpubConverter(
|
||||
self.book_path, access=self.access, logger=self.logger_object,
|
||||
css_processor=css_processor, html_processor=html_processor)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import re
|
||||
from typing import Tuple
|
||||
from typing import List, Tuple
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
|
||||
@@ -16,8 +16,8 @@ def _replace_with_livecarta_anchor_tag(anchor, i):
|
||||
return new_tag
|
||||
|
||||
|
||||
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \
|
||||
-> Tuple[list, list, list]:
|
||||
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name: str = "epub:type") \
|
||||
-> Tuple[List, List, List]:
|
||||
"""
|
||||
This function preprocessing footnotes
|
||||
This function should be earlier that adding fonts in pipeline.
|
||||
@@ -87,5 +87,4 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
noteref.attrs["data-id"] = i + 1
|
||||
noteref.attrs["id"] = f"footnote-{i + 1}"
|
||||
footnote.attrs["href"] = f"#footnote-{i + 1}"
|
||||
|
||||
return footnotes, new_noterefs_tags, new_footnotes_tags
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
import re
|
||||
import json
|
||||
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
|
||||
from typing import List, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||
from bs4.element import PageElement
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
|
||||
|
||||
class HtmlEpubPreprocessor:
|
||||
def __init__(self, preset_path="../../presets/presets.json", logger=None):
|
||||
def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
|
||||
self.preset = json.load(open(preset_path))
|
||||
self.logger: BookLogger = logger
|
||||
self.logger = logger
|
||||
self.name2function = {
|
||||
"table_wrapper": self._wrap_tags_with_table,
|
||||
"replacer": self._tags_to_correspond_livecarta_tag,
|
||||
@@ -18,33 +20,37 @@ class HtmlEpubPreprocessor:
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
|
||||
chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function adds span with id from tag_to_be_removed
|
||||
because this tag will be removed(unwrapped/extract)
|
||||
Parameters
|
||||
----------
|
||||
tag_to_be_removed: Soup object
|
||||
tag_to_be_removed: Union[PageElement, BeautifulSoup]
|
||||
|
||||
chapter_tag: BeautifulSoup
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
updated body tag
|
||||
|
||||
"""
|
||||
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
|
||||
class_: list):
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
|
||||
tag_to_be_removed: Tag,
|
||||
id_: str,
|
||||
class_: Union[List[str], str]):
|
||||
"""Function inserts span before tag aren't supported by LiveCarta"""
|
||||
new_tag = chapter_tag.new_tag("span")
|
||||
new_tag: Tag = chapter_tag.new_tag("span")
|
||||
new_tag.attrs["id"] = id_ or ""
|
||||
new_tag.attrs["class"] = class_ or ""
|
||||
new_tag.string = "\xa0"
|
||||
tag_to_be_removed.insert_before(new_tag)
|
||||
|
||||
if tag_to_be_removed.attrs.get("id"):
|
||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
|
||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
|
||||
tag_to_be_removed=tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs["id"],
|
||||
class_=tag_to_be_removed.attrs.get("class"))
|
||||
|
||||
@@ -78,7 +84,7 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag without comments
|
||||
|
||||
"""
|
||||
@@ -110,27 +116,32 @@ class HtmlEpubPreprocessor:
|
||||
p_tag.append(str(node))
|
||||
node.replace_with(p_tag)
|
||||
|
||||
def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
|
||||
def _wrap_tags_with_table(self,
|
||||
chapter_tag: BeautifulSoup,
|
||||
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
|
||||
"""
|
||||
Function wraps <tag> with <table>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, List[str, Dict[str, str]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with wrapped certain tags with <table>
|
||||
|
||||
"""
|
||||
|
||||
def _wrap_tag_with_table(width="100", border="", bg_color=None):
|
||||
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
|
||||
table = chapter_tag.new_tag("table")
|
||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||
= border, "center", f"width:{width}%;"
|
||||
tbody, tr, td = \
|
||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
|
||||
"tr"), chapter_tag.new_tag("td")
|
||||
td.attrs["bgcolor"] = bg_color
|
||||
tag_to_wrap.wrap(td)
|
||||
td.wrap(tr)
|
||||
@@ -141,8 +152,10 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
def process_tag_using_table():
|
||||
_wrap_tag_with_table(
|
||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
|
||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
|
||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
|
||||
"width") else "100",
|
||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
|
||||
"border") else None,
|
||||
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
||||
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
||||
tag_to_wrap.unwrap()
|
||||
@@ -155,23 +168,26 @@ class HtmlEpubPreprocessor:
|
||||
process_tag_using_table()
|
||||
|
||||
@staticmethod
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
|
||||
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]):
|
||||
"""
|
||||
Function to replace all tags to correspond LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
"""
|
||||
for rule in rules:
|
||||
tags = rule["tags"]
|
||||
tag_to_replace = rule["tag_to_replace"]
|
||||
tags: List[str] = rule["tags"]
|
||||
tag_to_replace: str = rule["tag_to_replace"]
|
||||
if rule["condition"]:
|
||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||
if condition_on_tag[0] == 'parent_tags':
|
||||
@@ -193,40 +209,44 @@ class HtmlEpubPreprocessor:
|
||||
tag.name = tag_to_replace
|
||||
|
||||
@staticmethod
|
||||
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
|
||||
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
|
||||
"""
|
||||
Function to replace all tags to correspond LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
"""
|
||||
for rule in rules:
|
||||
attr = rule["attr"]
|
||||
tags = rule["condition"]["tags"]
|
||||
tags: List[str] = rule["condition"]["tags"]
|
||||
attr_to_replace = rule["attr_to_replace"]
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||
{attr: re.compile(r".*")}):
|
||||
{attr: re.compile(r".*")}):
|
||||
tag[attr_to_replace] = tag[attr]
|
||||
del tag[attr]
|
||||
|
||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
|
||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
|
||||
"""
|
||||
Function unwrap tags and moves id to span
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: Dict[str, List[str]]
|
||||
dict of tags to unwrap
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with unwrapped certain tags
|
||||
|
||||
"""
|
||||
@@ -239,21 +259,23 @@ class HtmlEpubPreprocessor:
|
||||
tag.unwrap()
|
||||
|
||||
@staticmethod
|
||||
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
|
||||
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]):
|
||||
"""
|
||||
Function inserts tags into correspond tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
|
||||
list of conditions when fire function
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with inserted tags
|
||||
|
||||
"""
|
||||
def insert(tag):
|
||||
def insert(tag: Tag):
|
||||
tag_to_insert = \
|
||||
chapter_tag.new_tag(rule["tag_to_insert"])
|
||||
# insert all items that was in tag to subtag and remove from tag
|
||||
@@ -263,7 +285,7 @@ class HtmlEpubPreprocessor:
|
||||
tag.append(tag_to_insert)
|
||||
|
||||
for rule in rules:
|
||||
tags = rule["tags"]
|
||||
tags: List[str] = rule["tags"]
|
||||
if rule["condition"]:
|
||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||
if condition_on_tag[0] == 'parent_tags':
|
||||
@@ -283,29 +305,28 @@ class HtmlEpubPreprocessor:
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
insert(tag)
|
||||
|
||||
def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
|
||||
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
||||
"""
|
||||
Function
|
||||
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
||||
- adds span with id in order to
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: soup object
|
||||
chapter_tag: Union[BeautifulSoup, PageElement]
|
||||
Tag of the page
|
||||
title_of_chapter: str
|
||||
Chapter title
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
clean/remove headings & add span with id
|
||||
|
||||
"""
|
||||
title_of_chapter = title_of_chapter.lower()
|
||||
if title_of_chapter == "chapter 1":
|
||||
pass
|
||||
for tag in chapter_tag.contents:
|
||||
text = tag if isinstance(tag, NavigableString) else tag.text
|
||||
tag: PageElement
|
||||
text: str = tag if isinstance(tag, NavigableString) else tag.text
|
||||
if re.sub(r"[\s\xa0]", "", text):
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
@@ -333,7 +354,7 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag with processed tables
|
||||
|
||||
"""
|
||||
@@ -370,7 +391,7 @@ class HtmlEpubPreprocessor:
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
NoReturn
|
||||
Chapter Tag without original classes of the book
|
||||
|
||||
"""
|
||||
@@ -413,9 +434,9 @@ class HtmlEpubPreprocessor:
|
||||
# 2.
|
||||
self._wrap_strings_with_p(content_tag)
|
||||
# 3-6.
|
||||
for dict in self.preset:
|
||||
func = self.name2function[dict["preset_name"]]
|
||||
func(content_tag, dict['rules'])
|
||||
for rule in self.preset:
|
||||
func = self.name2function[rule["preset_name"]]
|
||||
func(content_tag, rule['rules'])
|
||||
# 7.
|
||||
if remove_title_from_chapter:
|
||||
self._remove_headings_content(content_tag, title_str)
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.access import Access
|
||||
|
||||
|
||||
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
|
||||
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str) -> str:
|
||||
"""Function saves all images to Amazon web service"""
|
||||
link_path = access.send_image(
|
||||
link_path: str = access.send_image(
|
||||
img_file_path, doc_id=book_id, img_content=img_content)
|
||||
return link_path
|
||||
|
||||
@@ -27,11 +28,11 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
|
||||
|
||||
|
||||
def update_images_src_links(body_tag: BeautifulSoup,
|
||||
img_href2img_content: dict,
|
||||
img_href2img_content: Dict[str, bytes],
|
||||
path_to_html: str,
|
||||
access=None,
|
||||
path2aws_path: dict = None,
|
||||
book_id: str = None) -> dict:
|
||||
access: Access = None,
|
||||
path2aws_path: Dict[str, str] = None,
|
||||
book_id: str = None) -> Dict[str, str]:
|
||||
"""Function makes dictionary image_src_path -> Amazon web service_path"""
|
||||
img_tags = body_tag.find_all("img")
|
||||
for img in img_tags:
|
||||
@@ -43,7 +44,7 @@ def update_images_src_links(body_tag: BeautifulSoup,
|
||||
assert path_to_img_from_root in img_href2img_content, \
|
||||
f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
|
||||
|
||||
img_content = img_href2img_content[path_to_img_from_root]
|
||||
img_content: bytes = img_href2img_content[path_to_img_from_root]
|
||||
if access is not None:
|
||||
if path_to_img_from_root in path2aws_path:
|
||||
new_folder = path2aws_path[path_to_img_from_root]
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
import re
|
||||
import cssutils
|
||||
from typing import List
|
||||
|
||||
from logging import CRITICAL
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
@@ -11,13 +10,13 @@ cssutils.log.setLevel(CRITICAL)
|
||||
|
||||
|
||||
class TagInlineStyleProcessor:
|
||||
def __init__(self, tag_inline_style):
|
||||
def __init__(self, tag_inline_style: Tag):
|
||||
# tag with inline style + style parsed from css file
|
||||
self.tag_inline_style = tag_inline_style
|
||||
self.tag_inline_style.attrs['style'] = self.process_inline_style()
|
||||
self.tag_inline_style.attrs['style']: str = self.process_inline_style()
|
||||
|
||||
@staticmethod
|
||||
def remove_white_if_no_bgcolor(style_, tag):
|
||||
def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
|
||||
"""Function remove text white color if there is no bg color"""
|
||||
if "background" in style_:
|
||||
style_ = style_.replace(
|
||||
@@ -62,13 +61,13 @@ class TagInlineStyleProcessor:
|
||||
# return split_style
|
||||
|
||||
@staticmethod
|
||||
def indents_processing(split_style: list) -> str:
|
||||
def indents_processing(split_style: List[str]) -> str:
|
||||
"""
|
||||
Function process indents from left using
|
||||
formula_of_indent: indent = abs(margin - text_indent)
|
||||
Parameters
|
||||
----------
|
||||
split_style: list
|
||||
split_style: List[str]
|
||||
list of styles split by ";"
|
||||
|
||||
Returns
|
||||
@@ -111,7 +110,7 @@ class TagInlineStyleProcessor:
|
||||
return processed_style
|
||||
return processed_style
|
||||
|
||||
def process_inline_style(self):
|
||||
def process_inline_style(self) -> str:
|
||||
"""
|
||||
Function processes final(css+initial inline) inline style
|
||||
Steps
|
||||
@@ -180,7 +179,7 @@ class TagInlineStyleProcessor:
|
||||
self.tag_inline_style.append(correspond_tag)
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_tag_to_save_style_attrs(initial_tag):
|
||||
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag):
|
||||
"""Function designed to save style attrs that cannot be in tag.name -> span"""
|
||||
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
|
||||
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
|
||||
@@ -212,7 +211,7 @@ class TagInlineStyleProcessor:
|
||||
initial_tag.attrs["style"] = span_style
|
||||
initial_tag.wrap(tag)
|
||||
|
||||
def convert_initial_tag(self):
|
||||
def convert_initial_tag(self) -> Tag:
|
||||
self.change_attrs_with_corresponding_tags()
|
||||
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
||||
return self.tag_inline_style
|
||||
|
||||
Reference in New Issue
Block a user