Annotations in Epub converter

This commit is contained in:
Kiryl
2022-08-03 14:39:13 +03:00
parent 7453029295
commit 78e3ad8911
16 changed files with 259 additions and 192 deletions

View File

@@ -1,5 +1,6 @@
import re
import cssutils
from typing import Tuple, Dict
from bs4 import BeautifulSoup
from os.path import dirname, normpath, join
@@ -41,13 +42,13 @@ class CSSPreprocessor:
}
@staticmethod
def get_text_color(x):
def get_text_color(x: str) -> str:
color = str2hex(x)
color = color if color not in ["#000000", "#000", "black"] else ""
return color
@staticmethod
def get_bg_color(x):
def get_bg_color(x: str) -> str:
color = str2hex(x)
color = color if color not in ["#ffffff", "#fff", "white"] else ""
return color
@@ -114,7 +115,7 @@ class CSSPreprocessor:
return cleaned_value
@staticmethod
def style_conditions(style_value: str, style_name: str) -> tuple[bool, bool]:
def style_conditions(style_value: str, style_name: str) -> Tuple[bool, bool]:
constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
style_name)
value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[
@@ -156,7 +157,7 @@ class CSSPreprocessor:
style = "; ".join(split_style)
return style
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: dict):
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: Dict[str, BeautifulSoup]):
"""This function is designed to convert inline html styles"""
for html_href in html_href2html_body_soup:
html_content: BeautifulSoup = html_href2html_body_soup[html_href]
@@ -169,7 +170,7 @@ class CSSPreprocessor:
self.build_inline_style_content(inline_style)
@staticmethod
def get_css_content(css_href, html_href, ebooklib_book):
def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str:
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(

View File

@@ -9,8 +9,8 @@ from pathlib import Path
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
from bs4 import BeautifulSoup, NavigableString, Tag
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.epub_converter.css_processor import CSSPreprocessor
@@ -39,7 +39,8 @@ class EpubConverter:
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
self.adjacency_list: Dict[Union[NavPoint, -1],
Union[List[NavPoint], None]] = {}
# list to offset Chapter_i on 1st level
self.offset_sub_nodes = []
@@ -70,7 +71,8 @@ class EpubConverter:
BeautifulSoup] = self.build_href2soup_content()
self.logger.log("CSS inline style processing.")
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
self.css_processor.process_inline_styles_in_html_soup(
self.html_href2html_body_soup)
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles fusion(inline+file).")
@@ -107,7 +109,6 @@ class EpubConverter:
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
# for now just for HTML objects, as it is the simplest chapter
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content()
@@ -116,7 +117,7 @@ class EpubConverter:
nodes[item.file_name] = soup
return nodes
def build_html_and_css_relations(self) -> tuple[dict, dict]:
def build_html_and_css_relations(self) -> Tuple[Dict[str, List[str]], Dict[str, str]]:
"""
Function is designed to get 2 dictionaries:
The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them
@@ -130,8 +131,8 @@ class EpubConverter:
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
html_href2css_href: Dict[str, List[str]] = defaultdict(list)
css_href2css_content: Dict[str, str] = {}
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
@@ -213,7 +214,9 @@ class EpubConverter:
html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
def build_adjacency_list_from_toc(self,
element: Union[Link, Tuple[Section, List], List[Union[Link, Tuple]]],
lvl: int = 0) -> NavPoint:
"""
Function
builds self.adjacency_list from the nested TOC structure obtained from self.ebooklib.toc
@@ -304,7 +307,7 @@ class EpubConverter:
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added: list):
def add_not_added_files_to_adjacency_list(self, not_added: List[str]):
"""Function adds files that were not yet added to the adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
@@ -315,7 +318,7 @@ class EpubConverter:
def label_subchapters_with_lc_tag(self):
for html_href in self.html_href2html_body_soup:
ids, soup = self.html_href2subchapters_ids[html_href], \
self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href]
for i in ids:
tag = soup.find(id=i)
tmp_tag = soup.new_tag("lc_tmp")
@@ -345,10 +348,13 @@ class EpubConverter:
mark.parent.unwrap()
@staticmethod
def create_unique_id(href, id_):
def create_unique_id(href: str, id_: str) -> str:
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
def match_href_to_path_from_toc(self,
cur_file_path: str,
href_in_link: str,
internal_link_tag: Tag) -> Union[None, str]:
"""
Function used to find full path to file that is parsed from tag link
TOC: a/b/c.xhtml
@@ -387,7 +393,7 @@ class EpubConverter:
return full_path[0]
@staticmethod
def create_new_anchor_span(soup, id_):
def create_new_anchor_span(soup: BeautifulSoup, id_: str) -> Tag:
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
@@ -415,7 +421,8 @@ class EpubConverter:
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
new_id = self.create_unique_id(
toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
def process_file_anchor():
@@ -427,11 +434,13 @@ class EpubConverter:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
@@ -442,7 +451,8 @@ class EpubConverter:
soup = self.html_href2html_body_soup[toc_href]
# process_file_element_anchor
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
"#")
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
else path.normpath(toc_href).replace("\\", "/")
@@ -452,7 +462,8 @@ class EpubConverter:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
anchor_soup.find_all(
attrs={"id": a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
@@ -487,7 +498,9 @@ class EpubConverter:
process_file_element_anchor()
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
def get_tags_between_chapter_marks(first_id: str,
href: str,
html_soup: BeautifulSoup) -> List[Union[Tag, NavigableString]]:
"""
Get tags between LiveCarta chapter marks
Parameters
@@ -568,7 +581,7 @@ class EpubConverter:
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl: int = 1) -> ChapterItem:
"""
Function prepares style and tags for the json structure
Parameters
@@ -584,18 +597,18 @@ class EpubConverter:
built chapter
"""
title = nav_point.title
title: str = nav_point.title
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
indent = " " * lvl
indent: str = " " * lvl
self.logger.log(indent + f"Chapter: {title} is processing.")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
is_chapter: bool = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
self.logger.log(indent + "Process title.")
title_preprocessed = self.html_processor.prepare_title(title)
title_preprocessed: str = self.html_processor.prepare_title(title)
self.logger.log(indent + "Process content.")
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content(
title_preprocessed, content, remove_title_from_chapter=is_chapter)
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
self.img_href2img_bytes,
@@ -613,7 +626,7 @@ class EpubConverter:
sub_nodes.append(sub_chapter_item)
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
def convert_to_dict(self) -> dict:
def convert_to_dict(self) -> Dict[str, List[Dict[str, Union[List, str]]]]:
"""Function which converts the list of html nodes to the appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -633,7 +646,7 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../books/epub/9780763774134.epub"
epub_file_path = "../../books/epub/9781119646044.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])

View File

@@ -7,7 +7,7 @@ from src.epub_converter.epub_converter import EpubConverter
class EpubBook(BookSolver):
"""Class of .epub type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, main_logger=None):
def __init__(self, book_id: int = 0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger)
self.book_type = "epub"
@@ -28,7 +28,8 @@ class EpubBook(BookSolver):
"""
css_processor = CSSPreprocessor()
html_processor = HtmlEpubPreprocessor(self.preset_path, logger=self.logger_object)
html_processor = HtmlEpubPreprocessor(
self.preset_path, logger=self.logger_object)
json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object,
css_processor=css_processor, html_processor=html_processor)

View File

@@ -1,5 +1,5 @@
import re
from typing import Tuple
from typing import List, Tuple
from bs4 import BeautifulSoup, Tag
@@ -16,8 +16,8 @@ def _replace_with_livecarta_anchor_tag(anchor, i):
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \
-> Tuple[list, list, list]:
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name: str = "epub:type") \
-> Tuple[List, List, List]:
"""
This function preprocesses footnotes.
It should run earlier than adding fonts in the pipeline.
@@ -87,5 +87,4 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
return footnotes, new_noterefs_tags, new_footnotes_tags

View File

@@ -1,14 +1,16 @@
import re
import json
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from typing import List, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import PageElement
from src.util.helpers import BookLogger
class HtmlEpubPreprocessor:
def __init__(self, preset_path="../../presets/presets.json", logger=None):
def __init__(self, preset_path: str = "../../presets/presets.json", logger: BookLogger = None):
self.preset = json.load(open(preset_path))
self.logger: BookLogger = logger
self.logger = logger
self.name2function = {
"table_wrapper": self._wrap_tags_with_table,
"replacer": self._tags_to_correspond_livecarta_tag,
@@ -18,33 +20,37 @@ class HtmlEpubPreprocessor:
}
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
tag_to_be_removed: Union[PageElement, BeautifulSoup]
chapter_tag: BeautifulSoup
Returns
-------
None
None
updated body tag
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
class_: list):
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
tag_to_be_removed: Tag,
id_: str,
class_: Union[List[str], str]):
"""Function inserts a span before a tag that isn't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag: Tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
@@ -78,7 +84,7 @@ class HtmlEpubPreprocessor:
Returns
-------
None
None
Chapter Tag without comments
"""
@@ -110,27 +116,32 @@ class HtmlEpubPreprocessor:
p_tag.append(str(node))
node.replace_with(p_tag)
def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
def _wrap_tags_with_table(self,
chapter_tag: BeautifulSoup,
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]):
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, List[Union[str, Dict[str, str]]]]]
list of conditions when fire function
Returns
-------
None
None
Chapter Tag with wrapped certain tags with <table>
"""
def _wrap_tag_with_table(width="100", border="", bg_color=None):
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
chapter_tag.new_tag("tbody"), chapter_tag.new_tag(
"tr"), chapter_tag.new_tag("td")
td.attrs["bgcolor"] = bg_color
tag_to_wrap.wrap(td)
td.wrap(tr)
@@ -141,8 +152,10 @@ class HtmlEpubPreprocessor:
def process_tag_using_table():
_wrap_tag_with_table(
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get(
"width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get(
"border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap()
@@ -155,23 +168,26 @@ class HtmlEpubPreprocessor:
process_tag_using_table()
@staticmethod
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]):
"""
Function to replace all tags to correspond LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[List[str], str, int, Dict[str, Union[str, int]]]]]
list of conditions when fire function
Returns
-------
None
None
Chapter Tag with all tags replaced with LiveCarta tags
"""
for rule in rules:
tags = rule["tags"]
tag_to_replace = rule["tag_to_replace"]
tags: List[str] = rule["tags"]
tag_to_replace: str = rule["tag_to_replace"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
@@ -193,40 +209,44 @@ class HtmlEpubPreprocessor:
tag.name = tag_to_replace
@staticmethod
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]):
"""
Function to rename attributes of matching tags (attr -> attr_to_replace)
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[str, Dict[str, List[str]]]]]
list of conditions when fire function
Returns
-------
None
None
Chapter Tag with the configured attributes renamed
"""
for rule in rules:
attr = rule["attr"]
tags = rule["condition"]["tags"]
tags: List[str] = rule["condition"]["tags"]
attr_to_replace = rule["attr_to_replace"]
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr: re.compile(r".*")}):
{attr: re.compile(r".*")}):
tag[attr_to_replace] = tag[attr]
del tag[attr]
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
"""
Function unwrap tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: Dict[str, List[str]]
dict of tags to unwrap
Returns
-------
None
None
Chapter Tag with unwrapped certain tags
"""
@@ -239,21 +259,23 @@ class HtmlEpubPreprocessor:
tag.unwrap()
@staticmethod
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]):
"""
Function inserts tags into correspond tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[str, int]]]]]
list of conditions when fire function
Returns
-------
None
None
Chapter Tag with inserted tags
"""
def insert(tag):
def insert(tag: Tag):
tag_to_insert = \
chapter_tag.new_tag(rule["tag_to_insert"])
# insert all items that were in tag into subtag and remove them from tag
@@ -263,7 +285,7 @@ class HtmlEpubPreprocessor:
tag.append(tag_to_insert)
for rule in rules:
tags = rule["tags"]
tags: List[str] = rule["tags"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
@@ -283,29 +305,28 @@ class HtmlEpubPreprocessor:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
insert(tag)
def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
"""
Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
chapter_tag: soup object
chapter_tag: Union[BeautifulSoup, PageElement]
Tag of the page
title_of_chapter: str
Chapter title
Returns
-------
None
None
clean/remove headings & add span with id
"""
title_of_chapter = title_of_chapter.lower()
if title_of_chapter == "chapter 1":
pass
for tag in chapter_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
tag: PageElement
text: str = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
@@ -333,7 +354,7 @@ class HtmlEpubPreprocessor:
Returns
-------
None
None
Chapter Tag with processed tables
"""
@@ -370,7 +391,7 @@ class HtmlEpubPreprocessor:
Returns
-------
None
None
Chapter Tag without original classes of the book
"""
@@ -413,9 +434,9 @@ class HtmlEpubPreprocessor:
# 2.
self._wrap_strings_with_p(content_tag)
# 3-6.
for dict in self.preset:
func = self.name2function[dict["preset_name"]]
func(content_tag, dict['rules'])
for rule in self.preset:
func = self.name2function[rule["preset_name"]]
func(content_tag, rule['rules'])
# 7.
if remove_title_from_chapter:
self._remove_headings_content(content_tag, title_str)

View File

@@ -1,13 +1,14 @@
import os
import pathlib
from typing import Dict
from bs4 import BeautifulSoup
from src.access import Access
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str) -> str:
"""Function saves all images to Amazon web service"""
link_path = access.send_image(
link_path: str = access.send_image(
img_file_path, doc_id=book_id, img_content=img_content)
return link_path
@@ -27,11 +28,11 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
def update_images_src_links(body_tag: BeautifulSoup,
img_href2img_content: dict,
img_href2img_content: Dict[str, bytes],
path_to_html: str,
access=None,
path2aws_path: dict = None,
book_id: str = None) -> dict:
access: Access = None,
path2aws_path: Dict[str, str] = None,
book_id: str = None) -> Dict[str, str]:
"""Function makes dictionary image_src_path -> Amazon web service_path"""
img_tags = body_tag.find_all("img")
for img in img_tags:
@@ -43,7 +44,7 @@ def update_images_src_links(body_tag: BeautifulSoup,
assert path_to_img_from_root in img_href2img_content, \
f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
img_content = img_href2img_content[path_to_img_from_root]
img_content: bytes = img_href2img_content[path_to_img_from_root]
if access is not None:
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]

View File

@@ -1,9 +1,8 @@
import re
import cssutils
from typing import List
from logging import CRITICAL
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from src.livecarta_config import LiveCartaConfig
@@ -11,13 +10,13 @@ cssutils.log.setLevel(CRITICAL)
class TagInlineStyleProcessor:
def __init__(self, tag_inline_style):
def __init__(self, tag_inline_style: Tag):
# tag with inline style + style parsed from css file
self.tag_inline_style = tag_inline_style
self.tag_inline_style.attrs['style'] = self.process_inline_style()
self.tag_inline_style.attrs['style']: str = self.process_inline_style()
@staticmethod
def remove_white_if_no_bgcolor(style_, tag):
def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
"""Function removes white text color if there is no bg color"""
if "background" in style_:
style_ = style_.replace(
@@ -62,13 +61,13 @@ class TagInlineStyleProcessor:
# return split_style
@staticmethod
def indents_processing(split_style: list) -> str:
def indents_processing(split_style: List[str]) -> str:
"""
Function process indents from left using
formula_of_indent: indent = abs(margin - text_indent)
Parameters
----------
split_style: list
split_style: List[str]
list of styles split by ";"
Returns
@@ -111,7 +110,7 @@ class TagInlineStyleProcessor:
return processed_style
return processed_style
def process_inline_style(self):
def process_inline_style(self) -> str:
"""
Function processes final(css+initial inline) inline style
Steps
@@ -180,7 +179,7 @@ class TagInlineStyleProcessor:
self.tag_inline_style.append(correspond_tag)
@staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag):
def wrap_span_in_tag_to_save_style_attrs(initial_tag: Tag):
"""Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
@@ -212,7 +211,7 @@ class TagInlineStyleProcessor:
initial_tag.attrs["style"] = span_style
initial_tag.wrap(tag)
def convert_initial_tag(self):
def convert_initial_tag(self) -> Tag:
self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style