forked from LiveCarta/BookConverter
Add attr replacer & svg -> img
This commit is contained in:
@@ -589,13 +589,6 @@ class EpubConverter:
|
|||||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
|
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
|
||||||
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
|
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
|
||||||
|
|
||||||
self.book_image_src_path2aws_path = update_images_src_links(content,
|
|
||||||
self.img_href2img_bytes,
|
|
||||||
path_to_html=nav_point.href,
|
|
||||||
access=self.access,
|
|
||||||
path2aws_path=self.book_image_src_path2aws_path,
|
|
||||||
book_id=Path(self.file_path).stem)
|
|
||||||
|
|
||||||
indent = " " * lvl
|
indent = " " * lvl
|
||||||
self.logger.log(indent + f"Chapter: {title} is processing.")
|
self.logger.log(indent + f"Chapter: {title} is processing.")
|
||||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
@@ -604,6 +597,13 @@ class EpubConverter:
|
|||||||
self.logger.log(indent + "Process content.")
|
self.logger.log(indent + "Process content.")
|
||||||
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
|
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
|
||||||
remove_title_from_chapter=is_chapter)
|
remove_title_from_chapter=is_chapter)
|
||||||
|
|
||||||
|
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
|
||||||
|
self.img_href2img_bytes,
|
||||||
|
path_to_html=nav_point.href,
|
||||||
|
access=self.access,
|
||||||
|
path2aws_path=self.book_image_src_path2aws_path,
|
||||||
|
book_id=Path(self.file_path).stem)
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
# warning! not EpubHtmlItems won't be added to chapter
|
# warning! not EpubHtmlItems won't be added to chapter
|
||||||
# if it doesn't have subchapters
|
# if it doesn't have subchapters
|
||||||
@@ -612,7 +612,7 @@ class EpubConverter:
|
|||||||
sub_chapter_item = self.html_node_to_livecarta_chapter_item(
|
sub_chapter_item = self.html_node_to_livecarta_chapter_item(
|
||||||
sub_node, lvl + 1)
|
sub_node, lvl + 1)
|
||||||
sub_nodes.append(sub_chapter_item)
|
sub_nodes.append(sub_chapter_item)
|
||||||
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
|
||||||
|
|
||||||
def convert_to_dict(self) -> dict:
|
def convert_to_dict(self) -> dict:
|
||||||
"""Function which convert list of html nodes to appropriate json structure"""
|
"""Function which convert list of html nodes to appropriate json structure"""
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ class HtmlEpubPreprocessor:
|
|||||||
self.name2function = {
|
self.name2function = {
|
||||||
"table_wrapper": self._wrap_tags_with_table,
|
"table_wrapper": self._wrap_tags_with_table,
|
||||||
"replacer": self._tags_to_correspond_livecarta_tag,
|
"replacer": self._tags_to_correspond_livecarta_tag,
|
||||||
|
"attr_replacer": self._replace_attrs_in_tags,
|
||||||
"unwrapper": self._unwrap_tags,
|
"unwrapper": self._unwrap_tags,
|
||||||
"inserter": self._insert_tags_into_correspond_tags
|
"inserter": self._insert_tags_into_correspond_tags
|
||||||
}
|
}
|
||||||
@@ -190,6 +191,30 @@ class HtmlEpubPreprocessor:
|
|||||||
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
|
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
|
||||||
tag.name = tag_to_replace
|
tag.name = tag_to_replace
|
||||||
|
|
||||||
|
@staticmethod
def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list) -> None:
    """
    Function to rename attributes of matching tags according to the rules
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: list
        list of dicts, each of them is read by the keys:
        "attr" - name of the attribute to be renamed,
        "condition" - dict whose "tags" entry is a list of regex patterns for tag names,
        "attr_to_replace" - new name of the attribute

    Returns
    -------
    None
        Chapter Tag is modified in place: value of "attr" is moved to "attr_to_replace"

    """
    for rule in rules:
        attr = rule["attr"]
        tags = rule["condition"]["tags"]
        attr_to_replace = rule["attr_to_replace"]
        if attr == attr_to_replace:
            # renaming an attribute to itself must be a no-op:
            # assigning and then deleting the same key would drop the attribute
            continue
        tag_name_patterns = [re.compile(tag_name) for tag_name in tags]
        # second argument keeps only the tags that actually carry the attribute
        for tag in chapter_tag.find_all(tag_name_patterns, {attr: re.compile(r".*")}):
            tag[attr_to_replace] = tag[attr]
            del tag[attr]
|
||||||
|
|
||||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
|
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
|
||||||
"""
|
"""
|
||||||
Function unwrap tags and moves id to span
|
Function unwrap tags and moves id to span
|
||||||
@@ -353,7 +378,7 @@ class HtmlEpubPreprocessor:
|
|||||||
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
||||||
del tag.attrs["class"]
|
del tag.attrs["class"]
|
||||||
|
|
||||||
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
|
||||||
"""
|
"""
|
||||||
Function finalise processing/cleaning content
|
Function finalise processing/cleaning content
|
||||||
Parameters
|
Parameters
|
||||||
@@ -378,7 +403,7 @@ class HtmlEpubPreprocessor:
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
content_tag: str
|
content_tag: Tag
|
||||||
prepared content
|
prepared content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -397,4 +422,4 @@ class HtmlEpubPreprocessor:
|
|||||||
self._process_tables(content_tag)
|
self._process_tables(content_tag)
|
||||||
# 9. remove classes that weren't created by converter
|
# 9. remove classes that weren't created by converter
|
||||||
self._class_removing(content_tag)
|
self._class_removing(content_tag)
|
||||||
return str(content_tag)
|
return content_tag
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
|
|||||||
|
|
||||||
|
|
||||||
def update_images_src_links(body_tag: BeautifulSoup,
|
def update_images_src_links(body_tag: BeautifulSoup,
|
||||||
href2img_content: dict,
|
img_href2img_content: dict,
|
||||||
path_to_html: str,
|
path_to_html: str,
|
||||||
access=None,
|
access=None,
|
||||||
path2aws_path: dict = None,
|
path2aws_path: dict = None,
|
||||||
@@ -40,10 +40,10 @@ def update_images_src_links(body_tag: BeautifulSoup,
|
|||||||
path_to_img_from_root = os.path.normpath(os.path.join(
|
path_to_img_from_root = os.path.normpath(os.path.join(
|
||||||
html_folder, path_to_img_from_html)).replace("\\", "/")
|
html_folder, path_to_img_from_html)).replace("\\", "/")
|
||||||
|
|
||||||
assert path_to_img_from_root in href2img_content, \
|
assert path_to_img_from_root in img_href2img_content, \
|
||||||
f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
|
f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
|
||||||
|
|
||||||
img_content = href2img_content[path_to_img_from_root]
|
img_content = img_href2img_content[path_to_img_from_root]
|
||||||
if access is not None:
|
if access is not None:
|
||||||
if path_to_img_from_root in path2aws_path:
|
if path_to_img_from_root in path2aws_path:
|
||||||
new_folder = path2aws_path[path_to_img_from_root]
|
new_folder = path2aws_path[path_to_img_from_root]
|
||||||
|
|||||||
Reference in New Issue
Block a user