From ea4dd77155131c5dcf75e92d251c96ece8cd507f Mon Sep 17 00:00:00 2001
From: Kiryl ...
\n
...
\n
(section) tag.name = tag_to_replace + @staticmethod + def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list): + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags + + """ + for rule in rules: + attr = rule["attr"] + tags = rule["condition"]["tags"] + attr_to_replace = rule["attr_to_replace"] + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr: re.compile(r".*")}): + tag[attr_to_replace] = tag[attr] + del tag[attr] + def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict): """ Function unwrap tags and moves id to span @@ -353,7 +378,7 @@ class HtmlEpubPreprocessor: and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): del tag.attrs["class"] - def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: + def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: """ Function finalise processing/cleaning content Parameters @@ -378,7 +403,7 @@ class HtmlEpubPreprocessor: Returns ------- - content_tag: str + content_tag: Tag prepared content """ @@ -397,4 +422,4 @@ class HtmlEpubPreprocessor: self._process_tables(content_tag) # 9. remove classes that weren't created by converter self._class_removing(content_tag) - return str(content_tag) + return content_tag diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index be0246e..e568aaa 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -27,7 +27,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, + img_href2img_content: dict, path_to_html: str, access=None, path2aws_path: dict = None, @@ -40,10 +40,10 @@ def update_images_src_links(body_tag: BeautifulSoup, path_to_img_from_root = os.path.normpath(os.path.join( html_folder, path_to_img_from_html)).replace("\\", "/") - assert path_to_img_from_root in href2img_content, \ + assert path_to_img_from_root in img_href2img_content, \ f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest." - img_content = href2img_content[path_to_img_from_root] + img_content = img_href2img_content[path_to_img_from_root] if access is not None: if path_to_img_from_root in path2aws_path: new_folder = path2aws_path[path_to_img_from_root]