diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 7e8ab8a..f2c3232 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -589,13 +589,6 @@ class EpubConverter: content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \ if nav_point.id else self.html_href2html_body_soup[nav_point.href] - self.book_image_src_path2aws_path = update_images_src_links(content, - self.img_href2img_bytes, - path_to_html=nav_point.href, - access=self.access, - path2aws_path=self.book_image_src_path2aws_path, - book_id=Path(self.file_path).stem) - indent = " " * lvl self.logger.log(indent + f"Chapter: {title} is processing.") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS @@ -604,6 +597,13 @@ class EpubConverter: self.logger.log(indent + "Process content.") content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content, remove_title_from_chapter=is_chapter) + + self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed, + self.img_href2img_bytes, + path_to_html=nav_point.href, + access=self.access, + path2aws_path=self.book_image_src_path2aws_path, + book_id=Path(self.file_path).stem) sub_nodes = [] # warning! 
not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -612,7 +612,7 @@ class EpubConverter: sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) - return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) + return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes) def convert_to_dict(self) -> dict: """Function which convert list of html nodes to appropriate json structure""" diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index 752c4ac..0df4908 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -11,6 +11,7 @@ class HtmlEpubPreprocessor: self.name2function = { "table_wrapper": self._wrap_tags_with_table, "replacer": self._tags_to_correspond_livecarta_tag, + "attr_replacer": self._replace_attrs_in_tags, "unwrapper": self._unwrap_tags, "inserter": self._insert_tags_into_correspond_tags } @@ -190,6 +191,30 @@ class HtmlEpubPreprocessor: # todo can cause appearance of \n
...
->\n
...
\n
(section) tag.name = tag_to_replace + @staticmethod + def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list): + """ + Function to rename attributes of matching tags according to rules + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with matching attributes renamed in place + + """ + for rule in rules: + attr = rule["attr"] + tags = rule["condition"]["tags"] + attr_to_replace = rule["attr_to_replace"] + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr: re.compile(r".*")}): + tag[attr_to_replace] = tag[attr] + del tag[attr] + def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict): """ Function unwrap tags and moves id to span @@ -353,7 +378,7 @@ class HtmlEpubPreprocessor: and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): del tag.attrs["class"] - def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: + def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: """ Function finalise processing/cleaning content Parameters @@ -378,7 +403,7 @@ class HtmlEpubPreprocessor: Returns ------- - content_tag: str + content_tag: Tag prepared content """ @@ -397,4 +422,4 @@ class HtmlEpubPreprocessor: self._process_tables(content_tag) # 9. 
remove classes that weren't created by converter self._class_removing(content_tag) - return str(content_tag) + return content_tag diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index be0246e..e568aaa 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -27,7 +27,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, + img_href2img_content: dict, path_to_html: str, access=None, path2aws_path: dict = None, @@ -40,10 +40,10 @@ def update_images_src_links(body_tag: BeautifulSoup, path_to_img_from_root = os.path.normpath(os.path.join( html_folder, path_to_img_from_html)).replace("\\", "/") - assert path_to_img_from_root in href2img_content, \ + assert path_to_img_from_root in img_href2img_content, \ f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest." - img_content = href2img_content[path_to_img_from_root] + img_content = img_href2img_content[path_to_img_from_root] if access is not None: if path_to_img_from_root in path2aws_path: new_folder = path2aws_path[path_to_img_from_root]