From ea4dd77155131c5dcf75e92d251c96ece8cd507f Mon Sep 17 00:00:00 2001
From: Kiryl <kiryl.miatselitsa@teqniksoft.com>
Date: Wed, 20 Jul 2022 15:45:44 +0300
Subject: [PATCH] Add attr replacer & svg -> img

---
 src/epub_converter/epub_converter.py      | 16 ++++++------
 src/epub_converter/html_epub_processor.py | 31 ++++++++++++++++++++---
 src/epub_converter/image_processing.py    |  6 ++---
 3 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py
index 7e8ab8a..f2c3232 100644
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -589,13 +589,6 @@ class EpubConverter:
         content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
             if nav_point.id else self.html_href2html_body_soup[nav_point.href]
 
-        self.book_image_src_path2aws_path = update_images_src_links(content,
-                                                                    self.img_href2img_bytes,
-                                                                    path_to_html=nav_point.href,
-                                                                    access=self.access,
-                                                                    path2aws_path=self.book_image_src_path2aws_path,
-                                                                    book_id=Path(self.file_path).stem)
-
         indent = " " * lvl
         self.logger.log(indent + f"Chapter: {title} is processing.")
         is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
@@ -604,6 +597,13 @@ class EpubConverter:
         self.logger.log(indent + "Process content.")
         content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
                                                                    remove_title_from_chapter=is_chapter)
+
+        self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
+                                                                    self.img_href2img_bytes,
+                                                                    path_to_html=nav_point.href,
+                                                                    access=self.access,
+                                                                    path2aws_path=self.book_image_src_path2aws_path,
+                                                                    book_id=Path(self.file_path).stem)
         sub_nodes = []
         # warning! not EpubHtmlItems won't be added to chapter
         # if it doesn't have subchapters
@@ -612,7 +612,7 @@ class EpubConverter:
                 sub_chapter_item = self.html_node_to_livecarta_chapter_item(
                     sub_node, lvl + 1)
                 sub_nodes.append(sub_chapter_item)
-        return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
+        return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
 
     def convert_to_dict(self) -> dict:
         """Function which convert list of html nodes to appropriate json structure"""
diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py
index 752c4ac..0df4908 100644
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -11,6 +11,7 @@ class HtmlEpubPreprocessor:
         self.name2function = {
             "table_wrapper": self._wrap_tags_with_table,
             "replacer": self._tags_to_correspond_livecarta_tag,
+            "attr_replacer": self._replace_attrs_in_tags,
             "unwrapper": self._unwrap_tags,
             "inserter": self._insert_tags_into_correspond_tags
         }
@@ -190,6 +191,30 @@ class HtmlEpubPreprocessor:
                     # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                     tag.name = tag_to_replace
 
+    @staticmethod
+    def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
+        """
+        Function to rename an attribute on every tag matched by a rule
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            dicts with keys "attr", "attr_to_replace" and "condition" -> "tags" (tag name regexes)
+        Returns
+        -------
+        None
+            Chapter Tag is modified in place: value of attr is moved to attr_to_replace
+        """
+        for rule in rules:
+            attr = rule["attr"]
+            tags = rule["condition"]["tags"]
+            attr_to_replace = rule["attr_to_replace"]
+            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+                                                            {attr: re.compile(r".*")}):  # any value: tag only has to carry attr
+                tag[attr_to_replace] = tag[attr]
+                del tag[attr]
+
+
     def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
         """
         Function unwrap tags and moves id to span
@@ -353,7 +378,7 @@ class HtmlEpubPreprocessor:
                     and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
                 del tag.attrs["class"]
 
-    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
+    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
         """
         Function finalise processing/cleaning content
         Parameters
@@ -378,7 +403,7 @@ class HtmlEpubPreprocessor:
 
         Returns
         -------
-        content_tag: str
+        content_tag: Tag
             prepared content
 
         """
@@ -397,4 +422,4 @@ class HtmlEpubPreprocessor:
         self._process_tables(content_tag)
         # 9. remove classes that weren't created by converter
         self._class_removing(content_tag)
-        return str(content_tag)
+        return content_tag
diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py
index be0246e..e568aaa 100644
--- a/src/epub_converter/image_processing.py
+++ b/src/epub_converter/image_processing.py
@@ -27,7 +27,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
 
 
 def update_images_src_links(body_tag: BeautifulSoup,
-                            href2img_content: dict,
+                            img_href2img_content: dict,
                             path_to_html: str,
                             access=None,
                             path2aws_path: dict = None,
@@ -40,10 +40,10 @@ def update_images_src_links(body_tag: BeautifulSoup,
         path_to_img_from_root = os.path.normpath(os.path.join(
             html_folder, path_to_img_from_html)).replace("\\", "/")
 
-        assert path_to_img_from_root in href2img_content, \
+        assert path_to_img_from_root in img_href2img_content, \
             f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
 
-        img_content = href2img_content[path_to_img_from_root]
+        img_content = img_href2img_content[path_to_img_from_root]
         if access is not None:
             if path_to_img_from_root in path2aws_path:
                 new_folder = path2aws_path[path_to_img_from_root]