From f4e8eed13c0037c44615211c201ca984f2bc0b00 Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Tue, 22 Jun 2021 13:58:21 +0300
Subject: [PATCH] epub converter: prettify span insertion that preserves ids for links

---
 src/html_epub_preprocessor.py | 37 +++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)
diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py
index 6915b41..ab34ba1 100644
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -119,16 +119,28 @@ def _process_lists(body_tag):
             il_tag.p.unwrap()
 
 
-def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
-    for sub_tag in tag_to_be_removed.find_all():
-        if sub_tag.attrs.get('id'):
-            new_tag = body_tag.new_tag("span")
-            new_tag.attrs['id'] = sub_tag.attrs['id']
-            new_tag.attrs['class'] = sub_tag.attrs.get('class')
-            tag_to_be_removed.insert_before(new_tag)
+def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):  # insert an empty <span> with the given id/class right before `tag`
+    new_tag = main_tag.new_tag("span")  # main_tag must provide new_tag(); presumably the BeautifulSoup document - TODO confirm
+    new_tag.attrs['id'] = id_ or ''  # a falsy id_ (None, '') is written as an empty string
+    new_tag.attrs['class'] = class_ or ''  # same fallback for class_, so the attribute is never None
+    tag.insert_before(new_tag)  # mutates the tree in place; nothing is returned
 
 
 def clean_headings_content(content: Tag, title: str):
+    def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):  # leave <span> anchors for every id on or inside tag_to_be_removed
+        if tag_to_be_removed.attrs.get('id'):  # the tag's own id first; a missing or empty id is skipped
+            insert_span_with_attrs_before_tag(body_tag,
+                                              tag_to_be_removed,
+                                              id_=tag_to_be_removed.attrs.get('id'),
+                                              class_=tag_to_be_removed.attrs.get('class'))
+
+        for sub_tag in tag_to_be_removed.find_all():  # then each tag returned by find_all() on it
+            if sub_tag.attrs.get('id'):
+                insert_span_with_attrs_before_tag(body_tag,
+                                                  tag_to_be_removed,  # anchor goes before the outer tag, not before sub_tag
+                                                  id_=sub_tag.attrs['id'],
+                                                  class_=sub_tag.attrs.get('class'))
+
     title = title.lower()
     for child in content.contents:
         if isinstance(child, NavigableString):
@@ -249,16 +261,15 @@ def unwrap_structural_tags(body_tag):
 
     def _add_span_to_save_ids_for_links(tag_to_be_removed):
         if tag_to_be_removed.attrs.get('id'):
-            new_tag = body_tag.new_tag("span")
-            new_tag.attrs['id'] = tag_to_be_removed.attrs['id']
-            tag_to_be_removed.insert_before(new_tag)
+            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,  # body_tag comes from the enclosing function
+                                              id_=tag_to_be_removed.attrs['id'],  # truthy here thanks to the guard above
+                                              class_=tag_to_be_removed.attrs.get('class'))  # may be None when the tag has no class
 
     structural_tags_names = [
         'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
         'figure', 'footer', 'iframe', 'span', 'p'
     ]
     # should be before other tags processing, not to remove converter empty tags with id
-    # not all cases, if span has <p>s and NavigableString, it won't unwrap
     for s in body_tag.find_all("span"):
         if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'):
             continue
@@ -266,6 +277,8 @@ def unwrap_structural_tags(body_tag):
             is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
             if all(is_not_struct_tag):
                 continue
+
+        _add_span_to_save_ids_for_links(s)
         s.unwrap()
 
     for div in body_tag.find_all("div"):
@@ -320,7 +333,7 @@ def unwrap_structural_tags(body_tag):
     if not all(parents_marks_are_body):
         for x in marks:
             while x.parent != body_tag:
-                x.parent.unwrap()  # warning! could reflect on formatting/internal links in some cases
+                x.parent.unwrap()  # TODO: warning! may affect formatting/internal links in some cases
 
     parents_marks_are_body = [x.parent == body_tag for x in marks]
     assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'