epub converter: prettify

2021-06-22 13:58:21 +03:00
parent bd364cd097
commit f4e8eed13c
1 changed files with 25 additions and 12 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -119,16 +119,28 @@ def _process_lists(body_tag):
            il_tag.p.unwrap()


-def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
-    for sub_tag in tag_to_be_removed.find_all():
-        if sub_tag.attrs.get('id'):
-            new_tag = body_tag.new_tag("span")
-            new_tag.attrs['id'] = sub_tag.attrs['id']
-            new_tag.attrs['class'] = sub_tag.attrs.get('class')
-            tag_to_be_removed.insert_before(new_tag)
+def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
+    new_tag = main_tag.new_tag("span")
+    new_tag.attrs['id'] = id_ or ''
+    new_tag.attrs['class'] = class_ or ''
+    tag.insert_before(new_tag)


 def clean_headings_content(content: Tag, title: str):
+    def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
+        if tag_to_be_removed.attrs.get('id'):
+            insert_span_with_attrs_before_tag(body_tag,
+                                              tag_to_be_removed,
+                                              id_=tag_to_be_removed.attrs.get('id'),
+                                              class_=tag_to_be_removed.attrs.get('class'))
+
+        for sub_tag in tag_to_be_removed.find_all():
+            if sub_tag.attrs.get('id'):
+                insert_span_with_attrs_before_tag(body_tag,
+                                                  tag_to_be_removed,
+                                                  id_=sub_tag.attrs['id'],
+                                                  class_=sub_tag.attrs.get('class'))
+
    title = title.lower()
    for child in content.contents:
        if isinstance(child, NavigableString):
@@ -249,16 +261,15 @@ def unwrap_structural_tags(body_tag):

    def _add_span_to_save_ids_for_links(tag_to_be_removed):
        if tag_to_be_removed.attrs.get('id'):
-            new_tag = body_tag.new_tag("span")
-            new_tag.attrs['id'] = tag_to_be_removed.attrs['id']
-            tag_to_be_removed.insert_before(new_tag)
+            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
+                                              id_=tag_to_be_removed.attrs['id'],
+                                              class_=tag_to_be_removed.attrs.get('class'))

    structural_tags_names = [
        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
        'figure', 'footer', 'iframe', 'span', 'p'
    ]
    # should be before other tags processing, not to remove converter empty tags with id
-    # not all cases, if span has <p>s and NavigableString, it won't unwrap
    for s in body_tag.find_all("span"):
        if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'):
            continue
@@ -266,6 +277,8 @@ def unwrap_structural_tags(body_tag):
            is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
            if all(is_not_struct_tag):
                continue
+
+        _add_span_to_save_ids_for_links(s)
        s.unwrap()

    for div in body_tag.find_all("div"):
@@ -320,7 +333,7 @@ def unwrap_structural_tags(body_tag):
    if not all(parents_marks_are_body):
        for x in marks:
            while x.parent != body_tag:
-                x.parent.unwrap()  # warning! could reflect on formatting/internal links in some cases
+                x.parent.unwrap()  # TODO: warning! unwrapping may affect formatting/internal links in some cases

    parents_marks_are_body = [x.parent == body_tag for x in marks]
    assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'