epub converter: prettify

This commit is contained in:
shirshasa
2021-06-22 13:58:21 +03:00
parent bd364cd097
commit f4e8eed13c

View File

@@ -119,16 +119,28 @@ def _process_lists(body_tag):
il_tag.p.unwrap()
def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
    """Preserve ids found inside ``tag_to_be_removed`` as standalone spans.

    For every tag returned by ``tag_to_be_removed.find_all()`` that has a
    non-empty ``id``, a new ``<span>`` carrying that id and the tag's class is
    inserted immediately before ``tag_to_be_removed``.

    :param tag_to_be_removed: tag whose ``find_all()`` results are scanned for ids.
    :param body_tag: object whose ``new_tag`` is used to create the spans.
    """
    for sub_tag in tag_to_be_removed.find_all():
        sub_tag_id = sub_tag.attrs.get('id')
        if not sub_tag_id:
            continue
        # Build the span through insert_span_with_attrs_before_tag so that a
        # missing class is stored as '' rather than None.
        insert_span_with_attrs_before_tag(body_tag,
                                          tag_to_be_removed,
                                          id_=sub_tag_id,
                                          class_=sub_tag.attrs.get('class'))
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Create a ``<span>`` with the given id/class and place it before ``tag``.

    :param main_tag: object whose ``new_tag`` method creates the span.
    :param tag: the span is inserted directly before this tag.
    :param id_: value for the span's ``id``; a falsy value is stored as ``''``.
    :param class_: value for the span's ``class``; a falsy value is stored as ``''``.
    """
    span = main_tag.new_tag("span")
    # Both attributes are always written; falsy values collapse to ''.
    for attr_name, attr_value in (('id', id_), ('class', class_)):
        span.attrs[attr_name] = attr_value or ''
    tag.insert_before(span)
def clean_headings_content(content: Tag, title: str):
def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
    """Insert id-preserving spans before ``tag_to_be_removed``.

    One span is inserted for ``tag_to_be_removed`` itself and one for each tag
    returned by its ``find_all()``, but only for those that have a non-empty
    ``id``. Each span receives the id and class of the tag it stands for, and
    all of them are placed directly before ``tag_to_be_removed``.

    :param tag_to_be_removed: tag that is scanned, together with its ``find_all()`` results.
    :param body_tag: passed to ``insert_span_with_attrs_before_tag`` as the span factory.
    """
    # The tag itself is handled first, then everything find_all() returned.
    id_candidates = [tag_to_be_removed, *tag_to_be_removed.find_all()]
    for candidate in id_candidates:
        candidate_id = candidate.attrs.get('id')
        if candidate_id:
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
                                              id_=candidate_id,
                                              class_=candidate.attrs.get('class'))
title = title.lower()
for child in content.contents:
if isinstance(child, NavigableString):
@@ -249,16 +261,15 @@ def unwrap_structural_tags(body_tag):
def _add_span_to_save_ids_for_links(tag_to_be_removed):
    """Insert a span carrying the id and class of ``tag_to_be_removed`` before it.

    Nothing is inserted when the tag has no ``id`` or an empty one.
    ``body_tag`` comes from the enclosing ``unwrap_structural_tags`` scope and
    is used to create the span.

    :param tag_to_be_removed: tag whose id/class are copied onto the new span.
    """
    tag_id = tag_to_be_removed.attrs.get('id')
    if not tag_id:
        return
    # Exactly one span per id: building a span by hand and then calling the
    # helper as well would leave two elements with the same id in the document.
    insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                      id_=tag_id,
                                      class_=tag_to_be_removed.attrs.get('class'))
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p'
]
# must run before other tag processing, so as not to remove the converter's empty tags that have an id
# doesn't cover all cases: if a span has <p>s and a NavigableString, it won't be unwrapped
for s in body_tag.find_all("span"):
if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'):
continue
@@ -266,6 +277,8 @@ def unwrap_structural_tags(body_tag):
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if all(is_not_struct_tag):
continue
_add_span_to_save_ids_for_links(s)
s.unwrap()
for div in body_tag.find_all("div"):
@@ -320,7 +333,7 @@ def unwrap_structural_tags(body_tag):
if not all(parents_marks_are_body):
for x in marks:
while x.parent != body_tag:
x.parent.unwrap() # warning! could reflect on formatting/internal links in some cases
x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'