forked from LiveCarta/BookConverter
epub converter: prettify
This commit is contained in:
@@ -119,16 +119,28 @@ def _process_lists(body_tag):
|
||||
il_tag.p.unwrap()
|
||||
|
||||
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
    """Preserve the ids found inside a tag that is about to be removed.

    For every descendant of ``tag_to_be_removed`` that has a non-empty
    ``id``, an empty ``<span>`` carrying the same ``id`` and ``class`` is
    inserted immediately before ``tag_to_be_removed``.

    :param tag_to_be_removed: tag whose descendants are scanned for ids.
    :param body_tag: object whose ``new_tag`` factory is used to create the
        spans (presumably the BeautifulSoup document; confirm with callers).
    """
    for sub_tag in tag_to_be_removed.find_all():
        sub_id = sub_tag.attrs.get('id')
        if not sub_id:
            continue
        placeholder = body_tag.new_tag("span")
        placeholder.attrs['id'] = sub_id
        # A missing class used to be stored as None instead of a string
        # (bs4 appears to serialise that as a bare ``class`` attribute);
        # fall back to an empty string.
        placeholder.attrs['class'] = sub_tag.attrs.get('class') or ''
        tag_to_be_removed.insert_before(placeholder)
|
||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Insert an empty ``<span>`` directly before ``tag``.

    The span is created through ``main_tag.new_tag`` and always receives
    both an ``id`` and a ``class`` attribute; a falsy ``id_`` or ``class_``
    is stored as an empty string.

    :param main_tag: object providing the ``new_tag`` factory.
    :param tag: tag before which the new span is placed.
    :param id_: value for the span's ``id`` attribute (may be falsy).
    :param class_: value for the span's ``class`` attribute (may be falsy).
    """
    span_attrs = {'id': id_ or '', 'class': class_ or ''}
    marker = main_tag.new_tag("span")
    for attr_name, attr_value in span_attrs.items():
        marker.attrs[attr_name] = attr_value
    tag.insert_before(marker)
|
||||
|
||||
|
||||
def clean_headings_content(content: Tag, title: str):
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
    """Preserve the ids of a tag and its descendants before it is removed.

    For ``tag_to_be_removed`` itself and then for every tag returned by its
    ``find_all()``, a span carrying the same ``id`` and ``class`` is inserted
    before ``tag_to_be_removed`` whenever the ``id`` is non-empty.

    :param tag_to_be_removed: tag that is scanned together with its
        descendants.
    :param body_tag: passed to ``insert_span_with_attrs_before_tag`` as the
        object that creates the new span.
    """
    # The tag itself is handled first, followed by its descendants.
    candidates = [tag_to_be_removed]
    candidates.extend(tag_to_be_removed.find_all())

    for candidate in candidates:
        candidate_id = candidate.attrs.get('id')
        if not candidate_id:
            continue
        insert_span_with_attrs_before_tag(
            body_tag,
            tag_to_be_removed,
            id_=candidate_id,
            class_=candidate.attrs.get('class'),
        )
|
||||
|
||||
title = title.lower()
|
||||
for child in content.contents:
|
||||
if isinstance(child, NavigableString):
|
||||
@@ -249,16 +261,15 @@ def unwrap_structural_tags(body_tag):
|
||||
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed):
    """Insert a span that keeps the id of a tag that is about to be removed.

    Nothing happens when ``tag_to_be_removed`` has no (or an empty) ``id``.
    ``body_tag`` is not a parameter; it is taken from the enclosing scope.

    :param tag_to_be_removed: tag whose ``id`` and ``class`` are copied to a
        new span placed directly before it.
    """
    tag_id = tag_to_be_removed.attrs.get('id')
    if tag_id:
        # Only the shared helper creates the span. Building one by hand as
        # well would insert two spans with the same id.
        insert_span_with_attrs_before_tag(
            main_tag=body_tag,
            tag=tag_to_be_removed,
            id_=tag_id,
            class_=tag_to_be_removed.attrs.get('class'),
        )
|
||||
|
||||
structural_tags_names = [
|
||||
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
|
||||
'figure', 'footer', 'iframe', 'span', 'p'
|
||||
]
|
||||
# this must run before the other tags are processed, so that empty tags carrying an id are not removed
|
||||
# does not cover all cases: if a span contains both <p> tags and NavigableStrings, it won't be unwrapped
|
||||
for s in body_tag.find_all("span"):
|
||||
if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'):
|
||||
continue
|
||||
@@ -266,6 +277,8 @@ def unwrap_structural_tags(body_tag):
|
||||
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
|
||||
if all(is_not_struct_tag):
|
||||
continue
|
||||
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
s.unwrap()
|
||||
|
||||
for div in body_tag.find_all("div"):
|
||||
@@ -320,7 +333,7 @@ def unwrap_structural_tags(body_tag):
|
||||
if not all(parents_marks_are_body):
|
||||
for x in marks:
|
||||
while x.parent != body_tag:
|
||||
x.parent.unwrap() # warning! could reflect on formatting/internal links in some cases
|
||||
x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
|
||||
|
||||
parents_marks_are_body = [x.parent == body_tag for x in marks]
|
||||
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
||||
|
||||
Reference in New Issue
Block a user