From f4e8eed13c0037c44615211c201ca984f2bc0b00 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Tue, 22 Jun 2021 13:58:21 +0300 Subject: [PATCH] epub converter: prettify --- src/html_epub_preprocessor.py | 37 +++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 6915b41..ab34ba1 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -119,16 +119,28 @@ def _process_lists(body_tag): il_tag.p.unwrap() -def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag): - for sub_tag in tag_to_be_removed.find_all(): - if sub_tag.attrs.get('id'): - new_tag = body_tag.new_tag("span") - new_tag.attrs['id'] = sub_tag.attrs['id'] - new_tag.attrs['class'] = sub_tag.attrs.get('class') - tag_to_be_removed.insert_before(new_tag) +def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): + new_tag = main_tag.new_tag("span") + new_tag.attrs['id'] = id_ or '' + new_tag.attrs['class'] = class_ or '' + tag.insert_before(new_tag) def clean_headings_content(content: Tag, title: str): + def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag): + if tag_to_be_removed.attrs.get('id'): + insert_span_with_attrs_before_tag(body_tag, + tag_to_be_removed, + id_=tag_to_be_removed.attrs.get('id'), + class_=tag_to_be_removed.attrs.get('class')) + + for sub_tag in tag_to_be_removed.find_all(): + if sub_tag.attrs.get('id'): + insert_span_with_attrs_before_tag(body_tag, + tag_to_be_removed, + id_=sub_tag.attrs['id'], + class_=sub_tag.attrs.get('class')) + title = title.lower() for child in content.contents: if isinstance(child, NavigableString): @@ -249,16 +261,15 @@ def unwrap_structural_tags(body_tag): def _add_span_to_save_ids_for_links(tag_to_be_removed): if tag_to_be_removed.attrs.get('id'): - new_tag = body_tag.new_tag("span") - new_tag.attrs['id'] = tag_to_be_removed.attrs['id'] - tag_to_be_removed.insert_before(new_tag) + 
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, + id_=tag_to_be_removed.attrs['id'], + class_=tag_to_be_removed.attrs.get('class')) structural_tags_names = [ 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' ] # should be before other tags processing, not to remove converter empty tags with id - # not all cases, if span has <p>s and NavigableString, it won't unwrap for s in body_tag.find_all("span"): if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'): continue @@ -266,6 +277,8 @@ def unwrap_structural_tags(body_tag): is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents] if all(is_not_struct_tag): continue + + _add_span_to_save_ids_for_links(s) s.unwrap() for div in body_tag.find_all("div"): @@ -320,7 +333,7 @@ def unwrap_structural_tags(body_tag): if not all(parents_marks_are_body): for x in marks: while x.parent != body_tag: - x.parent.unwrap() # warning! could reflect on formatting/internal links in some cases + x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases parents_marks_are_body = [x.parent == body_tag for x in marks] assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'