From 90c55875706dff5ad5d211baf55a84a5d4d35728 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 9 Jul 2021 12:24:36 +0300 Subject: [PATCH] epub converter: update footnotes --- src/epub_postprocessor.py | 9 ++++++++- src/html_epub_preprocessor.py | 13 +++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 42e5f8d..b7813bc 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -48,8 +48,15 @@ class EpubPostprocessor: self.logger.log('Footnotes processing.') self.footnotes = [] + self.noterefs = [] for href in self.href2soup_html: - self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html)) + footnotes, noterefs = preprocess_footnotes(self.href2soup_html[href], self.href2soup_html) + self.footnotes.extend(footnotes) + self.noterefs.extend(noterefs) + for i, noteref in enumerate(self.noterefs): + noteref.attrs['data-id'] = i + 1 + noteref.attrs['id'] = f'footnote-{i + 1}' + self.logger.log(f'Added {len(self.footnotes)} footnotes.') self.logger.log('TOC processing.') self.href2subchapter_ids = defaultdict(list) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 886d072..d783a47 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -194,10 +194,14 @@ def replace_with_livecarta_anchor_tag(anchor, i): new_tag['data-id'] = i + 1 new_tag['id'] = f'footnote-{i + 1}' new_tag.string = '*' + if anchor.parent.name == 'sup': + anchor.parent.unwrap() anchor.replace_with(new_tag) + return new_tag -def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]: +def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[ + list, list]: """ This function should be earlier that adding fonts in pipeline. @@ -209,6 +213,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'}) bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')]) noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags] + new_noterefs_tags = [] [tag.decompose() for tag in bad_noterefs_tags] def parse_a_tag_href(s: str): @@ -257,13 +262,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note footnote_tag = expected_footnote_tags[0] if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': footnote_tag = footnote_tag.parent - replace_with_livecarta_anchor_tag(noteref_tag, i) + new_noterefs_tags.append(replace_with_livecarta_anchor_tag(noteref_tag, i)) content = footnote_tag.text footnote_tag.decompose() footnotes.append(content) - return footnotes + return footnotes, new_noterefs_tags def unwrap_structural_tags(body_tag): @@ -503,7 +508,7 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr preprocess_block_tags(chapter_tag) # 2. class removal for tag in chapter_tag.find_all(recursive=True): - if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']): + if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor','footnote-element']): del tag.attrs['class'] # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))