forked from LiveCarta/BookConverter
improve heading removal
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import re
|
||||
from typing import Union
|
||||
from typing import List, Union
|
||||
from bs4.element import PageElement
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||
|
||||
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
|
||||
clean/remove headings & add span with id
|
||||
|
||||
"""
|
||||
title_of_chapter = title_of_chapter.lower()
|
||||
for tag in chapter_tag.contents:
|
||||
tag: PageElement
|
||||
def text_preparing(tag: PageElement):
|
||||
text: str = tag if isinstance(tag, NavigableString) else tag.text
|
||||
if re.sub(r"[\s\xa0]", "", text):
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
if not isinstance(tag, NavigableString):
|
||||
if title_of_chapter == text or \
|
||||
(title_of_chapter in text and
|
||||
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
|
||||
self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||
tag, chapter_tag)
|
||||
tag.extract()
|
||||
return
|
||||
elif not self._remove_headings_content(tag, title_of_chapter):
|
||||
break
|
||||
else:
|
||||
tag.extract()
|
||||
return
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
return text
|
||||
|
||||
title_of_chapter: str = title_of_chapter.lower()
|
||||
title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
|
||||
(title_of_chapter in text_preparing(tag) and
|
||||
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
|
||||
|
||||
text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
|
||||
if title_in_text:
|
||||
self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||
title_in_text[-1], chapter_tag)
|
||||
title_in_text[-1].extract()
|
||||
elif text_in_title:
|
||||
[self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||
tag, chapter_tag) for tag in text_in_title]
|
||||
[tag.extract() for tag in text_in_title]
|
||||
|
||||
@staticmethod
|
||||
def _class_removing(chapter_tag: BeautifulSoup):
|
||||
|
||||
Reference in New Issue
Block a user