forked from LiveCarta/BookConverter
Optimize heading cleaning
This commit is contained in:
@@ -126,7 +126,7 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
|
|||||||
title_of_chapter = title_of_chapter.lower()
|
title_of_chapter = title_of_chapter.lower()
|
||||||
for tag in content_tag.contents:
|
for tag in content_tag.contents:
|
||||||
text = tag if isinstance(tag, NavigableString) else tag.text
|
text = tag if isinstance(tag, NavigableString) else tag.text
|
||||||
if re.sub(r'([\s\xa0])', '', text):
|
if re.sub(r"[\s\xa0]", "", text):
|
||||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||||
text = text.strip() # delete extra spaces
|
text = text.strip() # delete extra spaces
|
||||||
if title_of_chapter == text or \
|
if title_of_chapter == text or \
|
||||||
@@ -134,9 +134,10 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
|
|||||||
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
|
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
|
||||||
_add_span_to_save_ids_for_links(tag, content_tag)
|
_add_span_to_save_ids_for_links(tag, content_tag)
|
||||||
tag.extract()
|
tag.extract()
|
||||||
|
return
|
||||||
elif not isinstance(tag, NavigableString):
|
elif not isinstance(tag, NavigableString):
|
||||||
_remove_headings_content(tag, title_of_chapter)
|
if not _remove_headings_content(tag, title_of_chapter):
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def _tags_to_correspond_livecarta_tag(chapter_tag):
|
def _tags_to_correspond_livecarta_tag(chapter_tag):
|
||||||
@@ -275,13 +276,13 @@ def _preprocess_div_tags(chapter_tag):
|
|||||||
Function replace <div> with <table>:
|
Function replace <div> with <table>:
|
||||||
"""
|
"""
|
||||||
for div in chapter_tag.find_all("div"):
|
for div in chapter_tag.find_all("div"):
|
||||||
if any(attr in ['width', 'border', 'bgcolor'] for attr in div.attrs):
|
if any(attr in ["width", "border", "bgcolor"] for attr in div.attrs):
|
||||||
_wrap_tag_with_table(
|
_wrap_tag_with_table(
|
||||||
chapter_tag,
|
chapter_tag,
|
||||||
tag_to_be_wrapped=div,
|
tag_to_be_wrapped=div,
|
||||||
width=div.attrs['width'] if div.attrs.get('width') else '100',
|
width=div.attrs["width"] if div.attrs.get("width") else "100",
|
||||||
border=div.attrs['border'] if div.attrs.get('border') else None,
|
border=div.attrs["border"] if div.attrs.get("border") else None,
|
||||||
bg_color=div.attrs['bgcolor'] if div.attrs.get('bgcolor') else None)
|
bg_color=div.attrs["bgcolor"] if div.attrs.get("bgcolor") else None)
|
||||||
else:
|
else:
|
||||||
div.name = "p"
|
div.name = "p"
|
||||||
continue
|
continue
|
||||||
|
|||||||
Reference in New Issue
Block a user