diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py
index 10714cf..265ae61 100644
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -302,6 +302,11 @@ def unwrap_structural_tags(body_tag):
'figure', 'footer', 'iframe', 'span', 'p'
]
+ # remove comment nodes found inside any tag under body_tag
+ for tag in body_tag.find_all():
+ for element in tag(text=lambda text: isinstance(text, Comment)):
+ element.extract()
+
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
@@ -500,7 +505,7 @@ def preprocess_pre_tags(chapter_tag):
for child in pre.children:
if isinstance(child, NavigableString):
- cleaned_text = _prepare_formatted(pre.text)
+ cleaned_text = _prepare_formatted(str(child))
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
for string in sub_strings:
new_tag.append(NavigableString(string))
@@ -519,8 +524,7 @@ def preprocess_pre_tags(chapter_tag):
new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
"font-size: 14px; white-space: nowrap;"
- pre.insert_before(new_tag)
- pre.extract()
+ pre.replace_with(new_tag)
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
p_for_br = chapter_tag.new_tag("p")
p_for_br.string = "\xa0"
@@ -570,10 +574,6 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']):
del tag.attrs['class']
- # 3. comments removal
- for tag in chapter_tag.find_all():
- for element in tag(text=lambda text: isinstance(text, Comment)):
- element.extract()
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = clean_title_from_numbering(title_str)
return title_str, str(chapter_tag)