diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 48a5baf..3298004 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -429,6 +429,7 @@ class HTMLPreprocessor:
"""
Function to remove digits from headers.
"""
+ title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
@@ -458,7 +459,8 @@ class HTMLPreprocessor:
func(tag)
else:
children = list(tag.children)
- self.apply_func_to_last_child(children[0], func)
+ if children:
+ self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self):
"""
@@ -573,6 +575,11 @@ class HTMLPreprocessor:
content = list(tag.children)
+ # skip junk empty tags (elements whose text is empty) while processing, but don't remove them
+ content = [item for item in content if
+ (type(item) is not NavigableString and item.text != '')
+ or (type(item) is NavigableString)]
+
for i, item in enumerate(content):
if type(content[i]) is NavigableString:
cleaned = re.sub(r'(\s+)+', ' ', content[i])