diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 48a5baf..3298004 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -429,6 +429,7 @@ class HTMLPreprocessor: """ Function to remove digits from headers. """ + title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) @@ -458,7 +459,8 @@ class HTMLPreprocessor: func(tag) else: children = list(tag.children) - self.apply_func_to_last_child(children[0], func) + if children: + self.apply_func_to_last_child(children[0], func) def _preprocessing_headings(self): """ @@ -573,6 +575,11 @@ class HTMLPreprocessor: content = list(tag.children) + # do not take into account rubbish empty tags like , but don't remove them + content = [item for item in content if + (type(item) is not NavigableString and item.text != '') + or (type(item) is NavigableString)] + for i, item in enumerate(content): if type(content[i]) is NavigableString: cleaned = re.sub(r'(\s+)+', ' ', content[i])