From a1d7ab0a8d70cc231a1e30f8b3dfdb7804573370 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 23 Oct 2020 12:54:54 +0300 Subject: [PATCH] converter fix --- src/html_preprocessor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 48a5baf..3298004 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -429,6 +429,7 @@ class HTMLPreprocessor: """ Function to remove digits from headers. """ + title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) @@ -458,7 +459,8 @@ class HTMLPreprocessor: func(tag) else: children = list(tag.children) - self.apply_func_to_last_child(children[0], func) + if children: + self.apply_func_to_last_child(children[0], func) def _preprocessing_headings(self): """ @@ -573,6 +575,11 @@ class HTMLPreprocessor: content = list(tag.children) + # do not take into account rubbish empty tags like , but don't remove them + content = [item for item in content if + (type(item) is not NavigableString and item.text != '') + or (type(item) is NavigableString)] + for i, item in enumerate(content): if type(content[i]) is NavigableString: cleaned = re.sub(r'(\s+)+', ' ', content[i])