From e929ac08ffee186959430d4e9bed559f480c1655 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 30 Oct 2020 00:30:52 +0300 Subject: [PATCH] converter fix: cleaning bad utf symbols in hrefs --- src/html_preprocessor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 5962d18..71496f7 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -277,6 +277,12 @@ class HTMLPreprocessor: tag.string = tag.text.replace('\u200c', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')}) + for tag in a_tags_with_href: + tag.string = tag.text.replace('\u200c', '') + tag.string = tag.text.replace('\u200b', '') + tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + @staticmethod def _clean_footnote_content(content): content = content.strip()