Converter fix: strip zero-width Unicode characters (U+200C, U+200B) from link text and hrefs

This commit is contained in:
shirshasa
2020-10-30 00:30:52 +03:00
parent fb6a9aea6c
commit e929ac08ff

View File

@@ -277,6 +277,12 @@ class HTMLPreprocessor:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace('\u200c', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')})
for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '')
tag.string = tag.text.replace('\u200b', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
@staticmethod @staticmethod
def _clean_footnote_content(content): def _clean_footnote_content(content):
content = content.strip() content = content.strip()