From d22103239ffbee60cc20b9e626e210f3a7bfe373 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Tue, 13 Oct 2020 18:50:11 +0300 Subject: [PATCH] [LAW-3626] fix --- src/html_preprocessor.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 2126ff5..2a3eb3c 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -519,10 +519,14 @@ class HTMLPreprocessor: if is_first_span: cleaned_text = self.clean_header_title(text) else: - cleaned_text = re.sub(r'\s+', ' ', text).strip() + cleaned_text = text # re.sub(r'\s+', ' ', text).strip() tag.string = cleaned_text + if cleaned_text == '': + tag.unwrap() + return + for i, child in enumerate(tag.find_all(recursive=False)): if is_first_span and i == 0: self._clean_header_by_children(child, True) @@ -550,14 +554,13 @@ class HTMLPreprocessor: self._clean_header_by_children(tag, is_first_span=True) - span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')}) - if span_with_style_font: - for span in span_with_style_font: - span.unwrap() + b_tags = tag.find_all("b") + [tag.unwrap() for tag in b_tags] - span_with_face = tag.find_all("span", {'face': re.compile(r'^.+')}) - if span_with_face: - for span in span_with_face: + spans = tag.find_all("span") + if spans: + for span in spans: + style = span.attrs.get("style") span.unwrap() tag.attrs = {}