From d22103239ffbee60cc20b9e626e210f3a7bfe373 Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Tue, 13 Oct 2020 18:50:11 +0300
Subject: [PATCH] [LAW-3626] Fix header cleanup: unwrap empty tags, <b> and all <span>s

---
 src/html_preprocessor.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 2126ff5..2a3eb3c 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -519,10 +519,14 @@ class HTMLPreprocessor:
             if is_first_span:
                 cleaned_text = self.clean_header_title(text)
             else:
-                cleaned_text = re.sub(r'\s+', ' ', text).strip()
+                cleaned_text = text  # re.sub(r'\s+', ' ', text).strip()
 
             tag.string = cleaned_text
 
+            if cleaned_text == '':
+                tag.unwrap()
+                return
+
         for i, child in enumerate(tag.find_all(recursive=False)):
             if is_first_span and i == 0:
                 self._clean_header_by_children(child, True)
@@ -550,14 +554,13 @@ class HTMLPreprocessor:
 
                 self._clean_header_by_children(tag, is_first_span=True)
 
-                span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
-                if span_with_style_font:
-                    for span in span_with_style_font:
-                        span.unwrap()
+                b_tags = tag.find_all("b")
+                [tag.unwrap() for tag in b_tags]
 
-                span_with_face = tag.find_all("span", {'face': re.compile(r'^.+')})
-                if span_with_face:
-                    for span in span_with_face:
+                spans = tag.find_all("span")
+                if spans:
+                    for span in spans:
+                        style = span.attrs.get("style")
                         span.unwrap()
 
                 tag.attrs = {}