diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 2126ff5..2a3eb3c 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -519,10 +519,14 @@ class HTMLPreprocessor:
if is_first_span:
cleaned_text = self.clean_header_title(text)
else:
- cleaned_text = re.sub(r'\s+', ' ', text).strip()
+ cleaned_text = text # re.sub(r'\s+', ' ', text).strip()
tag.string = cleaned_text
+ if cleaned_text == '':
+ tag.unwrap()
+ return
+
for i, child in enumerate(tag.find_all(recursive=False)):
if is_first_span and i == 0:
self._clean_header_by_children(child, True)
@@ -550,14 +554,13 @@ class HTMLPreprocessor:
self._clean_header_by_children(tag, is_first_span=True)
- span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
- if span_with_style_font:
- for span in span_with_style_font:
- span.unwrap()
+ b_tags = tag.find_all("b")
+ [tag.unwrap() for tag in b_tags]
- span_with_face = tag.find_all("span", {'face': re.compile(r'^.+')})
- if span_with_face:
- for span in span_with_face:
+ spans = tag.find_all("span")
+ if spans:
+ for span in spans:
+ style = span.attrs.get("style")
span.unwrap()
tag.attrs = {}