diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 2126ff5..2a3eb3c 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -519,10 +519,14 @@ class HTMLPreprocessor: if is_first_span: cleaned_text = self.clean_header_title(text) else: - cleaned_text = re.sub(r'\s+', ' ', text).strip() + cleaned_text = text # re.sub(r'\s+', ' ', text).strip() tag.string = cleaned_text + if cleaned_text == '': + tag.unwrap() + return + for i, child in enumerate(tag.find_all(recursive=False)): if is_first_span and i == 0: self._clean_header_by_children(child, True) @@ -550,14 +554,13 @@ class HTMLPreprocessor: self._clean_header_by_children(tag, is_first_span=True) - span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')}) - if span_with_style_font: - for span in span_with_style_font: - span.unwrap() + b_tags = tag.find_all("b") + [tag.unwrap() for tag in b_tags] - span_with_face = tag.find_all("span", {'face': re.compile(r'^.+')}) - if span_with_face: - for span in span_with_face: + spans = tag.find_all("span") + if spans: + for span in spans: + style = span.attrs.get("style") span.unwrap() tag.attrs = {}