From d33a3864d2730250c475a36f3eca69ed03ae987e Mon Sep 17 00:00:00 2001
From: shirshasa
Date: Mon, 28 Sep 2020 10:22:14 +0300
Subject: [PATCH] add color processing in headings

---
 src/html_preprocessor.py | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 3a3ff13..eb85028 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -512,6 +512,23 @@ class HTMLPreprocessor:
             for i in range(0, len(self.top_level_headers)):
                 self.top_level_headers[i]['should_be_numbered'] = True
 
+    def _dfs(self, tag, is_first_span=None):
+        children = tag.find_all(recursive=False)
+        if not children:
+            text = tag.text
+            if is_first_span:
+                cleaned_text = self.clean_header_title(text)
+            else:
+                cleaned_text = re.sub(r'\s+', ' ', text).strip()
+
+            tag.string = cleaned_text
+
+        for i, child in enumerate(tag.find_all(recursive=False)):
+            if is_first_span and i == 0:
+                self._dfs(child, True)
+            else:
+                self._dfs(child)
+
     def _process_headings(self):
         """
         Function to process tags .
@@ -530,12 +547,21 @@ class HTMLPreprocessor:
             else:
                 assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
                     f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
-                # if tag.name in ["h4", "h5", "h6"]:
-                # tag.name = "h3" # All the lower level headings will be transformed to h3 headings
 
-                new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
-                new_tag.string = title
-                tag.replace_with(new_tag)
+                self._dfs(tag, is_first_span=True)
+
+                span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
+
+                if span_with_style_font:
+                    for span in span_with_style_font:
+                        span.unwrap()
+
+                span_with_face = tag.find_all("span", {'face': re.compile(r'^.+')})
+                if span_with_face:
+                    for span in span_with_face:
+                        span.unwrap()
+
+                tag.attrs = {}
 
     def _process_lists(self):
         """