diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 0b36191..6ea1130 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -425,17 +425,40 @@ class HTMLPreprocessor:
f'Tag name: {tag.name}')
@staticmethod
- def clean_header_title(title):
+ def clean_title_from_numbering(title: str) -> str:
"""
- Function to remove digits and extra spaces from headers.
-
- :param title: Title to process.
+ Function to remove leading numbering (e.g. "1.2. " or "A. ") from headers; the title is not stripped.
"""
- title = re.sub(r'\s+', ' ', title).strip()
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
- return title.strip()
+ return title
+
+    @staticmethod
+    def clean_tag_from_tabs(tag: NavigableString):
+        """Replace text node ``tag`` in the tree with a whitespace-collapsed copy."""
+        # every run of whitespace (tabs, newlines, spaces) becomes one space
+        cleaned = re.sub(r'\s+', ' ', tag)
+        # the replacement node is constructed directly; it needs no owning soup
+        tag.replace_with(NavigableString(cleaned))
+
+    def clean_tag_from_numbering(self, tag):
+        """Replace text node ``tag`` in the tree with a copy without leading numbering."""
+        # tag is expected to be a NavigableString that is attached to a parent
+        cleaned = self.clean_title_from_numbering(tag)
+        # the replacement node is constructed directly; it needs no owning soup
+        tag.replace_with(NavigableString(cleaned))
+
+    def apply_func_to_last_child(self, tag, func=None):
+        """
+        Follow first children down from tag (despite the name) and call func
+        on the plain NavigableString reached; no-op if there is none.
+        """
+        node = tag
+        while node is not None and not isinstance(node, NavigableString):
+            node = next(iter(node.children), None)  # None once a tag is empty
+        if func is not None and type(node) is NavigableString:
+            func(node)
def _preprocessing_headings(self):
"""
@@ -476,7 +499,7 @@ class HTMLPreprocessor:
number = re.match(r'^(?:\.?\d+\.? ?)+', title)
is_numbered = number is not None
- cleaned_title = self.clean_header_title(tag.text)
+ cleaned_title = self.clean_title_from_numbering(tag.text)
is_introduction = cleaned_title.lower() == 'introduction'
headers_info.append({
@@ -513,32 +536,27 @@ class HTMLPreprocessor:
for i in range(0, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True
- def _clean_header_by_children(self, tag, is_first_span=None):
- children = tag.find_all(recursive=False)
- if not children:
- text = tag.text
- if is_first_span:
- cleaned_text = self.clean_header_title(text)
- else:
- cleaned_text = text # re.sub(r'\s+', ' ', text).strip()
-
- tag.string = cleaned_text
-
- if cleaned_text == '':
- tag.unwrap()
- return
-
- for i, child in enumerate(tag.find_all(recursive=False)):
- if is_first_span and i == 0:
- self._clean_header_by_children(child, True)
- else:
- self._clean_header_by_children(child)
-
def _process_headings(self):
"""
Function to process tags .
"""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+
+        # 1. unwrap <b> and <span> inside headers, drop header attributes
+        for tag in header_tags:
+            # find_all() returns a list, so unwrapping its items while
+            # looping over it does not disturb the iteration
+            for b_tag in tag.find_all("b"):
+                b_tag.unwrap()
+
+            for span in tag.find_all("span"):
+                span.unwrap()
+
+            tag.attrs = {}
+
+        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+
+        # 2. clean text in header from numbering and \n
for tag in header_tags:
if tag.parent.name == "li":
tag.parent.unwrap()
@@ -546,25 +564,34 @@ class HTMLPreprocessor:
tag.parent.unwrap()
title = tag.text
- title = self.clean_header_title(title)
+            title = self.clean_title_from_numbering(title.strip())
if title == "":
tag.unwrap()
else:
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
- self._clean_header_by_children(tag, is_first_span=True)
+                content = list(tag.children)
- b_tags = tag.find_all("b")
- [tag.unwrap() for tag in b_tags]
+                for i, item in enumerate(content):
+                    if type(item) is NavigableString:
+                        # swap in a copy with whitespace runs collapsed
+                        squeezed = NavigableString(re.sub(r'\s+', ' ', item))
+                        item.replace_with(squeezed)
+                        content[i] = squeezed
+                    else:
+                        self.apply_func_to_last_child(item, self.clean_tag_from_tabs)
- spans = tag.find_all("span")
- if spans:
- for span in spans:
- style = span.attrs.get("style")
- span.unwrap()
+                # ignore a lone-space leading text node and any empty strings
+                content = [c for c in (content[1:] if content[0] == ' ' else content) if c != '']
- tag.attrs = {}
+                first = content[0] if content else None
+                if type(first) is NavigableString:
+                    # lstrip first: the numbering regex is anchored at ^
+                    stripped = self.clean_title_from_numbering(first.lstrip())
+                    first.replace_with(NavigableString(stripped))
+                elif first is not None:
+                    self.apply_func_to_last_child(first, self.clean_tag_from_numbering)
def _process_lists(self):
"""