From adb9b17500244a07642242c61c1602c4346819ac Mon Sep 17 00:00:00 2001 From: shirshasa Date: Thu, 22 Oct 2020 14:21:02 +0300 Subject: [PATCH] fix chapter formatting --- src/html_preprocessor.py | 103 ++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 0b36191..6ea1130 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -425,17 +425,40 @@ class HTMLPreprocessor: f'Tag name: {tag.name}') @staticmethod - def clean_header_title(title): + def clean_title_from_numbering(title: str): """ - Function to remove digits and extra spaces from headers. - - :param title: Title to process. + Function to remove digits from headers. """ - title = re.sub(r'\s+', ' ', title).strip() title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) - return title.strip() + return title + + @staticmethod + def clean_tag_from_tabs(tag: NavigableString): + cleaned = re.sub(r'(\s+)+', ' ', tag) + this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + tag.replace_with(this) + # print('input: ', repr(tag)) + # print('test: ', repr(cleaned)) + + def clean_tag_from_numbering(self, tag): + cleaned = self.clean_title_from_numbering(tag) + this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + tag.replace_with(this) + # print('input: ', repr(tag)) + # print('test: ', repr(cleaned)) + + def apply_func_to_last_child(self, tag, func=None): + """ + works only with constructions like (((child to work with))) + where child is object of NavigableString + """ + if type(tag) is NavigableString: + func(tag) + else: + children = list(tag.children) + self.apply_func_to_last_child(children[0], func) def _preprocessing_headings(self): """ @@ -476,7 +499,7 @@ class HTMLPreprocessor: number = re.match(r'^(?:\.?\d+\.? ?)+', title) is_numbered = number is not None - cleaned_title = self.clean_header_title(tag.text) + cleaned_title = self.clean_title_from_numbering(tag.text) is_introduction = cleaned_title.lower() == 'introduction' headers_info.append({ @@ -513,32 +536,27 @@ class HTMLPreprocessor: for i in range(0, len(self.top_level_headers)): self.top_level_headers[i]['should_be_numbered'] = True - def _clean_header_by_children(self, tag, is_first_span=None): - children = tag.find_all(recursive=False) - if not children: - text = tag.text - if is_first_span: - cleaned_text = self.clean_header_title(text) - else: - cleaned_text = text # re.sub(r'\s+', ' ', text).strip() - - tag.string = cleaned_text - - if cleaned_text == '': - tag.unwrap() - return - - for i, child in enumerate(tag.find_all(recursive=False)): - if is_first_span and i == 0: - self._clean_header_by_children(child, True) - else: - self._clean_header_by_children(child) - def _process_headings(self): """ Function to process tags . """ header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) + + # 1. remove , + for tag in header_tags: + b_tags = tag.find_all("b") + [tag.unwrap() for tag in b_tags] + + spans = tag.find_all("span") + if spans: + for span in spans: + style = span.attrs.get("style") + span.unwrap() + tag.attrs = {} + + header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) + + # 2. clean text in header from numbering and \n for tag in header_tags: if tag.parent.name == "li": tag.parent.unwrap() @@ -546,25 +564,34 @@ class HTMLPreprocessor: tag.parent.unwrap() title = tag.text - title = self.clean_header_title(title) + title = self.clean_title_from_numbering(title) if title == "": tag.unwrap() else: assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \ f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' - self._clean_header_by_children(tag, is_first_span=True) + content = list(tag.children) - b_tags = tag.find_all("b") - [tag.unwrap() for tag in b_tags] + for i, item in enumerate(content): + if type(content[i]) is NavigableString: + cleaned = re.sub(r'(\s+)+', ' ', content[i]) + this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + content[i].replace_with(this) + content[i] = this + else: + self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs) - spans = tag.find_all("span") - if spans: - for span in spans: - style = span.attrs.get("style") - span.unwrap() + content[0] = '' if content[0] == ' ' else content[0] + content = [item for item in content if item != ''] - tag.attrs = {} + if type(content[0]) is NavigableString: + cleaned = self.clean_title_from_numbering(content[0]) + this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + content[0].replace_with(this) + content[0] = this + else: + self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering) def _process_lists(self): """