From adb9b17500244a07642242c61c1602c4346819ac Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Thu, 22 Oct 2020 14:21:02 +0300
Subject: [PATCH] fix chapter formatting

---
 src/html_preprocessor.py | 103 ++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 38 deletions(-)
diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 0b36191..6ea1130 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -425,17 +425,40 @@ class HTMLPreprocessor:
                                        f'Tag name: {tag.name}')
 
     @staticmethod
-    def clean_header_title(title):
+    def clean_title_from_numbering(title: str) -> str:
         """
-        Function to remove digits and extra spaces from headers.
-
-        :param title: Title to process.
+        Remove leading numbering ("1.2. ", "A. ") from a header title; whitespace is neither collapsed nor stripped.
         """
-        title = re.sub(r'\s+', ' ', title).strip()
         title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
         # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title)  # delete chapter numbering from the title
         title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
-        return title.strip()
+        return title
+
+    @staticmethod
+    def clean_tag_from_tabs(tag: NavigableString):
+        """
+        Replace the string node `tag` in its tree with a copy whose whitespace runs are collapsed to one space.
+        """
+        # NavigableString(...) builds the same node as new_string() without creating a throwaway BeautifulSoup.
+        tag.replace_with(NavigableString(re.sub(r'\s+', ' ', tag)))
+
+    def clean_tag_from_numbering(self, tag):
+        """
+        Replace the string node `tag` in its tree with a copy stripped of its leading numbering.
+        """
+        # NavigableString(...) builds the same node as new_string() without creating a throwaway BeautifulSoup.
+        tag.replace_with(NavigableString(self.clean_title_from_numbering(tag)))
+
+    def apply_func_to_last_child(self, tag, func=None):
+        """
+        Follow first children from `tag`, like (((text))), and call `func` on the NavigableString found there.
+        Does nothing (instead of raising) when the chain ends in an empty tag or another node kind, or func is None.
+        """
+        node = tag
+        while node is not None and type(node) is not NavigableString:
+            node = next(iter(getattr(node, "children", ())), None)
+        if node is not None and func is not None:
+            func(node)
 
     def _preprocessing_headings(self):
         """
@@ -476,7 +499,7 @@ class HTMLPreprocessor:
                 number = re.match(r'^(?:\.?\d+\.? ?)+', title)
                 is_numbered = number is not None
 
-                cleaned_title = self.clean_header_title(tag.text)
+                cleaned_title = self.clean_title_from_numbering(tag.text)
                 is_introduction = cleaned_title.lower() == 'introduction'
 
                 headers_info.append({
@@ -513,32 +536,27 @@ class HTMLPreprocessor:
             for i in range(0, len(self.top_level_headers)):
                 self.top_level_headers[i]['should_be_numbered'] = True
 
-    def _clean_header_by_children(self, tag, is_first_span=None):
-        children = tag.find_all(recursive=False)
-        if not children:
-            text = tag.text
-            if is_first_span:
-                cleaned_text = self.clean_header_title(text)
-            else:
-                cleaned_text = text  # re.sub(r'\s+', ' ', text).strip()
-
-            tag.string = cleaned_text
-
-            if cleaned_text == '':
-                tag.unwrap()
-                return
-
-        for i, child in enumerate(tag.find_all(recursive=False)):
-            if is_first_span and i == 0:
-                self._clean_header_by_children(child, True)
-            else:
-                self._clean_header_by_children(child)
-
     def _process_headings(self):
         """
         Function to process tags <h>.
         """
         header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+
+        # 1. remove <b>, <span>
+        for tag in header_tags:
+            # unwrap() keeps the children in place, only the wrapper tag goes away
+            for bold in tag.find_all("b"):
+                bold.unwrap()
+
+            for span in tag.find_all("span"):
+                span.unwrap()
+
+            # drop the header's own attributes
+            tag.attrs = {}
+
+        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+
+        # 2. clean text in header from numbering and \n
         for tag in header_tags:
             if tag.parent.name == "li":
                 tag.parent.unwrap()
@@ -546,25 +564,34 @@ class HTMLPreprocessor:
                     tag.parent.unwrap()
 
             title = tag.text
-            title = self.clean_header_title(title)
+            title = self.clean_title_from_numbering(re.sub(r'\s+', ' ', title).strip()).strip()  # emptiness check only
             if title == "":
                 tag.unwrap()
             else:
                 assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
                     f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
 
-                self._clean_header_by_children(tag, is_first_span=True)
+                content = list(tag.children)
 
-                b_tags = tag.find_all("b")
-                [tag.unwrap() for tag in b_tags]
+                # collapse whitespace runs in direct strings and in the string at the
+                # end of each child's first-child chain
+                for i, item in enumerate(content):
+                    if type(item) is NavigableString:
+                        content[i] = NavigableString(re.sub(r'\s+', ' ', item))
+                        item.replace_with(content[i])
+                    else:
+                        self.apply_func_to_last_child(item, self.clean_tag_from_tabs)
 
-                spans = tag.find_all("span")
-                if spans:
-                    for span in spans:
-                        style = span.attrs.get("style")
-                        span.unwrap()
+                # skip a whitespace-only first node and empty strings so numbering is stripped from real text
+                content = [item for i, item in enumerate(content) if item != '' and not (i == 0 and item == ' ')]
 
-                tag.attrs = {}
+                if not content:
+                    continue  # only whitespace was left: nothing to strip numbering from
+                if type(content[0]) is NavigableString:
+                    # lstrip: a leading space would stop the ^-anchored numbering patterns from matching
+                    content[0].replace_with(NavigableString(self.clean_title_from_numbering(content[0].lstrip())))
+                else:
+                    self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering)
 
     def _process_lists(self):
         """