From 7f59c5d381c33492c29542bf5d5db11cf88ad4cc Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Fri, 22 May 2020 13:42:53 +0300
Subject: [PATCH] LAW-3261 adding is_introduction field to resulting json

---
 src/book.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
diff --git a/src/book.py b/src/book.py
index 131f47a..f3d95a3 100644
--- a/src/book.py
+++ b/src/book.py
@@ -47,6 +47,7 @@ class Book:
         self.content = list()
         self.footnotes = list()
         self.images = list()
+        self.top_level_headers = None
         self.content_dict = dict()
         self.tables_amount = 0
 
@@ -593,6 +594,72 @@ class Book:
         for tag in header_tags:
             tag.name = 'p'
 
+    def _get_top_level_headers(self):
+        """
+        Function for gathering info about top-level chapters.
+
+        Assume:
+            - Headers with the smallest outline (digit in <h>) are top-level
+              chapters. [Consistent with the recursive algorithm that saves
+              content to the resulting json structure in header_to_json().]
+
+        Side effect: an <li> parent of a header and the <ol> tags around it are unwrapped.
+        Returns: list of dicts, one per top-level header, with keys
+            'title', 'is_numbered' and 'is_introduction'.
+        Raises: ValueError if the body has no <h1>-<h9> tags at all.
+        """
+        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+        if not header_tags:
+            # explicit message instead of min()'s generic empty-sequence error
+            raise ValueError("Document body contains no <h1>-<h9> headers.")
+
+        levels = [int(tag.name[1:]) for tag in header_tags]
+        top_level = min(levels)
+        headers_info = []
+        for tag, level in zip(header_tags, levels):
+            if level != top_level:
+                continue
+            if tag.parent.name == "li":
+                tag.parent.unwrap()
+                while tag.parent.name == "ol":
+                    tag.parent.unwrap()
+            # numbering is detected on the whitespace-normalized raw text
+            title = re.sub(r'\s+', ' ', tag.text).strip()
+            cleaned_title = self.clean_header_title(tag.text)
+            headers_info.append({
+                'title': cleaned_title,
+                'is_numbered': re.match(r'^(?:\.?\d+\.? ?)+', title) is not None,
+                'is_introduction': cleaned_title.lower() == 'introduction'})
+
+        return headers_info
+
+    def _mark_introduction_headers(self):
+        """
+        Decide whether the first top-level header is an introductory chapter
+        that must stay unnumbered, and flag every header accordingly.
+
+        The first header is treated as introductory if either holds:
+            1. it is not numbered, and the header right after it is numbered
+               (or there is no header after it);
+            2. its 'is_introduction' flag is set, i.e. its title equals
+               'introduction'.
+
+        Result:
+        Each dict in self.top_level_headers gets 'should_be_numbered':
+        False for an introductory first header, True for all the others.
+        """
+        headers = self.top_level_headers
+        numbered = [info['is_numbered'] for info in headers]
+        intro = [info['is_introduction'] for info in headers]
+
+        # all() over the empty slice is True, covering "no second header"
+        unnumbered_lead = numbered and numbered[0] == 0 and all(numbered[1:2])
+        first_is_intro = bool(unnumbered_lead or (intro and intro[0]))
+
+        for position, info in enumerate(headers):
+            # only the very first header can ever be left unnumbered
+            info['should_be_numbered'] = not (first_is_intro and position == 0)
+
     def _process_headings(self):
         """
         Function to process tags <h>.
@@ -663,6 +730,10 @@ class Book:
 
             self.log(f'Processing TOC and headers.')
             self._process_toc_links()
+
+            self.top_level_headers = self._get_top_level_headers()
+            self._mark_introduction_headers()
+
             self._process_headings()
 
             self.content = self.body_tag.find_all(recursive=False)
@@ -703,17 +774,21 @@ class Book:
             ind += 1
 
             while ind < len(self.content):
+                # 1. next tag is a header
                 if self.content[ind].name in self.SUPPORTED_HEADERS:
                     outline = int(re.sub(r"^h", "", self.content[ind].name))
+                    # - recursion step until h_i > h_initial
                     if outline > curr_outline:
                         res, ind = self.header_to_json(ind)
                         if ch_content:
                             result[title].append("".join(ch_content))
                         ch_content = []
                         result[title].append(res)
+                    # - current h_i <= h_initial, end of recursion
                     else:
                         # return result, ind
                         break
+                # 2. next tag is not a header. add new paragraphs
                 else:
                     res = self.format_html(str(self.content[ind]))
                     # result[title].append(res)
@@ -776,6 +851,13 @@ class Book:
             self.set_error_status()
             raise exc
 
+        # add is_introduction field if first chapter is Introduction
+        if not self.top_level_headers[0]['should_be_numbered']:
+            # title should be checked as after deleting content
+            # before toc, some chapters can be deleted
+            if self.top_level_headers[0]['title'] in json_strc[0].keys():
+                json_strc[0]['is_introduction'] = True
+
         self.content_dict = {
             "content": json_strc,
             "footnotes": self.footnotes