From 7f59c5d381c33492c29542bf5d5db11cf88ad4cc Mon Sep 17 00:00:00 2001
From: shirshasa
Date: Fri, 22 May 2020 13:42:53 +0300
Subject: [PATCH] LAW-3261 adding is_introduction field to resulted json

---
 src/book.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/src/book.py b/src/book.py
index 131f47a..f3d95a3 100644
--- a/src/book.py
+++ b/src/book.py
@@ -47,6 +47,7 @@ class Book:
         self.content = list()
         self.footnotes = list()
         self.images = list()
+        self.top_level_headers = None
         self.content_dict = dict()
 
         self.tables_amount = 0
@@ -593,6 +594,84 @@ class Book:
         for tag in header_tags:
             tag.name = 'p'
 
+    def _get_top_level_headers(self):
+        """
+        Function for gathering info about top-level chapters.
+
+        Assume:
+            - Headers with the smallest outline (the digit in the
+              <h1>..<h9> tag name) are top-level chapters.
+              [ It is consistent with the recursive algorithm
+              for saving content to the resulting json structure,
+              which happens in header_to_json() ]
+
+        Side effect:
+            <li>/<ol> wrappers of top-level headers are unwrapped.
+
+        Result:
+            List of dicts {'title', 'is_numbered', 'is_introduction'},
+            one per top-level header; empty list if there are no headers.
+        """
+        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+        if not header_tags:
+            # min() below raises ValueError on an empty sequence
+            return []
+
+        outlines = [int(tag.name[1:]) for tag in header_tags]
+        top_level_outline = min(outlines)
+        top_level_headers = [tag for tag, outline in zip(header_tags, outlines)
+                             if outline == top_level_outline]
+
+        headers_info = []
+        for tag in top_level_headers:
+            if tag.parent.name == "li":
+                tag.parent.unwrap()
+            while tag.parent.name == "ol":
+                tag.parent.unwrap()
+
+            # whitespace is normalized first, so \s? below only ever sees ' '
+            title = re.sub(r'\s+', ' ', tag.text).strip()
+            is_numbered = re.match(r'^(?:\.?\d+\.?\s?)+', title) is not None
+
+            cleaned_title = self.clean_header_title(tag.text)
+            is_introduction = cleaned_title.lower() == 'introduction'
+
+            headers_info.append({
+                'title': cleaned_title,
+                'is_numbered': is_numbered,
+                'is_introduction': is_introduction})
+
+        return headers_info
+
+    def _mark_introduction_headers(self):
+        """
+        Function to find out which header shouldn't be numbered
+        and can be treated as an introductory chapter.
+
+        Assume the first top-level header to be introductory if:
+            1. it is not numbered and the header right after it
+               is numbered (or does not exist)
+            2. its cleaned title equals 'introduction' (case-insensitive)
+
+        Result:
+            Each top-level header is marked with flag
+            should_be_numbered = True/False.
+        """
+        headers = self.top_level_headers
+        if not headers:
+            return
+
+        first = headers[0]
+        # all() of an empty slice is True, so a single header qualifies too
+        next_is_numbered_or_absent = all(
+            header['is_numbered'] for header in headers[1:2])
+        first_is_introductory = first['is_introduction'] or (
+            not first['is_numbered'] and next_is_numbered_or_absent)
+
+        first['should_be_numbered'] = not first_is_introductory
+        for header in headers[1:]:
+            header['should_be_numbered'] = True
+
     def _process_headings(self):
         """
         Function to process tags .
@@ -663,6 +742,10 @@ class Book:
 
         self.log(f'Processing TOC and headers.')
         self._process_toc_links()
+
+        self.top_level_headers = self._get_top_level_headers()
+        self._mark_introduction_headers()
+
         self._process_headings()
 
         self.content = self.body_tag.find_all(recursive=False)
@@ -703,17 +786,21 @@ class Book:
             ind += 1
 
             while ind < len(self.content):
+                # 1. next tag is a header
                 if self.content[ind].name in self.SUPPORTED_HEADERS:
                     outline = int(re.sub(r"^h", "", self.content[ind].name))
+                    # - recursion step until h_i > h_initial
                     if outline > curr_outline:
                         res, ind = self.header_to_json(ind)
                         if ch_content:
                             result[title].append("".join(ch_content))
                             ch_content = []
                         result[title].append(res)
+                    # - current h_i <= h_initial, end of recursion
                     else:
                         # return result, ind
                         break
+                # 2. next tag is not a header. add new paragraphs
                 else:
                     res = self.format_html(str(self.content[ind]))
                     # result[title].append(res)
@@ -776,6 +863,17 @@ class Book:
             self.set_error_status()
             raise exc
 
+        # add is_introduction field if first chapter is Introduction
+        # (both lists may be empty, e.g. for a document without headers)
+        first_header = self.top_level_headers[0] if self.top_level_headers else None
+        first_chapter = json_strc[0] if json_strc else None
+        if (first_header and isinstance(first_chapter, dict)
+                and not first_header['should_be_numbered']):
+            # title should be checked as after deleting content
+            # before toc, some chapters can be deleted
+            if first_header['title'] in first_chapter:
+                first_chapter['is_introduction'] = True
+
         self.content_dict = {
             "content": json_strc,
             "footnotes": self.footnotes