From 5917fe51e0be30673ad84c62b21bc53a7aab6b92 Mon Sep 17 00:00:00 2001
From: shirshasa <katerinagorbac@gmail.com>
Date: Mon, 28 Sep 2020 16:40:20 +0300
Subject: [PATCH] Fix heading title extraction in JSON 3.0 converter

---
 src/html_preprocessor.py |  8 ++++----
 src/json_converter.py    | 13 +++++--------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index bd213ec..2126ff5 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -512,7 +512,7 @@ class HTMLPreprocessor:
             for i in range(0, len(self.top_level_headers)):
                 self.top_level_headers[i]['should_be_numbered'] = True
 
-    def _dfs(self, tag, is_first_span=None):
+    def _clean_header_by_children(self, tag, is_first_span=None):
         children = tag.find_all(recursive=False)
         if not children:
             text = tag.text
@@ -525,9 +525,9 @@ class HTMLPreprocessor:
 
         for i, child in enumerate(tag.find_all(recursive=False)):
             if is_first_span and i == 0:
-                self._dfs(child, True)
+                self._clean_header_by_children(child, True)
             else:
-                self._dfs(child)
+                self._clean_header_by_children(child)
 
     def _process_headings(self):
         """
@@ -548,7 +548,7 @@ class HTMLPreprocessor:
                 assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
                     f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
 
-                self._dfs(tag, is_first_span=True)
+                self._clean_header_by_children(tag, is_first_span=True)
 
                 span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
                 if span_with_style_font:
diff --git a/src/json_converter.py b/src/json_converter.py
index 2d19210..a956282 100644
--- a/src/json_converter.py
+++ b/src/json_converter.py
@@ -1,14 +1,10 @@
 import logging
 import re
-import codecs
-import json
-
 from copy import copy
 
 from config import LawCartaConfig
 
 
-
 class JSONConverter:
     def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
         self.content_dict = None
@@ -37,9 +33,10 @@ class JSONConverter:
         :param ind: Index of header in content list.
         """
         if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
-            title = ''
-            for child in self.content[ind].find_all(recursive=False):
-                title += str(child)
+            title = str(self.content[ind])
+            title = title.replace(f'<{self.content[ind].name}>', '')
+            title = title.replace(f'</{self.content[ind].name}>', '')
+            title = re.sub(r'^\n', '', title)
 
             curr_outline = int(re.sub(r"^h", "", self.content[ind].name))  # extract outline from tag
             result = {
@@ -140,7 +137,7 @@ class JSONConverter:
             same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
             is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
 
-            json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
+            json_strc[0]['is_introduction'] = is_first_header_introduction
 
         self.content_dict = {
             "content": json_strc,