Fix heading in JSON 3.0

2020-09-28 16:40:20 +03:00
parent e913a5f49a
commit 5917fe51e0
2 changed files with 9 additions and 12 deletions
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -512,7 +512,7 @@ class HTMLPreprocessor:
            for i in range(0, len(self.top_level_headers)):
                self.top_level_headers[i]['should_be_numbered'] = True

-    def _dfs(self, tag, is_first_span=None):
+    def _clean_header_by_children(self, tag, is_first_span=None):
        children = tag.find_all(recursive=False)
        if not children:
            text = tag.text
@@ -525,9 +525,9 @@ class HTMLPreprocessor:

        for i, child in enumerate(tag.find_all(recursive=False)):
            if is_first_span and i == 0:
-                self._dfs(child, True)
+                self._clean_header_by_children(child, True)
            else:
-                self._dfs(child)
+                self._clean_header_by_children(child)

    def _process_headings(self):
        """
@@ -548,7 +548,7 @@ class HTMLPreprocessor:
                assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
                    f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'

-                self._dfs(tag, is_first_span=True)
+                self._clean_header_by_children(tag, is_first_span=True)

                span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
                if span_with_style_font:
--- a/src/json_converter.py
+++ b/src/json_converter.py
@@ -1,14 +1,10 @@
 import logging
 import re
-import codecs
-import json
-
 from copy import copy

 from config import LawCartaConfig


-
 class JSONConverter:
    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
        self.content_dict = None
@@ -37,9 +33,10 @@ class JSONConverter:
        :param ind: Index of header in content list.
        """
        if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
-            title = ''
-            for child in self.content[ind].find_all(recursive=False):
-                title += str(child)
+            title = str(self.content[ind])
+            title = title.replace(f'<{self.content[ind].name}>', '')
+            title = title.replace(f'</{self.content[ind].name}>', '')
+            title = re.sub(r'^\n', '', title)

            curr_outline = int(re.sub(r"^h", "", self.content[ind].name))  # extract outline from tag
            result = {
@@ -140,7 +137,7 @@ class JSONConverter:
            same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
            is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']

-            json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
+            json_strc[0]['is_introduction'] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,