From 5917fe51e0be30673ad84c62b21bc53a7aab6b92 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Mon, 28 Sep 2020 16:40:20 +0300 Subject: [PATCH] fix heading in json 3.0 --- src/html_preprocessor.py | 8 ++++---- src/json_converter.py | 13 +++++-------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index bd213ec..2126ff5 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -512,7 +512,7 @@ class HTMLPreprocessor: for i in range(0, len(self.top_level_headers)): self.top_level_headers[i]['should_be_numbered'] = True - def _dfs(self, tag, is_first_span=None): + def _clean_header_by_children(self, tag, is_first_span=None): children = tag.find_all(recursive=False) if not children: text = tag.text @@ -525,9 +525,9 @@ class HTMLPreprocessor: for i, child in enumerate(tag.find_all(recursive=False)): if is_first_span and i == 0: - self._dfs(child, True) + self._clean_header_by_children(child, True) else: - self._dfs(child) + self._clean_header_by_children(child) def _process_headings(self): """ @@ -548,7 +548,7 @@ class HTMLPreprocessor: assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \ f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' - self._dfs(tag, is_first_span=True) + self._clean_header_by_children(tag, is_first_span=True) span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')}) if span_with_style_font: diff --git a/src/json_converter.py b/src/json_converter.py index 2d19210..a956282 100644 --- a/src/json_converter.py +++ b/src/json_converter.py @@ -1,14 +1,10 @@ import logging import re -import codecs -import json - from copy import copy from config import LawCartaConfig - class JSONConverter: def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None): self.content_dict = None @@ -37,9 +33,10 @@ class JSONConverter: :param ind: Index of header in content list. """ if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: - title = '' - for child in self.content[ind].find_all(recursive=False): - title += str(child) + title = str(self.content[ind]) + title = title.replace(f'<{self.content[ind].name}>', '') + title = title.replace(f'', '') + title = re.sub(r'^\n', '', title) curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag result = { @@ -140,7 +137,7 @@ class JSONConverter: same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title'] is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] - json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles + json_strc[0]['is_introduction'] = is_first_header_introduction self.content_dict = { "content": json_strc,