forked from LiveCarta/BookConverter
fix heading in json 3.0
This commit is contained in:
@@ -512,7 +512,7 @@ class HTMLPreprocessor:
|
|||||||
for i in range(0, len(self.top_level_headers)):
|
for i in range(0, len(self.top_level_headers)):
|
||||||
self.top_level_headers[i]['should_be_numbered'] = True
|
self.top_level_headers[i]['should_be_numbered'] = True
|
||||||
|
|
||||||
def _dfs(self, tag, is_first_span=None):
|
def _clean_header_by_children(self, tag, is_first_span=None):
|
||||||
children = tag.find_all(recursive=False)
|
children = tag.find_all(recursive=False)
|
||||||
if not children:
|
if not children:
|
||||||
text = tag.text
|
text = tag.text
|
||||||
@@ -525,9 +525,9 @@ class HTMLPreprocessor:
|
|||||||
|
|
||||||
for i, child in enumerate(tag.find_all(recursive=False)):
|
for i, child in enumerate(tag.find_all(recursive=False)):
|
||||||
if is_first_span and i == 0:
|
if is_first_span and i == 0:
|
||||||
self._dfs(child, True)
|
self._clean_header_by_children(child, True)
|
||||||
else:
|
else:
|
||||||
self._dfs(child)
|
self._clean_header_by_children(child)
|
||||||
|
|
||||||
def _process_headings(self):
|
def _process_headings(self):
|
||||||
"""
|
"""
|
||||||
@@ -548,7 +548,7 @@ class HTMLPreprocessor:
|
|||||||
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
||||||
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||||
|
|
||||||
self._dfs(tag, is_first_span=True)
|
self._clean_header_by_children(tag, is_first_span=True)
|
||||||
|
|
||||||
span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
|
span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
|
||||||
if span_with_style_font:
|
if span_with_style_font:
|
||||||
|
|||||||
@@ -1,14 +1,10 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import codecs
|
|
||||||
import json
|
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from config import LawCartaConfig
|
from config import LawCartaConfig
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class JSONConverter:
|
class JSONConverter:
|
||||||
def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
|
def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
|
||||||
self.content_dict = None
|
self.content_dict = None
|
||||||
@@ -37,9 +33,10 @@ class JSONConverter:
|
|||||||
:param ind: Index of header in content list.
|
:param ind: Index of header in content list.
|
||||||
"""
|
"""
|
||||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||||
title = ''
|
title = str(self.content[ind])
|
||||||
for child in self.content[ind].find_all(recursive=False):
|
title = title.replace(f'<{self.content[ind].name}>', '')
|
||||||
title += str(child)
|
title = title.replace(f'</{self.content[ind].name}>', '')
|
||||||
|
title = re.sub(r'^\n', '', title)
|
||||||
|
|
||||||
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
|
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
|
||||||
result = {
|
result = {
|
||||||
@@ -140,7 +137,7 @@ class JSONConverter:
|
|||||||
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
|
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
|
||||||
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
|
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
|
||||||
|
|
||||||
json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
|
json_strc[0]['is_introduction'] = is_first_header_introduction
|
||||||
|
|
||||||
self.content_dict = {
|
self.content_dict = {
|
||||||
"content": json_strc,
|
"content": json_strc,
|
||||||
|
|||||||
Reference in New Issue
Block a user