forked from LiveCarta/BookConverter
add color processing in headings
This commit is contained in:
@@ -512,6 +512,23 @@ class HTMLPreprocessor:
|
|||||||
for i in range(0, len(self.top_level_headers)):
|
for i in range(0, len(self.top_level_headers)):
|
||||||
self.top_level_headers[i]['should_be_numbered'] = True
|
self.top_level_headers[i]['should_be_numbered'] = True
|
||||||
|
|
||||||
|
def _dfs(self, tag, is_first_span=None):
    """
    Function to normalise the text of every leaf tag below ``tag``.

    The tree is walked depth-first. A tag without child tags gets its
    whole content replaced by a cleaned copy of its text:

    - through ``self.clean_header_title`` when ``is_first_span`` is truthy;
    - otherwise by collapsing each whitespace run to one space and
      stripping both ends.

    ``is_first_span`` is handed down only to the first child at each
    level; every other child is visited with the default value.

    Text that sits directly inside a tag which also has child tags is
    left as it is, because only leaf tags are rewritten.

    :param tag: tag to process in place.
    :param is_first_span: truthy while walking the chain of first children
        that starts at the tag passed by the original caller.
    """
    nested_tags = tag.find_all(recursive=False)

    if not nested_tags:
        # Leaf tag: overwrite its content with the cleaned text and stop.
        raw_text = tag.text
        tag.string = (
            self.clean_header_title(raw_text)
            if is_first_span
            else re.sub(r'\s+', ' ', raw_text).strip()
        )
        return

    # Only the very first child can inherit the "first span" flag.
    leading, following = nested_tags[0], nested_tags[1:]
    if is_first_span:
        self._dfs(leading, True)
    else:
        self._dfs(leading)

    for sibling in following:
        self._dfs(sibling)
|
||||||
|
|
||||||
def _process_headings(self):
|
def _process_headings(self):
|
||||||
"""
|
"""
|
||||||
Function to process tags <h>.
|
Function to process tags <h>.
|
||||||
@@ -530,12 +547,21 @@ class HTMLPreprocessor:
|
|||||||
else:
|
else:
|
||||||
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
||||||
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||||
# if tag.name in ["h4", "h5", "h6"]:
|
|
||||||
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
|
||||||
|
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
self._dfs(tag, is_first_span=True)
|
||||||
new_tag.string = title
|
|
||||||
tag.replace_with(new_tag)
|
span_with_style_font = tag.find_all("span", {'style': re.compile(r'^font.+')})
|
||||||
|
|
||||||
|
if span_with_style_font:
|
||||||
|
for span in span_with_style_font:
|
||||||
|
span.unwrap()
|
||||||
|
|
||||||
|
span_with_face = tag.find_all("span", {'face': re.compile(r'^.+')})
|
||||||
|
if span_with_face:
|
||||||
|
for span in span_with_face:
|
||||||
|
span.unwrap()
|
||||||
|
|
||||||
|
tag.attrs = {}
|
||||||
|
|
||||||
def _process_lists(self):
|
def _process_lists(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user