forked from LiveCarta/BookConverter
fix marking paragraphs as headings
This commit is contained in:
@@ -5,6 +5,7 @@ import re
|
||||
from shutil import copyfile
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
|
||||
from config import LawCartaConfig, BookLogger, BookApiWrapper
|
||||
|
||||
|
||||
@@ -388,6 +389,16 @@ class HTMLPreprocessor:
|
||||
for div in divs:
|
||||
div.unwrap()
|
||||
|
||||
def _check_parent_link_exist_in_toc(self, tag_with_link):
|
||||
toc_links = []
|
||||
for a_tag in tag_with_link.find_all('a'):
|
||||
link_name = a_tag.attrs['name']
|
||||
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
|
||||
if toc_item:
|
||||
toc_links.append(toc_item)
|
||||
|
||||
return len(toc_links) > 0
|
||||
|
||||
def _process_toc_links(self):
|
||||
"""
|
||||
Function to extract nodes which contains TOC links, remove links from file and detect headers.
|
||||
@@ -399,12 +410,9 @@ class HTMLPreprocessor:
|
||||
if re.search(r"^h\d$", tag.name):
|
||||
tag.a.unwrap()
|
||||
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
|
||||
# TODO: escape from recounting paragraphs every time
|
||||
elif tag.name == "p":
|
||||
link_name = tag.a.attrs['name']
|
||||
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
|
||||
# TODO: if it is needed, check existence of the link in toc
|
||||
if tag in self.body_tag.find_all("p"):
|
||||
exist_in_toc = self._check_parent_link_exist_in_toc(tag)
|
||||
if tag in self.body_tag.find_all("p") and exist_in_toc:
|
||||
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
|
||||
text = tag.text
|
||||
tag.replaceWith(new_tag)
|
||||
|
||||
Reference in New Issue
Block a user