fix marking paragraphs as headings

This commit is contained in:
shirshasa
2020-09-25 13:12:34 +03:00
parent 3546380cf8
commit dafe1d5980

View File

@@ -5,6 +5,7 @@ import re
from shutil import copyfile
from bs4 import BeautifulSoup, NavigableString
from config import LawCartaConfig, BookLogger, BookApiWrapper
@@ -388,6 +389,16 @@ class HTMLPreprocessor:
for div in divs:
div.unwrap()
def _check_parent_link_exist_in_toc(self, tag_with_link):
    """
    Function to check whether a named anchor inside the tag is the target of a link in the body.

    Every <a> descendant of `tag_with_link` that carries a non-empty `name`
    attribute is looked up in `self.body_tag` as <a href="#<name>">.
    Presumably such an href is a TOC entry pointing at this tag -- confirm
    against the caller.

    :param tag_with_link: tag whose <a> descendants are inspected
    :return: True if at least one named anchor is referenced, False otherwise
    """
    for a_tag in tag_with_link.find_all('a'):
        # Anchors without a `name` (e.g. ordinary hyperlinks that only have an
        # href) cannot be link targets; skip them instead of raising KeyError.
        link_name = a_tag.attrs.get('name')
        if not link_name:
            continue
        # One referenced anchor is enough, so stop at the first hit.
        if self.body_tag.find("a", {'href': '#' + link_name}) is not None:
            return True
    return False
def _process_toc_links(self):
"""
Function to extract nodes which contain TOC links, remove the links from the file and detect headers.
@@ -399,12 +410,9 @@ class HTMLPreprocessor:
if re.search(r"^h\d$", tag.name):
tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
# TODO: escape from recounting paragraphs every time
elif tag.name == "p":
link_name = tag.a.attrs['name']
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
# TODO: if it is needed, check existence of the link in toc
if tag in self.body_tag.find_all("p"):
exist_in_toc = self._check_parent_link_exist_in_toc(tag)
if tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
text = tag.text
tag.replaceWith(new_tag)