diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index 3ed60a4..3a3ff13 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -5,6 +5,7 @@
 import re
 from shutil import copyfile
 from bs4 import BeautifulSoup, NavigableString
+
 from config import LawCartaConfig, BookLogger, BookApiWrapper
 
 
@@ -388,6 +389,23 @@ class HTMLPreprocessor:
         for div in divs:
             div.unwrap()
 
+    def _check_parent_link_exist_in_toc(self, tag_with_link):
+        """
+        Function to check whether any named anchor inside the tag is the target of a link in the body.
+
+        Returns True as soon as one <a name="X"> found inside tag_with_link has a matching
+        <a href="#X"> somewhere in self.body_tag, otherwise False.
+        """
+        for a_tag in tag_with_link.find_all('a'):
+            link_name = a_tag.attrs.get('name')
+            # Anchors without a (non-empty) name cannot be link targets; skip them instead of raising KeyError.
+            if not link_name:
+                continue
+            if self.body_tag.find("a", {'href': '#' + link_name}):
+                return True
+
+        return False
+
     def _process_toc_links(self):
         """
         Function to extract nodes which contains TOC links, remove links from file and detect headers.
@@ -399,12 +417,9 @@ class HTMLPreprocessor:
             if re.search(r"^h\d$", tag.name):
                 tag.a.unwrap()
                 # outline_level = tag.name[-1] # TODO: add prediction of the outline level
-                # TODO: escape from recounting paragraphs every time
             elif tag.name == "p":
-                link_name = tag.a.attrs['name']
-                toc_item = self.body_tag.find("a", {'href': '#' + link_name})
-                # TODO: if it is needed, check existence of the link in toc
-                if tag in self.body_tag.find_all("p"):
+                exist_in_toc = self._check_parent_link_exist_in_toc(tag)
+                if tag in self.body_tag.find_all("p") and exist_in_toc:
                     new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
                     text = tag.text
                     tag.replaceWith(new_tag)