fix marking paragraphs as headings

2020-09-25 13:12:34 +03:00
parent 3546380cf8
commit dafe1d5980
1 changed files with 13 additions and 5 deletions
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -5,6 +5,7 @@ import re
 from shutil import copyfile

 from bs4 import BeautifulSoup, NavigableString
+
 from config import LawCartaConfig, BookLogger, BookApiWrapper


@@ -388,6 +389,16 @@ class HTMLPreprocessor:
        for div in divs:
            div.unwrap()

+    def _check_parent_link_exist_in_toc(self, tag_with_link):
+        """Return True if any named <a> inside tag_with_link is an href target in body_tag."""
+        for a_tag in tag_with_link.find_all('a'):
+            # .get(): anchors with a missing/empty `name` are skipped instead of raising KeyError
+            link_name = a_tag.attrs.get('name')
+            # only existence matters, so return on the first matching '#name' href
+            if link_name and self.body_tag.find("a", {'href': '#' + link_name}):
+                return True
+        return False
+
    def _process_toc_links(self):
        """
        Function to extract nodes which contains TOC links, remove links from file and detect headers.
@@ -399,12 +410,9 @@ class HTMLPreprocessor:
            if re.search(r"^h\d$", tag.name):
                tag.a.unwrap()
                # outline_level = tag.name[-1]  # TODO: add prediction of the outline level
-            # TODO: escape from recounting paragraphs every time
            elif tag.name == "p":
-                link_name = tag.a.attrs['name']
-                toc_item = self.body_tag.find("a", {'href': '#' + link_name})
-                # TODO: if it is needed, check existence of the link in toc
-                if tag in self.body_tag.find_all("p"):
+                exist_in_toc = self._check_parent_link_exist_in_toc(tag)
+                if tag in self.body_tag.find_all("p") and exist_in_toc:
                    new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
                    text = tag.text
                    tag.replaceWith(new_tag)