fix marking paragraphs as headings

This commit is contained in:
shirshasa
2020-09-25 13:12:34 +03:00
parent 3546380cf8
commit dafe1d5980

View File

@@ -5,6 +5,7 @@ import re
from shutil import copyfile from shutil import copyfile
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
from config import LawCartaConfig, BookLogger, BookApiWrapper from config import LawCartaConfig, BookLogger, BookApiWrapper
@@ -388,6 +389,16 @@ class HTMLPreprocessor:
for div in divs: for div in divs:
div.unwrap() div.unwrap()
def _check_parent_link_exist_in_toc(self, tag_with_link):
toc_links = []
for a_tag in tag_with_link.find_all('a'):
link_name = a_tag.attrs['name']
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
if toc_item:
toc_links.append(toc_item)
return len(toc_links) > 0
def _process_toc_links(self): def _process_toc_links(self):
""" """
Function to extract nodes which contains TOC links, remove links from file and detect headers. Function to extract nodes which contains TOC links, remove links from file and detect headers.
@@ -399,12 +410,9 @@ class HTMLPreprocessor:
if re.search(r"^h\d$", tag.name): if re.search(r"^h\d$", tag.name):
tag.a.unwrap() tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level # outline_level = tag.name[-1] # TODO: add prediction of the outline level
# TODO: escape from recounting paragraphs every time
elif tag.name == "p": elif tag.name == "p":
link_name = tag.a.attrs['name'] exist_in_toc = self._check_parent_link_exist_in_toc(tag)
toc_item = self.body_tag.find("a", {'href': '#' + link_name}) if tag in self.body_tag.find_all("p") and exist_in_toc:
# TODO: if it is needed, check existence of the link in toc
if tag in self.body_tag.find_all("p"):
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level) new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
text = tag.text text = tag.text
tag.replaceWith(new_tag) tag.replaceWith(new_tag)