forked from LiveCarta/BookConverter
fix marking paragraphs as headings
This commit is contained in:
@@ -5,6 +5,7 @@ import re
|
|||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
|
||||||
from config import LawCartaConfig, BookLogger, BookApiWrapper
|
from config import LawCartaConfig, BookLogger, BookApiWrapper
|
||||||
|
|
||||||
|
|
||||||
@@ -388,6 +389,16 @@ class HTMLPreprocessor:
|
|||||||
for div in divs:
|
for div in divs:
|
||||||
div.unwrap()
|
div.unwrap()
|
||||||
|
|
||||||
|
def _check_parent_link_exist_in_toc(self, tag_with_link):
|
||||||
|
toc_links = []
|
||||||
|
for a_tag in tag_with_link.find_all('a'):
|
||||||
|
link_name = a_tag.attrs['name']
|
||||||
|
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
|
||||||
|
if toc_item:
|
||||||
|
toc_links.append(toc_item)
|
||||||
|
|
||||||
|
return len(toc_links) > 0
|
||||||
|
|
||||||
def _process_toc_links(self):
|
def _process_toc_links(self):
|
||||||
"""
|
"""
|
||||||
Function to extract nodes which contains TOC links, remove links from file and detect headers.
|
Function to extract nodes which contains TOC links, remove links from file and detect headers.
|
||||||
@@ -399,12 +410,9 @@ class HTMLPreprocessor:
|
|||||||
if re.search(r"^h\d$", tag.name):
|
if re.search(r"^h\d$", tag.name):
|
||||||
tag.a.unwrap()
|
tag.a.unwrap()
|
||||||
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
|
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
|
||||||
# TODO: escape from recounting paragraphs every time
|
|
||||||
elif tag.name == "p":
|
elif tag.name == "p":
|
||||||
link_name = tag.a.attrs['name']
|
exist_in_toc = self._check_parent_link_exist_in_toc(tag)
|
||||||
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
|
if tag in self.body_tag.find_all("p") and exist_in_toc:
|
||||||
# TODO: if it is needed, check existence of the link in toc
|
|
||||||
if tag in self.body_tag.find_all("p"):
|
|
||||||
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
|
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
|
||||||
text = tag.text
|
text = tag.text
|
||||||
tag.replaceWith(new_tag)
|
tag.replaceWith(new_tag)
|
||||||
|
|||||||
Reference in New Issue
Block a user