diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py
index d3d1cd9..491537d 100644
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -111,6 +111,6 @@ class HTMLPreprocessor:
table.decompose()
def _change_table_of_contents(self):
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
for table in tables:
table.wrap(self.html_soup.new_tag("TOC"))
@@ -138,7 +138,7 @@ class HTMLPreprocessor:
self._font_to_span()
# self._remove_table_of_contents()
self._change_table_of_contents()
def _process_paragraph(self):
"""
def _process_paragraph(self):
"""
@@ -303,10 +303,12 @@ class HTMLPreprocessor:
footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
- if cont_tag.find('a').attrs.get('href') is None:
+ true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'}) or cont_tag.find('a')  # no class match -> first <a>, as before this patch
+ if true_a_tag.attrs.get('href') is None:
cont_tag.a.decompose()
continue
- assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
+
+ assert anc_tag['name'] == true_a_tag['href'][1:], \
'Something went wrong with footnotes after libra conversion'
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
@@ -317,12 +319,13 @@ class HTMLPreprocessor:
anc_tag.replace_with(new_tag)
# extra digits in footnotes from documents downloaded from livecarta
- a_text = cont_tag.a.text
+ a_text = true_a_tag.text
if len(cont_tag.find_all('p')):
sup = cont_tag.find_all('p')[0].find('sup')
if sup and sup.text == a_text:
sup.decompose()
- cont_tag.a.decompose()
+ for tag_a in cont_tag.find_all('a'):
+ tag_a.decompose()
unicode_string = ''
for child in cont_tag.children:
@@ -405,6 +408,9 @@ class HTMLPreprocessor:
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
# TODO: escape from recounting paragraphs every time
elif tag.name == "p":
+ link_name = tag.a.attrs.get('name') if tag.a is not None else None  # a heading paragraph may carry no named anchor
+ toc_item = self.body_tag.find("a", {'href': '#' + link_name}) if link_name else None
+ # TODO: if needed, check that toc_item exists, i.e. that some link points at this heading
if tag in self.body_tag.find_all("p"):
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
text = tag.text