diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index d3d1cd9..491537d 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -111,6 +111,7 @@ class HTMLPreprocessor: table.decompose() def _change_table_of_contents(self): + self._change_table_of_contents() tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) for table in tables: table.wrap(self.html_soup.new_tag("TOC")) @@ -138,7 +139,6 @@ class HTMLPreprocessor: self._font_to_span() # self._remove_table_of_contents() - self._change_table_of_contents() def _process_paragraph(self): """ @@ -303,10 +303,12 @@ class HTMLPreprocessor: footnotes = [] for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): - if cont_tag.find('a').attrs.get('href') is None: + true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'}) + if true_a_tag.attrs.get('href') is None: cont_tag.a.decompose() continue - assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \ + + assert anc_tag['name'] == true_a_tag['href'][1:], \ 'Something went wrong with footnotes after libra conversion' new_tag = BeautifulSoup(features='lxml').new_tag('sup') @@ -317,12 +319,13 @@ class HTMLPreprocessor: anc_tag.replace_with(new_tag) # extra digits in footnotes from documents downloaded from livecarta - a_text = cont_tag.a.text + a_text = true_a_tag.text if len(cont_tag.find_all('p')): sup = cont_tag.find_all('p')[0].find('sup') if sup and sup.text == a_text: sup.decompose() - cont_tag.a.decompose() + for tag_a in cont_tag.find_all('a'): + tag_a.decompose() unicode_string = '' for child in cont_tag.children: @@ -405,6 +408,9 @@ class HTMLPreprocessor: # outline_level = tag.name[-1] # TODO: add prediction of the outline level # TODO: escape from recounting paragraphs every time elif tag.name == "p": + link_name = tag.a.attrs['name'] + toc_item = self.body_tag.find("a", {'href': '#' + link_name}) + # TODO: if it is needed, check existence of the link in toc if tag in self.body_tag.find_all("p"): new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level) text = tag.text