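add new fixture for creating new user (note: the diff below only changes footnote-anchor lookup and table-of-contents handling in src/html_preprocessor.py)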

2020-09-18 17:45:39 +03:00
parent f27eefb96b
commit 7143442561
1 changed file with 11 additions and 5 deletions
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -111,6 +111,7 @@ class HTMLPreprocessor:
            table.decompose()

    def _change_table_of_contents(self):
+        self._change_table_of_contents()
        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
        for table in tables:
            table.wrap(self.html_soup.new_tag("TOC"))
@@ -138,7 +139,6 @@ class HTMLPreprocessor:

        self._font_to_span()
        # self._remove_table_of_contents()
-        self._change_table_of_contents()

    def _process_paragraph(self):
        """
@@ -303,10 +303,12 @@ class HTMLPreprocessor:
        footnotes = []

        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
-            if cont_tag.find('a').attrs.get('href') is None:
+            true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'})
+            if true_a_tag.attrs.get('href') is None:
                cont_tag.a.decompose()
                continue
-            assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
+
+            assert anc_tag['name'] == true_a_tag['href'][1:], \
                'Something went wrong with footnotes after libra conversion'

            new_tag = BeautifulSoup(features='lxml').new_tag('sup')
@@ -317,12 +319,13 @@ class HTMLPreprocessor:
            anc_tag.replace_with(new_tag)

            # extra digits in footnotes from documents downloaded from livecarta
-            a_text = cont_tag.a.text
+            a_text = true_a_tag.text
            if len(cont_tag.find_all('p')):
                sup = cont_tag.find_all('p')[0].find('sup')
                if sup and sup.text == a_text:
                    sup.decompose()
-            cont_tag.a.decompose()
+            for tag_a in cont_tag.find_all('a'):
+                tag_a.decompose()

            unicode_string = ''
            for child in cont_tag.children:
@@ -405,6 +408,9 @@ class HTMLPreprocessor:
                # outline_level = tag.name[-1]  # TODO: add prediction of the outline level
            # TODO: escape from recounting paragraphs every time
            elif tag.name == "p":
+                link_name = tag.a.attrs['name']
+                toc_item = self.body_tag.find("a", {'href': '#' + link_name})
+                # TODO: if it is needed, check existence of the link in toc
                if tag in self.body_tag.find_all("p"):
                    new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
                    text = tag.text