From 4853d2c49fa0af0e8f692e15132f07d5b61e97d3 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Mon, 28 Sep 2020 11:36:09 +0300 Subject: [PATCH] fix toc removing --- src/html_preprocessor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 61e1bbd..bd213ec 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -110,13 +110,7 @@ class HTMLPreprocessor: assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more tags def delete_content_before_toc(self): - # replace toc with empty tag - tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) - for table in tables: - table.wrap(self.html_soup.new_tag("TOC")) - table.decompose() - - # remove all tag upper the + # remove all tags above the TOC tag, but only in self.content !!! body tag is not updated toc_tag = self.html_soup.new_tag('TOC') if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 @@ -137,6 +131,12 @@ class HTMLPreprocessor: self._font_to_span() + # replace toc with empty tag + tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) + for table in tables: + table.wrap(self.html_soup.new_tag("TOC")) + table.decompose() + def _process_paragraph(self): """ Function to process

tags (text-align and text-indent value). @@ -578,6 +578,9 @@ class HTMLPreprocessor: Process html code to satisfy LawCarta formatting. """ try: + self.logger_object.log(f'Processing TOC and headers.') + self._process_toc_links() + self.clean_trash() # process main elements of the .html doc @@ -609,9 +612,6 @@ class HTMLPreprocessor: self.content = self.body_tag.find_all(recursive=False) - self.logger_object.log(f'Processing TOC and headers.') - self._process_toc_links() - self.top_level_headers = self._get_top_level_headers() self._mark_introduction_headers()