From d4fb6223e6885331502e452c2958535abfc506d9 Mon Sep 17 00:00:00 2001
From: Jeniamakarchik <eniamak@gmail.com>
Date: Fri, 7 Feb 2020 12:03:50 +0300
Subject: [PATCH] Update book.py

Add solution for skipping everything before the table of contents
---
 src/book.py | 115 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 86 insertions(+), 29 deletions(-)

diff --git a/src/book.py b/src/book.py
index 4c278f3..ceba5e6 100644
--- a/src/book.py
+++ b/src/book.py
@@ -4,6 +4,8 @@ import logging
 import os
 import pathlib
 import re
+from copy import copy
+from shutil import copyfile
 
 from bs4 import BeautifulSoup
 
@@ -28,12 +30,12 @@ class Book:
     }
     SUPPORTED_HEADERS = ["h1", "h2", "h3"]
 
-    def __init__(self, book_id, access=None):
+    def __init__(self, book_id=0, access=None, file_path=None, output_path=None):
         self.book_id = book_id
         self.access = access
+        self.file_path = file_path
+        self.output_path = output_path
 
-        self.file_path = None
-        self.output_path = None
         self.logger = None
         self.html_soup = None
         self.body_tag = None
@@ -268,6 +270,19 @@ class Book:
         for table in tables:
             table.decompose()
 
+    def _change_table_of_contents(self):
+        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
+        for table in tables:
+            table.wrap(self.html_soup.new_tag("TOC"))
+            table.decompose()
+
+    def delete_content_before_toc(self):
+        toc_tag = self.html_soup.new_tag('TOC')
+        if toc_tag in self.content:
+            ind = self.content.index(toc_tag) + 1
+            self.content = self.content[ind:]
+        self.write_html_from_list()
+
     def clean_trash(self):
         """
         Function to remove all styles and tags we don't need.
@@ -283,7 +298,8 @@ class Book:
         self._clean_underline_links()
 
         self._font_to_span()
-        self._remove_table_of_contents()
+        # self._remove_table_of_contents()
+        self._change_table_of_contents()
 
     def _process_paragraph(self):
         """
@@ -359,8 +375,8 @@ class Book:
 
             new_tag = BeautifulSoup(features='lxml').new_tag('sup')
             new_tag['class'] = 'footnote-element'
-            new_tag['data-id'] = i+1
-            new_tag['id'] = f'footnote-{i+1}'
+            new_tag['data-id'] = i + 1
+            new_tag['id'] = f'footnote-{i + 1}'
             new_tag.string = '*'
             anc_tag.replace_with(new_tag)
 
@@ -385,21 +401,24 @@ class Book:
         imgs = self.body_tag.find_all('img')
 
         if len(imgs):
-            # new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
-            # new_path.mkdir(exist_ok=True)
+            if self.access is None:
+                folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+                new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
+                new_path.mkdir(exist_ok=True)
 
             for img in imgs:
                 img_name = img.attrs.get('src')
                 img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
 
-                link = self.access.send_image(img_path, self.book_id)
-                img.attrs['src'] = link
-
-                # img_size = os.path.getsize(img_path)
-                # print(f'{img_name} successfully loaded. Image size: {img_size}.')
-                # new_img_path = new_path / img_name
-                # copyfile(img_path, new_img_path)
-                # img.attrs["src"] = str(new_img_path)
+                if self.access is not None:
+                    link = self.access.send_image(img_path, self.book_id)
+                    img.attrs['src'] = link
+                else:
+                    img_size = os.path.getsize(img_path)
+                    print(f'{img_name} successfully loaded. Image size: {img_size}.')
+                    new_img_path = new_path / img_name
+                    copyfile(img_path, new_img_path)
+                    img.attrs["src"] = str(new_img_path)
 
         self.images = imgs
 
@@ -472,7 +491,10 @@ class Book:
                 tag.replace_with(new_tag)
 
     def write_html_from_list(self, file_name='url_test.html'):
-        with open(file_name, 'w', encoding='utf-8') as f_out:
+        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        file_path = pathlib.Path(os.path.join(folder_path, file_name))
+
+        with open(file_path, 'w', encoding='utf-8') as f_out:
             # f_out.write("".join([tag.prettify() for tag in self.content]))
             f_out.write(self.body_tag.prettify())
             self.logger.info(f'Check test file - url_test.html.')
@@ -502,17 +524,14 @@ class Book:
 
         self.content = self.body_tag.find_all(recursive=False)
 
-        # if self.train_mode:
-        #     self.model.train_model(self.content)
-        # else:
-        #     self.model.predict_headers(self.content)
-
-        self.write_html_from_list()
-
         self._process_toc_links()
         self._process_headings()
 
         self.content = self.body_tag.find_all(recursive=False)
+
+        # Delete everything before the table of contents, if one exists.
+        self.delete_content_before_toc()
+
         self.logger.info('End of processing .html file.')
 
     @staticmethod
@@ -563,6 +582,22 @@ class Book:
             return result, ind
         return ''
 
+    @staticmethod
+    def _is_empty_p_tag(tag):
+        if tag.name != 'p':
+            return False
+
+        temp_tag = copy(tag)
+        brs = temp_tag.find_all('br')
+        for br in brs:
+            br.decompose()
+
+        text = re.sub(r'\s+', '', temp_tag.text)
+        if text:
+            return False
+
+        return True
+
     def convert_to_json(self):
         """
         Function which convert list of html nodes to appropriate json structure.
@@ -572,17 +607,22 @@ class Book:
         ch_num = 0
 
         while ind < len(self.content):
+            res = {}
+
             if self.content[ind].name in self.SUPPORTED_HEADERS:
                 res, ind = self.header_to_json(ind)
             else:
                 chapter_title = f'Untitled chapter {ch_num}'
                 chapter = []
                 while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:
-                    chapter.append(self.format_html(str(self.content[ind])))
+                    if not self._is_empty_p_tag(self.content[ind]):
+                        chapter.append(self.format_html(str(self.content[ind])))
                     ind += 1
-                res = {chapter_title: ["".join(chapter)]}
-                ch_num += 1
-            json_strc.append(res)
+                if chapter:
+                    res = {chapter_title: ["".join(chapter)]}
+                    ch_num += 1
+            if res:
+                json_strc.append(res)
 
         self.content_dict = {
             "content": json_strc,
@@ -599,6 +639,13 @@ class Book:
         except Exception as exc:
             raise exc
 
+    def convert_from_html(self, logging_format):
+        self.configure_file_logger(__name__, logging_format=logging_format, filemode='w+')
+        self.read_html()
+        self.process_html()
+        self.convert_to_json()
+        self.write_json()
+
     def conversion(self, logging_format, filemode='w+'):
         self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
         self.log('Beginning of conversion from .docx to .json.')
@@ -607,10 +654,20 @@ class Book:
         self.convert_doc_to_html()
         self.check_output_directory()
         self.read_html()
-        self.clean_trash()
         self.process_html()
         self.set_generate_status()
         self.convert_to_json()
         self.write_json()
         self.send_json_content()
         self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
+
+
+if __name__ == "__main__":
+    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    file_path = pathlib.Path(os.path.join(folder_path, 'html/11/11.html'))
+    out_path = pathlib.Path(os.path.join(folder_path, 'json/11.json'))
+
+    logging_format = '%(asctime)s - %(levelname)s - %(message)s'
+
+    book = Book(file_path=file_path, output_path=out_path)
+    book.convert_from_html(logging_format=logging_format)