updated book conversion

- new resulting JSON structure - fixed spelling - added assert messages
2020-06-03 12:40:08 +03:00
parent 35b8e9563c
commit bbe690bf80
1 changed files with 35 additions and 22 deletions
--- a/src/book.py
+++ b/src/book.py
@@ -33,7 +33,6 @@ class Book:
    SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
    HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
    def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
        self.book_id = book_id
        self.access = access
@@ -52,7 +51,7 @@ class Book:
        self.tables_amount = 0
        assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
-            "Length of headers doesn't match allowd levels."
+            "Length of headers doesn't match allowed levels."
    def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
                              logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
@@ -131,10 +130,10 @@ class Book:
            content = self.access.get_doc(self.book_id)
            self.log('File was received from server.')
            self.save_docx(content)
-        except FileNotFoundError as ferr:
+        except FileNotFoundError as f_err:
            self.log("Can't get docx from server.", logging.ERROR)
            self.log_error_to_main_log()
-            raise ferr
+            raise f_err
        except Exception as exc:
            raise exc
@@ -505,15 +504,17 @@ class Book:
        """
        Function returns list of footnotes and delete them from html_soup.
        """
-        footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
+        footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
        footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
-        footnote_amt = len(footnote_ancors)
+        footnote_amt = len(footnote_anchors)
-        assert footnote_amt == len(footnote_content)
+        assert footnote_amt == len(footnote_content),\
            'Some ting went wrong with footnotes after libra conversion'
        footnotes = []
-        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
+        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
-            assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
+            assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
                'Some ting went wrong with footnotes after libra conversion'
            new_tag = BeautifulSoup(features='lxml').new_tag('sup')
            new_tag['class'] = 'footnote-element'
@@ -540,18 +541,18 @@ class Book:
    def _process_images(self):
        """
-        Funcction to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
+        Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
        For now images are moved to one folder.
        """
-        imgs = self.body_tag.find_all('img')
+        img_tags = self.body_tag.find_all('img')
-        if len(imgs):
+        if len(img_tags):
            if self.access is None:
                folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
                new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
                new_path.mkdir(exist_ok=True)
-            for img in imgs:
+            for img in img_tags:
                img_name = img.attrs.get('src')
                img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
@@ -566,7 +567,7 @@ class Book:
                    copyfile(img_path, new_img_path)
                    img.attrs["src"] = str(new_img_path)
-        self.images = imgs
+        self.images = img_tags
    def _process_footer(self):
        """
@@ -606,7 +607,8 @@ class Book:
                    new_tag.string = text
            else:
                # rethink document structure when you have toc_links, other cases?
-                self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. '
+                self.logger.warning(f'Something went wrong in processing toc_links.'
                                    f' Check the structure of the file. '
                                    f'Tag name: {tag.name}')
    @staticmethod
@@ -673,9 +675,9 @@ class Book:
    def _mark_introduction_headers(self):
        """
        Function to find out:
-        what header shouldn't be numbered and can be treated as introductive chapter
+        what header shouldn't be numbered and can be treated as introduction chapter
-        Assume  header(s) to be introductive if:
+        Assume  header(s) to be introduction if:
            1. one header not numbered, before 1 numbered header
            2. it is first header from the top level list and it equals to 'introduction'
@@ -718,7 +720,6 @@ class Book:
                # if tag.name in ["h4", "h5", "h6"]:
                #     tag.name = "h3" # All the lower level headings will be transformed to h3 headings
                new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
                new_tag.string = title
                tag.replace_with(new_tag)
@@ -884,6 +885,14 @@ class Book:
                if self.content[ind].name in self.SUPPORTED_HEADERS:
                    res, ind = self.header_to_json(ind)
                    assert len(res.keys()) == 1, 'Something went wrong during header to json conversion.'
                    top_level_header = list(res.keys())[0]
                    res = {
                        'title': top_level_header,
                        'contents': res[top_level_header]
                    }
                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
@@ -892,8 +901,12 @@ class Book:
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1
                    if chapter:
-                        res = {chapter_title: ["".join(chapter)]}
+                        res = {
                            'title': chapter_title,
                            'contents': ["".join(chapter)]
                        }
                        ch_num += 1
                if res:
                    json_strc.append(res)
                    ch_amt += 1
@@ -906,7 +919,7 @@ class Book:
        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted
-        same_first_titles = self.top_level_headers[0]['title'] in json_strc[0].keys()
+        same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
        is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
        json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
@@ -973,8 +986,8 @@ class Book:
 if __name__ == "__main__":
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    file_path = pathlib.Path(os.path.join(folder_path, 'html/0/music_inquiry.html'))
+    file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
-    out_path = pathlib.Path(os.path.join(folder_path, 'json/music_inquiry.json'))
+    out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
    logging_format = '%(asctime)s - %(levelname)s - %(message)s'