minor fixes in book.py

2020-02-05 16:40:10 +03:00
parent d951682054
commit d13dab2d9e
1 changed files with 16 additions and 35 deletions
--- a/src/book.py
+++ b/src/book.py
@@ -1,16 +1,12 @@
-import argparse
 import codecs
 import json
 import logging
 import os
 import pathlib
 import re
-from shutil import copyfile

 from bs4 import BeautifulSoup

-# from src.header_detection import HeaderDetector
-

 class Book:
    # Main constant values
@@ -110,8 +106,9 @@ class Book:
        Method to convert a .docx document to an .html file.
        """
        self.log(f'File - {self.file_path}.')
-        print(self.file_path)
+        print(f'{self.file_path}')
        self.log('Beginning of conversion from .docx to .html.')
+
        try:
            f = open(self.file_path)
            f.close()
@@ -131,7 +128,7 @@ class Book:
            self.logger.error('Conversion has gone wrong.')
            raise e

-        self.log("End of conversion from .docx to .html.")
+        self.log('End of conversion from .docx to .html.')
        self.log(f'Input file path after conversion: {self.file_path}.')

    def check_output_directory(self):
@@ -140,15 +137,10 @@ class Book:
            self.output_path = f'json/{filename}'

        self.output_path = pathlib.Path(self.output_path)
-        self.logger.info(f'Output file path: {self.output_path}')
+        self.log(f'Output file path: {self.output_path}')

        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
-        try:
-            self.output_path.touch(exist_ok=self.recreate)
-        except FileExistsError as e:
-            self.logger.error('Output file already exists! '
-                              'Either change the name of output file or use --recreate switch.')
-            raise e
+        self.output_path.touch(exist_ok=True)

    def read_html(self):
        """
@@ -163,10 +155,6 @@ class Book:
        self.html_soup = BeautifulSoup(html_text, features='lxml')
        self.body_tag = self.html_soup.body

-        # head_tag = self.html_soup.head
-        # styles = parse_styles(head_tag.style)
-        # head_tag.decompose()
-
    def _clean_tag(self, tag, attr_name, attr_value):
        """
        Function to clean tags by its name and attribute value.
@@ -307,13 +295,13 @@ class Book:
        """
        Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
        """
-        dls = self.body_tag.find_all("dl")
+        dls = self.body_tag.find_all('dl')

        for dl in dls:
-            pars = dl.find_all("p")
+            pars = dl.find_all('p')
            for p in pars:
-                p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
-            new_div = BeautifulSoup(features="lxml").new_tag("div")
+                p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
+            new_div = BeautifulSoup(features='lxml').new_tag('div')
            for p in pars:
                new_div.append(p.parent)
            dl.replaceWith(new_div)
@@ -328,8 +316,8 @@ class Book:
        """
        Function returns the list of footnotes and deletes them from html_soup.
        """
-        footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc")
-        footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$"))
+        footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
+        footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
        footnote_amt = len(footnote_ancors)

        assert footnote_amt == len(footnote_content)
@@ -338,7 +326,7 @@ class Book:
        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
            assert anc_tag['name'] == cont_tag.find('a')['href'][1:]

-            new_tag = BeautifulSoup(features="lxml").new_tag('sup')
+            new_tag = BeautifulSoup(features='lxml').new_tag('sup')
            new_tag['class'] = 'footnote-element'
            new_tag['data-id'] = i+1
            new_tag['id'] = f'footnote-{i+1}'
@@ -363,7 +351,7 @@ class Book:
        Function to process <img> tags. Images should be sent to Amazon S3, and a new tag with a valid link should then be returned.
        For now images are moved to one folder.
        """
-        imgs = self.body_tag.find_all("img")
+        imgs = self.body_tag.find_all('img')

        if len(imgs):
            new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
@@ -518,6 +506,7 @@ class Book:
            result = {title: []}
            ch_content = []
            ind += 1
+
            while ind < len(self.content):
                if self.content[ind].name in self.SUPPORTED_HEADERS:
                    outline = int(re.sub(r"^h", "", self.content[ind].name))
@@ -535,6 +524,7 @@ class Book:
                    # result[title].append(res)
                    ch_content.append(res)
                    ind += 1
+
            if ch_content:
                result[title].append("".join(ch_content))
            return result, ind
@@ -547,6 +537,7 @@ class Book:
        json_strc = []
        ind = 0
        ch_num = 0
+
        while ind < len(self.content):
            if self.content[ind].name in self.SUPPORTED_HEADERS:
                res, ind = self.header_to_json(ind)
@@ -584,13 +575,3 @@ class Book:
        self.convert_to_json()
        self.write_json()
        self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
-
-
-if __name__ == '__main__':
-    logging_format = '%(asctime)s - %(levelname)s - %(message)s'
-
-    book = Book(file_path="", recreate=True)
-    book.parse_args()
-    book.conversion(logging_format)
-
-    print('Script has finished.')