diff --git a/src/book.py b/src/book.py
index b4d120b..f457b8c 100644
--- a/src/book.py
+++ b/src/book.py
@@ -33,7 +33,6 @@ class Book:
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
-
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
self.book_id = book_id
self.access = access
@@ -52,7 +51,7 @@ class Book:
self.tables_amount = 0
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
- "Length of headers doesn't match allowd levels."
+ "Length of headers doesn't match allowed levels."
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
@@ -131,10 +130,10 @@ class Book:
content = self.access.get_doc(self.book_id)
self.log('File was received from server.')
self.save_docx(content)
- except FileNotFoundError as ferr:
+ except FileNotFoundError as f_err:
self.log("Can't get docx from server.", logging.ERROR)
self.log_error_to_main_log()
- raise ferr
+ raise f_err
except Exception as exc:
raise exc
@@ -505,15 +504,17 @@ class Book:
"""
Function returns list of footnotes and delete them from html_soup.
"""
- footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
+ footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
- footnote_amt = len(footnote_ancors)
+ footnote_amt = len(footnote_anchors)
- assert footnote_amt == len(footnote_content)
+ assert footnote_amt == len(footnote_content),\
+ 'Some ting went wrong with footnotes after libra conversion'
footnotes = []
- for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
- assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
+ for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
+ assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
+ 'Some ting went wrong with footnotes after libra conversion'
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
@@ -540,18 +541,18 @@ class Book:
def _process_images(self):
"""
- Funcction to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
+ Function to process <img> tag. Img should be sent to Amazon S3 and then return new tag with valid link.
For now images are moved to one folder.
"""
- imgs = self.body_tag.find_all('img')
+ img_tags = self.body_tag.find_all('img')
- if len(imgs):
+ if len(img_tags):
if self.access is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
new_path.mkdir(exist_ok=True)
- for img in imgs:
+ for img in img_tags:
img_name = img.attrs.get('src')
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
@@ -566,7 +567,7 @@ class Book:
copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path)
- self.images = imgs
+ self.images = img_tags
def _process_footer(self):
"""
@@ -606,7 +607,8 @@ class Book:
new_tag.string = text
else:
# rethink document structure when you have toc_links, other cases?
- self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. '
+ self.logger.warning(f'Something went wrong in processing toc_links.'
+ f' Check the structure of the file. '
f'Tag name: {tag.name}')
@staticmethod
@@ -673,9 +675,9 @@ class Book:
def _mark_introduction_headers(self):
"""
Function to find out:
- what header shouldn't be numbered and can be treated as introductive chapter
+ which headers shouldn't be numbered and can be treated as an introductory chapter
- Assume header(s) to be introductive if:
+ Assume header(s) to be introductory if:
1. one header not numbered, before 1 numbered header
2. it is first header from the top level list and it equals to 'introduction'
@@ -718,7 +720,6 @@ class Book:
# if tag.name in ["h4", "h5", "h6"]:
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
-
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
new_tag.string = title
tag.replace_with(new_tag)
@@ -884,6 +885,14 @@ class Book:
if self.content[ind].name in self.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind)
+
+ assert len(res.keys()) == 1, 'Something went wrong during header to json conversion.'
+
+ top_level_header = list(res.keys())[0]
+ res = {
+ 'title': top_level_header,
+ 'contents': res[top_level_header]
+ }
else:
chapter_title = f'Untitled chapter {ch_num}'
chapter = []
@@ -892,8 +901,12 @@ class Book:
chapter.append(self.format_html(str(self.content[ind])))
ind += 1
if chapter:
- res = {chapter_title: ["".join(chapter)]}
+ res = {
+ 'title': chapter_title,
+ 'contents': ["".join(chapter)]
+ }
ch_num += 1
+
if res:
json_strc.append(res)
ch_amt += 1
@@ -906,7 +919,7 @@ class Book:
# Add is_introduction field to json structure
# after deleting content before toc, some chapters can be deleted
- same_first_titles = self.top_level_headers[0]['title'] in json_strc[0].keys()
+ same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
@@ -973,8 +986,8 @@ class Book:
if __name__ == "__main__":
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- file_path = pathlib.Path(os.path.join(folder_path, 'html/0/music_inquiry.html'))
- out_path = pathlib.Path(os.path.join(folder_path, 'json/music_inquiry.json'))
+ file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
+ out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
logging_format = '%(asctime)s - %(levelname)s - %(message)s'