diff --git a/src/book.py b/src/book.py index b4d120b..f457b8c 100644 --- a/src/book.py +++ b/src/book.py @@ -33,7 +33,6 @@ class Book: SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"} HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} - def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None): self.book_id = book_id self.access = access @@ -52,7 +51,7 @@ class Book: self.tables_amount = 0 assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \ - "Length of headers doesn't match allowd levels." + "Length of headers doesn't match allowed levels." def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+', logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'): @@ -131,10 +130,10 @@ class Book: content = self.access.get_doc(self.book_id) self.log('File was received from server.') self.save_docx(content) - except FileNotFoundError as ferr: + except FileNotFoundError as f_err: self.log("Can't get docx from server.", logging.ERROR) self.log_error_to_main_log() - raise ferr + raise f_err except Exception as exc: raise exc @@ -505,15 +504,17 @@ class Book: """ Function returns list of footnotes and delete them from html_soup. """ - footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc') + footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) - footnote_amt = len(footnote_ancors) + footnote_amt = len(footnote_anchors) - assert footnote_amt == len(footnote_content) + assert footnote_amt == len(footnote_content),\ + 'Some ting went wrong with footnotes after libra conversion' footnotes = [] - for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)): - assert anc_tag['name'] == cont_tag.find('a')['href'][1:] + for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): + assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \ + 'Some ting went wrong with footnotes after libra conversion' new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' @@ -540,18 +541,18 @@ class Book: def _process_images(self): """ - Funcction to process tag. Img should be sent Amazon S3 and then return new tag with valid link. + Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. For now images are moved to one folder. """ - imgs = self.body_tag.find_all('img') + img_tags = self.body_tag.find_all('img') - if len(imgs): + if len(img_tags): if self.access is None: folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/')) new_path.mkdir(exist_ok=True) - for img in imgs: + for img in img_tags: img_name = img.attrs.get('src') img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}') @@ -566,7 +567,7 @@ class Book: copyfile(img_path, new_img_path) img.attrs["src"] = str(new_img_path) - self.images = imgs + self.images = img_tags def _process_footer(self): """ @@ -606,7 +607,8 @@ class Book: new_tag.string = text else: # rethink document structure when you have toc_links, other cases? - self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. ' + self.logger.warning(f'Something went wrong in processing toc_links.' + f' Check the structure of the file. ' f'Tag name: {tag.name}') @staticmethod @@ -673,9 +675,9 @@ class Book: def _mark_introduction_headers(self): """ Function to find out: - what header shouldn't be numbered and can be treated as introductive chapter + what header shouldn't be numbered and can be treated as introduction chapter - Assume header(s) to be introductive if: + Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header 2. it is first header from the top level list and it equals to 'introduction' @@ -718,7 +720,6 @@ class Book: # if tag.name in ["h4", "h5", "h6"]: # tag.name = "h3" # All the lower level headings will be transformed to h3 headings - new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name) new_tag.string = title tag.replace_with(new_tag) @@ -884,6 +885,14 @@ class Book: if self.content[ind].name in self.SUPPORTED_HEADERS: res, ind = self.header_to_json(ind) + + assert len(res.keys()) == 1, 'Something went wrong during header to json conversion.' + + top_level_header = list(res.keys())[0] + res = { + 'title': top_level_header, + 'contents': res[top_level_header] + } else: chapter_title = f'Untitled chapter {ch_num}' chapter = [] @@ -892,8 +901,12 @@ class Book: chapter.append(self.format_html(str(self.content[ind]))) ind += 1 if chapter: - res = {chapter_title: ["".join(chapter)]} + res = { + 'title': chapter_title, + 'contents': ["".join(chapter)] + } ch_num += 1 + if res: json_strc.append(res) ch_amt += 1 @@ -906,7 +919,7 @@ class Book: # Add is_introduction field to json structure # after deleting content before toc, some chapters can be deleted - same_first_titles = self.top_level_headers[0]['title'] in json_strc[0].keys() + same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title'] is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles @@ -973,8 +986,8 @@ class Book: if __name__ == "__main__": folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - file_path = pathlib.Path(os.path.join(folder_path, 'html/0/music_inquiry.html')) - out_path = pathlib.Path(os.path.join(folder_path, 'json/music_inquiry.json')) + file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html')) + out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json')) logging_format = '%(asctime)s - %(levelname)s - %(message)s'