diff --git a/src/book.py b/src/book.py index 5d1670e..89c3ac4 100644 --- a/src/book.py +++ b/src/book.py @@ -1,16 +1,12 @@ -import argparse import codecs import json import logging import os import pathlib import re -from shutil import copyfile from bs4 import BeautifulSoup -# from src.header_detection import HeaderDetector - class Book: # Main constant values @@ -110,8 +106,9 @@ class Book: Method for convert .docx document to .html file. """ self.log(f'File - {self.file_path}.') - print(self.file_path) + print(f'{self.file_path}') self.log('Beginning of conversion from .docx to .html.') + try: f = open(self.file_path) f.close() @@ -131,7 +128,7 @@ class Book: self.logger.error('Conversion has gone wrong.') raise e - self.log("End of conversion from .docx to .html.") + self.log('End of conversion from .docx to .html.') self.log(f'Input file path after conversion: {self.file_path}.') def check_output_directory(self): @@ -140,15 +137,10 @@ class Book: self.output_path = f'json/{filename}' self.output_path = pathlib.Path(self.output_path) - self.logger.info(f'Output file path: {self.output_path}') + self.log(f'Output file path: {self.output_path}') pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True) - try: - self.output_path.touch(exist_ok=self.recreate) - except FileExistsError as e: - self.logger.error('Output file already exists! ' - 'Either change the name of output file or use --recreate switch.') - raise e + self.output_path.touch(exist_ok=True) def read_html(self): """ @@ -163,10 +155,6 @@ class Book: self.html_soup = BeautifulSoup(html_text, features='lxml') self.body_tag = self.html_soup.body - # head_tag = self.html_soup.head - # styles = parse_styles(head_tag.style) - # head_tag.decompose() - def _clean_tag(self, tag, attr_name, attr_value): """ Function to clean tags by its name and attribute value. @@ -307,13 +295,13 @@ class Book: """ Function to process
tags. """ - dls = self.body_tag.find_all("dl") + dls = self.body_tag.find_all('dl') for dl in dls: - pars = dl.find_all("p") + pars = dl.find_all('p') for p in pars: - p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote")) - new_div = BeautifulSoup(features="lxml").new_tag("div") + p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) + new_div = BeautifulSoup(features='lxml').new_tag('div') for p in pars: new_div.append(p.parent) dl.replaceWith(new_div) @@ -328,8 +316,8 @@ class Book: """ Function returns list of footnotes and delete them from html_soup. """ - footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc") - footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$")) + footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc') + footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) footnote_amt = len(footnote_ancors) assert footnote_amt == len(footnote_content) @@ -338,7 +326,7 @@ class Book: for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)): assert anc_tag['name'] == cont_tag.find('a')['href'][1:] - new_tag = BeautifulSoup(features="lxml").new_tag('sup') + new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' new_tag['data-id'] = i+1 new_tag['id'] = f'footnote-{i+1}' @@ -363,7 +351,7 @@ class Book: Funcction to processtag. Img should be sent Amazon S3 and then return new tag with valid link. For now images are moved to one folder. """ - imgs = self.body_tag.find_all("img") + imgs = self.body_tag.find_all('img') if len(imgs): new_path = pathlib.Path(f'json/img_{self.file_path.stem}/') @@ -518,6 +506,7 @@ class Book: result = {title: []} ch_content = [] ind += 1 + while ind < len(self.content): if self.content[ind].name in self.SUPPORTED_HEADERS: outline = int(re.sub(r"^h", "", self.content[ind].name)) @@ -535,6 +524,7 @@ class Book: # result[title].append(res) ch_content.append(res) ind += 1 + if ch_content: result[title].append("".join(ch_content)) return result, ind @@ -547,6 +537,7 @@ class Book: json_strc = [] ind = 0 ch_num = 0 + while ind < len(self.content): if self.content[ind].name in self.SUPPORTED_HEADERS: res, ind = self.header_to_json(ind) @@ -584,13 +575,3 @@ class Book: self.convert_to_json() self.write_json() self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') - - -if __name__ == '__main__': - logging_format = '%(asctime)s - %(levelname)s - %(message)s' - - book = Book(file_path="", recreate=True) - book.parse_args() - book.conversion(logging_format) - - print('Script has finished.')