updated book conversion

- new resulting JSON structure
- fixed spelling
- added assert messages
This commit is contained in:
shirshasa
2020-06-03 12:40:08 +03:00
parent 35b8e9563c
commit bbe690bf80

View File

@@ -33,7 +33,6 @@ class Book:
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"} SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None): def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
self.book_id = book_id self.book_id = book_id
self.access = access self.access = access
@@ -52,7 +51,7 @@ class Book:
self.tables_amount = 0 self.tables_amount = 0
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \ assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowd levels." "Length of headers doesn't match allowed levels."
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+', def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'): logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
@@ -131,10 +130,10 @@ class Book:
content = self.access.get_doc(self.book_id) content = self.access.get_doc(self.book_id)
self.log('File was received from server.') self.log('File was received from server.')
self.save_docx(content) self.save_docx(content)
except FileNotFoundError as ferr: except FileNotFoundError as f_err:
self.log("Can't get docx from server.", logging.ERROR) self.log("Can't get docx from server.", logging.ERROR)
self.log_error_to_main_log() self.log_error_to_main_log()
raise ferr raise f_err
except Exception as exc: except Exception as exc:
raise exc raise exc
@@ -505,15 +504,17 @@ class Book:
""" """
Function returns list of footnotes and delete them from html_soup. Function returns list of footnotes and delete them from html_soup.
""" """
footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc') footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_ancors) footnote_amt = len(footnote_anchors)
assert footnote_amt == len(footnote_content) assert footnote_amt == len(footnote_content),\
'Some ting went wrong with footnotes after libra conversion'
footnotes = [] footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)): for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
assert anc_tag['name'] == cont_tag.find('a')['href'][1:] assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
'Some ting went wrong with footnotes after libra conversion'
new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
@@ -540,18 +541,18 @@ class Book:
def _process_images(self): def _process_images(self):
""" """
Funcction to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link. Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
For now images are moved to one folder. For now images are moved to one folder.
""" """
imgs = self.body_tag.find_all('img') img_tags = self.body_tag.find_all('img')
if len(imgs): if len(img_tags):
if self.access is None: if self.access is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/')) new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
new_path.mkdir(exist_ok=True) new_path.mkdir(exist_ok=True)
for img in imgs: for img in img_tags:
img_name = img.attrs.get('src') img_name = img.attrs.get('src')
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}') img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
@@ -566,7 +567,7 @@ class Book:
copyfile(img_path, new_img_path) copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path) img.attrs["src"] = str(new_img_path)
self.images = imgs self.images = img_tags
def _process_footer(self): def _process_footer(self):
""" """
@@ -606,7 +607,8 @@ class Book:
new_tag.string = text new_tag.string = text
else: else:
# rethink document structure when you have toc_links, other cases? # rethink document structure when you have toc_links, other cases?
self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. ' self.logger.warning(f'Something went wrong in processing toc_links.'
f' Check the structure of the file. '
f'Tag name: {tag.name}') f'Tag name: {tag.name}')
@staticmethod @staticmethod
@@ -673,9 +675,9 @@ class Book:
def _mark_introduction_headers(self): def _mark_introduction_headers(self):
""" """
Function to find out: Function to find out:
what header shouldn't be numbered and can be treated as introductive chapter what header shouldn't be numbered and can be treated as introduction chapter
Assume header(s) to be introductive if: Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header 1. one header not numbered, before 1 numbered header
2. it is first header from the top level list and it equals to 'introduction' 2. it is first header from the top level list and it equals to 'introduction'
@@ -718,7 +720,6 @@ class Book:
# if tag.name in ["h4", "h5", "h6"]: # if tag.name in ["h4", "h5", "h6"]:
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings # tag.name = "h3" # All the lower level headings will be transformed to h3 headings
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name) new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
new_tag.string = title new_tag.string = title
tag.replace_with(new_tag) tag.replace_with(new_tag)
@@ -884,6 +885,14 @@ class Book:
if self.content[ind].name in self.SUPPORTED_HEADERS: if self.content[ind].name in self.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind) res, ind = self.header_to_json(ind)
assert len(res.keys()) == 1, 'Something went wrong during header to json conversion.'
top_level_header = list(res.keys())[0]
res = {
'title': top_level_header,
'contents': res[top_level_header]
}
else: else:
chapter_title = f'Untitled chapter {ch_num}' chapter_title = f'Untitled chapter {ch_num}'
chapter = [] chapter = []
@@ -892,8 +901,12 @@ class Book:
chapter.append(self.format_html(str(self.content[ind]))) chapter.append(self.format_html(str(self.content[ind])))
ind += 1 ind += 1
if chapter: if chapter:
res = {chapter_title: ["".join(chapter)]} res = {
'title': chapter_title,
'contents': ["".join(chapter)]
}
ch_num += 1 ch_num += 1
if res: if res:
json_strc.append(res) json_strc.append(res)
ch_amt += 1 ch_amt += 1
@@ -906,7 +919,7 @@ class Book:
# Add is_introduction field to json structure # Add is_introduction field to json structure
# after deleting content before toc, some chapters can be deleted # after deleting content before toc, some chapters can be deleted
same_first_titles = self.top_level_headers[0]['title'] in json_strc[0].keys() same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
@@ -973,8 +986,8 @@ class Book:
if __name__ == "__main__": if __name__ == "__main__":
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
file_path = pathlib.Path(os.path.join(folder_path, 'html/0/music_inquiry.html')) file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
out_path = pathlib.Path(os.path.join(folder_path, 'json/music_inquiry.json')) out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
logging_format = '%(asctime)s - %(levelname)s - %(message)s' logging_format = '%(asctime)s - %(levelname)s - %(message)s'