forked from LiveCarta/BookConverter
updated book conversion
- new resulting JSON structure - fixed spelling - added assert messages
This commit is contained in:
57
src/book.py
57
src/book.py
@@ -33,7 +33,6 @@ class Book:
|
||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
|
||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||
|
||||
|
||||
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
||||
self.book_id = book_id
|
||||
self.access = access
|
||||
@@ -52,7 +51,7 @@ class Book:
|
||||
self.tables_amount = 0
|
||||
|
||||
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
||||
"Length of headers doesn't match allowd levels."
|
||||
"Length of headers doesn't match allowed levels."
|
||||
|
||||
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
||||
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
||||
@@ -131,10 +130,10 @@ class Book:
|
||||
content = self.access.get_doc(self.book_id)
|
||||
self.log('File was received from server.')
|
||||
self.save_docx(content)
|
||||
except FileNotFoundError as ferr:
|
||||
except FileNotFoundError as f_err:
|
||||
self.log("Can't get docx from server.", logging.ERROR)
|
||||
self.log_error_to_main_log()
|
||||
raise ferr
|
||||
raise f_err
|
||||
except Exception as exc:
|
||||
raise exc
|
||||
|
||||
@@ -505,15 +504,17 @@ class Book:
|
||||
"""
|
||||
Function returns list of footnotes and delete them from html_soup.
|
||||
"""
|
||||
footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
||||
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
||||
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
|
||||
footnote_amt = len(footnote_ancors)
|
||||
footnote_amt = len(footnote_anchors)
|
||||
|
||||
assert footnote_amt == len(footnote_content)
|
||||
assert footnote_amt == len(footnote_content),\
|
||||
'Some ting went wrong with footnotes after libra conversion'
|
||||
|
||||
footnotes = []
|
||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
|
||||
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
|
||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
|
||||
assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
|
||||
'Some ting went wrong with footnotes after libra conversion'
|
||||
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||
new_tag['class'] = 'footnote-element'
|
||||
@@ -540,18 +541,18 @@ class Book:
|
||||
|
||||
def _process_images(self):
|
||||
"""
|
||||
Funcction to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
|
||||
Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
|
||||
For now images are moved to one folder.
|
||||
"""
|
||||
imgs = self.body_tag.find_all('img')
|
||||
img_tags = self.body_tag.find_all('img')
|
||||
|
||||
if len(imgs):
|
||||
if len(img_tags):
|
||||
if self.access is None:
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
|
||||
new_path.mkdir(exist_ok=True)
|
||||
|
||||
for img in imgs:
|
||||
for img in img_tags:
|
||||
img_name = img.attrs.get('src')
|
||||
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
||||
|
||||
@@ -566,7 +567,7 @@ class Book:
|
||||
copyfile(img_path, new_img_path)
|
||||
img.attrs["src"] = str(new_img_path)
|
||||
|
||||
self.images = imgs
|
||||
self.images = img_tags
|
||||
|
||||
def _process_footer(self):
|
||||
"""
|
||||
@@ -606,7 +607,8 @@ class Book:
|
||||
new_tag.string = text
|
||||
else:
|
||||
# rethink document structure when you have toc_links, other cases?
|
||||
self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. '
|
||||
self.logger.warning(f'Something went wrong in processing toc_links.'
|
||||
f' Check the structure of the file. '
|
||||
f'Tag name: {tag.name}')
|
||||
|
||||
@staticmethod
|
||||
@@ -673,9 +675,9 @@ class Book:
|
||||
def _mark_introduction_headers(self):
|
||||
"""
|
||||
Function to find out:
|
||||
what header shouldn't be numbered and can be treated as introductive chapter
|
||||
what header shouldn't be numbered and can be treated as introduction chapter
|
||||
|
||||
Assume header(s) to be introductive if:
|
||||
Assume header(s) to be introduction if:
|
||||
1. one header not numbered, before 1 numbered header
|
||||
2. it is first header from the top level list and it equals to 'introduction'
|
||||
|
||||
@@ -718,7 +720,6 @@ class Book:
|
||||
# if tag.name in ["h4", "h5", "h6"]:
|
||||
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
||||
|
||||
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
||||
new_tag.string = title
|
||||
tag.replace_with(new_tag)
|
||||
@@ -884,6 +885,14 @@ class Book:
|
||||
|
||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||
res, ind = self.header_to_json(ind)
|
||||
|
||||
assert len(res.keys()) == 1, 'Something went wrong during header to json conversion.'
|
||||
|
||||
top_level_header = list(res.keys())[0]
|
||||
res = {
|
||||
'title': top_level_header,
|
||||
'contents': res[top_level_header]
|
||||
}
|
||||
else:
|
||||
chapter_title = f'Untitled chapter {ch_num}'
|
||||
chapter = []
|
||||
@@ -892,8 +901,12 @@ class Book:
|
||||
chapter.append(self.format_html(str(self.content[ind])))
|
||||
ind += 1
|
||||
if chapter:
|
||||
res = {chapter_title: ["".join(chapter)]}
|
||||
res = {
|
||||
'title': chapter_title,
|
||||
'contents': ["".join(chapter)]
|
||||
}
|
||||
ch_num += 1
|
||||
|
||||
if res:
|
||||
json_strc.append(res)
|
||||
ch_amt += 1
|
||||
@@ -906,7 +919,7 @@ class Book:
|
||||
|
||||
# Add is_introduction field to json structure
|
||||
# after deleting content before toc, some chapters can be deleted
|
||||
same_first_titles = self.top_level_headers[0]['title'] in json_strc[0].keys()
|
||||
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
|
||||
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
|
||||
|
||||
json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
|
||||
@@ -973,8 +986,8 @@ class Book:
|
||||
|
||||
if __name__ == "__main__":
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
file_path = pathlib.Path(os.path.join(folder_path, 'html/0/music_inquiry.html'))
|
||||
out_path = pathlib.Path(os.path.join(folder_path, 'json/music_inquiry.json'))
|
||||
file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
|
||||
out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
|
||||
|
||||
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user