forked from LiveCarta/BookConverter
updated book conversion
- new resulting JSON structure - fixed spelling - added assert messages
This commit is contained in:
57
src/book.py
57
src/book.py
@@ -33,7 +33,6 @@ class Book:
|
|||||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
|
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4"}
|
||||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
||||||
self.book_id = book_id
|
self.book_id = book_id
|
||||||
self.access = access
|
self.access = access
|
||||||
@@ -52,7 +51,7 @@ class Book:
|
|||||||
self.tables_amount = 0
|
self.tables_amount = 0
|
||||||
|
|
||||||
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
assert self.SUPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
||||||
"Length of headers doesn't match allowd levels."
|
"Length of headers doesn't match allowed levels."
|
||||||
|
|
||||||
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
||||||
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
||||||
@@ -131,10 +130,10 @@ class Book:
|
|||||||
content = self.access.get_doc(self.book_id)
|
content = self.access.get_doc(self.book_id)
|
||||||
self.log('File was received from server.')
|
self.log('File was received from server.')
|
||||||
self.save_docx(content)
|
self.save_docx(content)
|
||||||
except FileNotFoundError as ferr:
|
except FileNotFoundError as f_err:
|
||||||
self.log("Can't get docx from server.", logging.ERROR)
|
self.log("Can't get docx from server.", logging.ERROR)
|
||||||
self.log_error_to_main_log()
|
self.log_error_to_main_log()
|
||||||
raise ferr
|
raise f_err
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
@@ -505,15 +504,17 @@ class Book:
|
|||||||
"""
|
"""
|
||||||
Function returns list of footnotes and delete them from html_soup.
|
Function returns list of footnotes and delete them from html_soup.
|
||||||
"""
|
"""
|
||||||
footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
||||||
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
|
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
|
||||||
footnote_amt = len(footnote_ancors)
|
footnote_amt = len(footnote_anchors)
|
||||||
|
|
||||||
assert footnote_amt == len(footnote_content)
|
assert footnote_amt == len(footnote_content),\
|
||||||
|
'Some ting went wrong with footnotes after libra conversion'
|
||||||
|
|
||||||
footnotes = []
|
footnotes = []
|
||||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
|
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
|
||||||
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
|
assert anc_tag['name'] == cont_tag.find('a')['href'][1:], \
|
||||||
|
'Some ting went wrong with footnotes after libra conversion'
|
||||||
|
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||||
new_tag['class'] = 'footnote-element'
|
new_tag['class'] = 'footnote-element'
|
||||||
@@ -540,18 +541,18 @@ class Book:
|
|||||||
|
|
||||||
def _process_images(self):
|
def _process_images(self):
|
||||||
"""
|
"""
|
||||||
Funcction to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
|
Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
|
||||||
For now images are moved to one folder.
|
For now images are moved to one folder.
|
||||||
"""
|
"""
|
||||||
imgs = self.body_tag.find_all('img')
|
img_tags = self.body_tag.find_all('img')
|
||||||
|
|
||||||
if len(imgs):
|
if len(img_tags):
|
||||||
if self.access is None:
|
if self.access is None:
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
|
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
|
||||||
new_path.mkdir(exist_ok=True)
|
new_path.mkdir(exist_ok=True)
|
||||||
|
|
||||||
for img in imgs:
|
for img in img_tags:
|
||||||
img_name = img.attrs.get('src')
|
img_name = img.attrs.get('src')
|
||||||
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
||||||
|
|
||||||
@@ -566,7 +567,7 @@ class Book:
|
|||||||
copyfile(img_path, new_img_path)
|
copyfile(img_path, new_img_path)
|
||||||
img.attrs["src"] = str(new_img_path)
|
img.attrs["src"] = str(new_img_path)
|
||||||
|
|
||||||
self.images = imgs
|
self.images = img_tags
|
||||||
|
|
||||||
def _process_footer(self):
|
def _process_footer(self):
|
||||||
"""
|
"""
|
||||||
@@ -606,7 +607,8 @@ class Book:
|
|||||||
new_tag.string = text
|
new_tag.string = text
|
||||||
else:
|
else:
|
||||||
# rethink document structure when you have toc_links, other cases?
|
# rethink document structure when you have toc_links, other cases?
|
||||||
self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. '
|
self.logger.warning(f'Something went wrong in processing toc_links.'
|
||||||
|
f' Check the structure of the file. '
|
||||||
f'Tag name: {tag.name}')
|
f'Tag name: {tag.name}')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -673,9 +675,9 @@ class Book:
|
|||||||
def _mark_introduction_headers(self):
|
def _mark_introduction_headers(self):
|
||||||
"""
|
"""
|
||||||
Function to find out:
|
Function to find out:
|
||||||
what header shouldn't be numbered and can be treated as introductive chapter
|
what header shouldn't be numbered and can be treated as introduction chapter
|
||||||
|
|
||||||
Assume header(s) to be introductive if:
|
Assume header(s) to be introduction if:
|
||||||
1. one header not numbered, before 1 numbered header
|
1. one header not numbered, before 1 numbered header
|
||||||
2. it is first header from the top level list and it equals to 'introduction'
|
2. it is first header from the top level list and it equals to 'introduction'
|
||||||
|
|
||||||
@@ -718,7 +720,6 @@ class Book:
|
|||||||
# if tag.name in ["h4", "h5", "h6"]:
|
# if tag.name in ["h4", "h5", "h6"]:
|
||||||
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
||||||
|
|
||||||
|
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
||||||
new_tag.string = title
|
new_tag.string = title
|
||||||
tag.replace_with(new_tag)
|
tag.replace_with(new_tag)
|
||||||
@@ -884,6 +885,14 @@ class Book:
|
|||||||
|
|
||||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||||
res, ind = self.header_to_json(ind)
|
res, ind = self.header_to_json(ind)
|
||||||
|
|
||||||
|
assert len(res.keys()) == 1, 'Something went wrong during header to json conversion.'
|
||||||
|
|
||||||
|
top_level_header = list(res.keys())[0]
|
||||||
|
res = {
|
||||||
|
'title': top_level_header,
|
||||||
|
'contents': res[top_level_header]
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
chapter_title = f'Untitled chapter {ch_num}'
|
chapter_title = f'Untitled chapter {ch_num}'
|
||||||
chapter = []
|
chapter = []
|
||||||
@@ -892,8 +901,12 @@ class Book:
|
|||||||
chapter.append(self.format_html(str(self.content[ind])))
|
chapter.append(self.format_html(str(self.content[ind])))
|
||||||
ind += 1
|
ind += 1
|
||||||
if chapter:
|
if chapter:
|
||||||
res = {chapter_title: ["".join(chapter)]}
|
res = {
|
||||||
|
'title': chapter_title,
|
||||||
|
'contents': ["".join(chapter)]
|
||||||
|
}
|
||||||
ch_num += 1
|
ch_num += 1
|
||||||
|
|
||||||
if res:
|
if res:
|
||||||
json_strc.append(res)
|
json_strc.append(res)
|
||||||
ch_amt += 1
|
ch_amt += 1
|
||||||
@@ -906,7 +919,7 @@ class Book:
|
|||||||
|
|
||||||
# Add is_introduction field to json structure
|
# Add is_introduction field to json structure
|
||||||
# after deleting content before toc, some chapters can be deleted
|
# after deleting content before toc, some chapters can be deleted
|
||||||
same_first_titles = self.top_level_headers[0]['title'] in json_strc[0].keys()
|
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
|
||||||
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
|
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
|
||||||
|
|
||||||
json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
|
json_strc[0]['is_introduction'] = is_first_header_introduction and same_first_titles
|
||||||
@@ -973,8 +986,8 @@ class Book:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
file_path = pathlib.Path(os.path.join(folder_path, 'html/0/music_inquiry.html'))
|
file_path = pathlib.Path(os.path.join(folder_path, 'html/82/82.html'))
|
||||||
out_path = pathlib.Path(os.path.join(folder_path, 'json/music_inquiry.json'))
|
out_path = pathlib.Path(os.path.join(folder_path, 'json/82.json'))
|
||||||
|
|
||||||
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
|
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user