Change paths to books

This commit is contained in:
Kiryl
2022-07-27 20:44:19 +03:00
parent 253c4ebe26
commit af466cbc27
9 changed files with 62 additions and 60 deletions

View File

@@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+",
def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
logger.info(f"Start processing book-{book_id}.")
try:
json_file_path = "json/9781614382264.json"
json_file_path = "books/json/9781614382264.json"
book = book_type(book_id=book_id, main_logger=logger, **params)
book.conversion_local(json_file_path)
except Exception as exc:

2
presets/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*
!.gitignore

View File

@@ -14,7 +14,7 @@ class DocxBook(BookSolver):
def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
super().__init__(book_id, access, main_logger)
self.book_type = 'docx'
self.book_type = "docx"
# critical section for occupying libreoffice by one thread
self.libre_locker: Event() = libre_locker
@@ -53,9 +53,9 @@ class DocxBook(BookSolver):
if __name__ == "__main__":
docx_file_path = '../../books/docx/music_inquiry.docx'
docx_file_path = "../../books/docx/music_inquiry.docx"
logger_object = BookLogger(
name='docx', book_id=docx_file_path.split('/')[-1])
name="docx", book_id=docx_file_path.split("/")[-1])
locker = Event()
locker.set()
@@ -70,5 +70,5 @@ if __name__ == "__main__":
content, footnotes, top_level_headers, logger_object)
content_dict = json_converter.convert_to_dict()
with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f:
json.dump(content_dict, f, ensure_ascii=False)

View File

@@ -9,58 +9,58 @@ def _clean_footnote_content(content):
def process_footnotes(body_tag):
"""Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc")
footnote_content = body_tag.find_all(
'div', id=re.compile(r'^sdfootnote\d+$'))
"div", id=re.compile(r"^sdfootnote\d+$"))
footnote_amt = len(footnote_anchors)
assert footnote_amt == len(footnote_content), \
'Something went wrong with footnotes after libre conversion'
"Something went wrong with footnotes after libre conversion"
footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
true_a_tag = cont_tag.find_all(
'a', class_=re.compile(r'^sdfootnote.+$'))[0]
"a", class_=re.compile(r"^sdfootnote.+$"))[0]
if true_a_tag.attrs.get('href') is None:
if true_a_tag.attrs.get("href") is None:
cont_tag.a.decompose()
continue
assert anc_tag['name'] == true_a_tag['href'][1:], \
'Something went wrong with footnotes after libre conversion'
assert anc_tag["name"] == true_a_tag["href"][1:], \
"Something went wrong with footnotes after libre conversion"
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
new_tag['id'] = f'footnote-{i + 1}'
new_tag.string = '*'
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
new_tag["class"] = "footnote-element"
new_tag["data-id"] = i + 1
new_tag["id"] = f"footnote-{i + 1}"
new_tag.string = "*"
anc_tag.replace_with(new_tag)
# extra digits in footnotes from documents downloaded from livecarta
a_text = true_a_tag.text
if len(cont_tag.find_all('p')):
sup = cont_tag.find_all('p')[0].find('sup')
if len(cont_tag.find_all("p")):
sup = cont_tag.find_all("p")[0].find("sup")
if sup and sup.text == a_text:
sup.decompose()
for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}):
tag_a.decompose()
# remove font-size
for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
style = span.get('style')
for span in cont_tag.find_all("span", {"style": re.compile("font-size")}):
style = span.get("style")
style = re.sub(r"font-size: \d+px", "", style)
if style == '':
del span.attrs['style']
if style == "":
del span.attrs["style"]
else:
span.attrs['style'] = style
span.attrs["style"] = style
unicode_string = ''
unicode_string = ""
for child in cont_tag.children:
if type(child) is NavigableString:
continue
if child.name == 'blockquote':
if child.name == "blockquote":
unicode_string += str(child)
else:
unicode_string += child.decode_contents()

View File

@@ -10,23 +10,23 @@ def process_images(access, html_path, book_id, body_tag):
For now images are moved to one folder.
"""
img_tags = body_tag.find_all('img')
img_tags = body_tag.find_all("img")
for img in img_tags:
img_name = img.attrs.get('src')
img_name = img.attrs.get("src")
# quick fix for bad links
if (len(img_name) >= 3) and img_name[:3] == '../':
if (len(img_name) >= 3) and img_name[:3] == "../":
img_name = img_name[3:]
img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}")
if access is not None:
link = access.send_image(img_path, doc_id=book_id)
img.attrs['src'] = link
img.attrs["src"] = link
else:
if img_tags.index(img) == 0:
folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'../books/json/img_{book_id}/'))
folder_path, f"../books/json/img_{book_id}/"))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / img_name
copyfile(img_path, new_img_path)

View File

@@ -29,7 +29,7 @@ class LibreHTML2JSONConverter:
cleaned text
"""
new_text = re.sub(r'([\n\t])', ' ', html_text)
new_text = re.sub(r"([\n\t])", " ", html_text)
return new_text
# TODO: rethink the function structure without indexes.
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter:
"""
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
title = str(self.content[ind])
title = title.replace(f'<{self.content[ind].name}>', '')
title = title.replace(f'</{self.content[ind].name}>', '')
title = re.sub(r'^\n', '', title)
title = title.replace(f"<{self.content[ind].name}>", "")
title = title.replace(f"</{self.content[ind].name}>", "")
title = re.sub(r"^\n", "", title)
# extract outline from tag
curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
result = {
'title': f'{title}',
'contents': [],
'sub_items': []
"title": f"{title}",
"contents": [],
"sub_items": []
}
ch_content = []
ind += 1
@@ -71,9 +71,9 @@ class LibreHTML2JSONConverter:
header_dict, ind = self.header_to_livecarta_chapter_item(
ind)
if ch_content:
result['contents'].append("".join(ch_content))
result["contents"].append("".join(ch_content))
ch_content = []
result['sub_items'].append(header_dict)
result["sub_items"].append(header_dict)
# - current h_i <= h_initial, end of recursion
else:
# return result, ind
@@ -85,21 +85,21 @@ class LibreHTML2JSONConverter:
ind += 1
if ch_content:
result['contents'].append("".join(ch_content))
result["contents"].append("".join(ch_content))
return result, ind
return ''
return ""
@staticmethod
def _is_empty_p_tag(tag):
if tag.name != 'p':
if tag.name != "p":
return False
temp_tag = copy(tag)
brs = temp_tag.find_all('br')
brs = temp_tag.find_all("br")
for br in brs:
br.decompose()
text = re.sub(r'\s+', '', temp_tag.text)
text = re.sub(r"\s+", "", temp_tag.text)
if text:
return False
@@ -117,7 +117,7 @@ class LibreHTML2JSONConverter:
res, ind = self.header_to_livecarta_chapter_item(ind)
else:
chapter_title = f'Untitled chapter {ch_num}'
chapter_title = f"Untitled chapter {ch_num}"
chapter = []
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]):
@@ -126,9 +126,9 @@ class LibreHTML2JSONConverter:
ind += 1
if chapter:
res = {
'title': chapter_title,
'contents': ["".join(chapter)],
'sub_items': []
"title": chapter_title,
"contents": ["".join(chapter)],
"sub_items": []
}
ch_num += 1
@@ -136,10 +136,10 @@ class LibreHTML2JSONConverter:
json_strc.append(res)
ch_amt += 1
self.logger_object.log(
f'Chapter {ch_amt} has been added to structure.')
f"Chapter {ch_amt} has been added to structure.")
except Exception as exc:
self.logger_object.log(
'Error has occurred while making json structure.', logging.ERROR)
"Error has occurred while making json structure.", logging.ERROR)
self.logger_object.log_error_to_main_log()
if self.book_api_status:
self.book_api_status.set_error()
@@ -148,10 +148,10 @@ class LibreHTML2JSONConverter:
# Add is_introduction field to json structure
# after deleting content before toc, some chapters can be deleted
if self.top_level_headers:
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
json_strc[0]['is_introduction'] = is_first_header_introduction
json_strc[0]["is_introduction"] = is_first_header_introduction
self.content_dict = {
"content": json_strc,

View File

@@ -633,7 +633,7 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../epub/9780763774134.epub"
epub_file_path = "../../books/epub/9780763774134.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])

View File

@@ -72,7 +72,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "docs-endnote":
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(
_replace_with_livecarta_anchor_tag(noteref_tag, i))
@@ -80,7 +80,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(
attrs={"role": "doc-backlink"}) or footnote_tag
attrs={"role": "docs-backlink"}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):

View File

@@ -16,7 +16,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images locally"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f"../json/img_{book_id}/"))
folder_path, f"../books/json/img_{book_id}/"))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / os.path.basename(img_file_path)