forked from LiveCarta/BookConverter
Change paths to books
This commit is contained in:
@@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+",
|
||||
def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
|
||||
logger.info(f"Start processing book-{book_id}.")
|
||||
try:
|
||||
json_file_path = "json/9781614382264.json"
|
||||
json_file_path = "books/json/9781614382264.json"
|
||||
book = book_type(book_id=book_id, main_logger=logger, **params)
|
||||
book.conversion_local(json_file_path)
|
||||
except Exception as exc:
|
||||
|
||||
2
presets/.gitignore
vendored
Normal file
2
presets/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
||||
@@ -14,7 +14,7 @@ class DocxBook(BookSolver):
|
||||
|
||||
def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
|
||||
super().__init__(book_id, access, main_logger)
|
||||
self.book_type = 'docx'
|
||||
self.book_type = "docx"
|
||||
# critical section for occupying libreoffice by one thread
|
||||
self.libre_locker: Event() = libre_locker
|
||||
|
||||
@@ -53,9 +53,9 @@ class DocxBook(BookSolver):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
docx_file_path = '../../books/docx/music_inquiry.docx'
|
||||
docx_file_path = "../../books/docx/music_inquiry.docx"
|
||||
logger_object = BookLogger(
|
||||
name='docx', book_id=docx_file_path.split('/')[-1])
|
||||
name="docx", book_id=docx_file_path.split("/")[-1])
|
||||
locker = Event()
|
||||
locker.set()
|
||||
|
||||
@@ -70,5 +70,5 @@ if __name__ == "__main__":
|
||||
content, footnotes, top_level_headers, logger_object)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
|
||||
with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f:
|
||||
json.dump(content_dict, f, ensure_ascii=False)
|
||||
|
||||
@@ -9,58 +9,58 @@ def _clean_footnote_content(content):
|
||||
|
||||
def process_footnotes(body_tag):
|
||||
"""Function returns list of footnotes and delete them from html_soup."""
|
||||
footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
|
||||
footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc")
|
||||
footnote_content = body_tag.find_all(
|
||||
'div', id=re.compile(r'^sdfootnote\d+$'))
|
||||
"div", id=re.compile(r"^sdfootnote\d+$"))
|
||||
footnote_amt = len(footnote_anchors)
|
||||
|
||||
assert footnote_amt == len(footnote_content), \
|
||||
'Something went wrong with footnotes after libre conversion'
|
||||
"Something went wrong with footnotes after libre conversion"
|
||||
|
||||
footnotes = []
|
||||
|
||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
|
||||
true_a_tag = cont_tag.find_all(
|
||||
'a', class_=re.compile(r'^sdfootnote.+$'))[0]
|
||||
"a", class_=re.compile(r"^sdfootnote.+$"))[0]
|
||||
|
||||
if true_a_tag.attrs.get('href') is None:
|
||||
if true_a_tag.attrs.get("href") is None:
|
||||
cont_tag.a.decompose()
|
||||
continue
|
||||
|
||||
assert anc_tag['name'] == true_a_tag['href'][1:], \
|
||||
'Something went wrong with footnotes after libre conversion'
|
||||
assert anc_tag["name"] == true_a_tag["href"][1:], \
|
||||
"Something went wrong with footnotes after libre conversion"
|
||||
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||
new_tag['class'] = 'footnote-element'
|
||||
new_tag['data-id'] = i + 1
|
||||
new_tag['id'] = f'footnote-{i + 1}'
|
||||
new_tag.string = '*'
|
||||
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
|
||||
new_tag["class"] = "footnote-element"
|
||||
new_tag["data-id"] = i + 1
|
||||
new_tag["id"] = f"footnote-{i + 1}"
|
||||
new_tag.string = "*"
|
||||
anc_tag.replace_with(new_tag)
|
||||
|
||||
# extra digits in footnotes from documents downloaded from livecarta
|
||||
a_text = true_a_tag.text
|
||||
if len(cont_tag.find_all('p')):
|
||||
sup = cont_tag.find_all('p')[0].find('sup')
|
||||
if len(cont_tag.find_all("p")):
|
||||
sup = cont_tag.find_all("p")[0].find("sup")
|
||||
if sup and sup.text == a_text:
|
||||
sup.decompose()
|
||||
|
||||
for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
|
||||
for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}):
|
||||
tag_a.decompose()
|
||||
|
||||
# remove font-size
|
||||
for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
|
||||
style = span.get('style')
|
||||
for span in cont_tag.find_all("span", {"style": re.compile("font-size")}):
|
||||
style = span.get("style")
|
||||
style = re.sub(r"font-size: \d+px", "", style)
|
||||
if style == '':
|
||||
del span.attrs['style']
|
||||
if style == "":
|
||||
del span.attrs["style"]
|
||||
else:
|
||||
span.attrs['style'] = style
|
||||
span.attrs["style"] = style
|
||||
|
||||
unicode_string = ''
|
||||
unicode_string = ""
|
||||
for child in cont_tag.children:
|
||||
if type(child) is NavigableString:
|
||||
continue
|
||||
if child.name == 'blockquote':
|
||||
if child.name == "blockquote":
|
||||
unicode_string += str(child)
|
||||
else:
|
||||
unicode_string += child.decode_contents()
|
||||
|
||||
@@ -10,23 +10,23 @@ def process_images(access, html_path, book_id, body_tag):
|
||||
For now images are moved to one folder.
|
||||
|
||||
"""
|
||||
img_tags = body_tag.find_all('img')
|
||||
img_tags = body_tag.find_all("img")
|
||||
for img in img_tags:
|
||||
img_name = img.attrs.get('src')
|
||||
img_name = img.attrs.get("src")
|
||||
# quick fix for bad links
|
||||
if (len(img_name) >= 3) and img_name[:3] == '../':
|
||||
if (len(img_name) >= 3) and img_name[:3] == "../":
|
||||
img_name = img_name[3:]
|
||||
img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
|
||||
img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}")
|
||||
|
||||
if access is not None:
|
||||
link = access.send_image(img_path, doc_id=book_id)
|
||||
img.attrs['src'] = link
|
||||
img.attrs["src"] = link
|
||||
else:
|
||||
if img_tags.index(img) == 0:
|
||||
folder_path = os.path.dirname(
|
||||
os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(
|
||||
folder_path, f'../books/json/img_{book_id}/'))
|
||||
folder_path, f"../books/json/img_{book_id}/"))
|
||||
new_path.mkdir(exist_ok=True)
|
||||
new_img_path = new_path / img_name
|
||||
copyfile(img_path, new_img_path)
|
||||
|
||||
@@ -29,7 +29,7 @@ class LibreHTML2JSONConverter:
|
||||
cleaned text
|
||||
|
||||
"""
|
||||
new_text = re.sub(r'([\n\t])', ' ', html_text)
|
||||
new_text = re.sub(r"([\n\t])", " ", html_text)
|
||||
return new_text
|
||||
|
||||
# TODO: rethink the function structure without indexes.
|
||||
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter:
|
||||
"""
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
title = str(self.content[ind])
|
||||
title = title.replace(f'<{self.content[ind].name}>', '')
|
||||
title = title.replace(f'</{self.content[ind].name}>', '')
|
||||
title = re.sub(r'^\n', '', title)
|
||||
title = title.replace(f"<{self.content[ind].name}>", "")
|
||||
title = title.replace(f"</{self.content[ind].name}>", "")
|
||||
title = re.sub(r"^\n", "", title)
|
||||
|
||||
# extract outline from tag
|
||||
curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||
result = {
|
||||
'title': f'{title}',
|
||||
'contents': [],
|
||||
'sub_items': []
|
||||
"title": f"{title}",
|
||||
"contents": [],
|
||||
"sub_items": []
|
||||
}
|
||||
ch_content = []
|
||||
ind += 1
|
||||
@@ -71,9 +71,9 @@ class LibreHTML2JSONConverter:
|
||||
header_dict, ind = self.header_to_livecarta_chapter_item(
|
||||
ind)
|
||||
if ch_content:
|
||||
result['contents'].append("".join(ch_content))
|
||||
result["contents"].append("".join(ch_content))
|
||||
ch_content = []
|
||||
result['sub_items'].append(header_dict)
|
||||
result["sub_items"].append(header_dict)
|
||||
# - current h_i <= h_initial, end of recursion
|
||||
else:
|
||||
# return result, ind
|
||||
@@ -85,21 +85,21 @@ class LibreHTML2JSONConverter:
|
||||
ind += 1
|
||||
|
||||
if ch_content:
|
||||
result['contents'].append("".join(ch_content))
|
||||
result["contents"].append("".join(ch_content))
|
||||
return result, ind
|
||||
return ''
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _is_empty_p_tag(tag):
|
||||
if tag.name != 'p':
|
||||
if tag.name != "p":
|
||||
return False
|
||||
|
||||
temp_tag = copy(tag)
|
||||
brs = temp_tag.find_all('br')
|
||||
brs = temp_tag.find_all("br")
|
||||
for br in brs:
|
||||
br.decompose()
|
||||
|
||||
text = re.sub(r'\s+', '', temp_tag.text)
|
||||
text = re.sub(r"\s+", "", temp_tag.text)
|
||||
if text:
|
||||
return False
|
||||
|
||||
@@ -117,7 +117,7 @@ class LibreHTML2JSONConverter:
|
||||
res, ind = self.header_to_livecarta_chapter_item(ind)
|
||||
|
||||
else:
|
||||
chapter_title = f'Untitled chapter {ch_num}'
|
||||
chapter_title = f"Untitled chapter {ch_num}"
|
||||
chapter = []
|
||||
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
if not self._is_empty_p_tag(self.content[ind]):
|
||||
@@ -126,9 +126,9 @@ class LibreHTML2JSONConverter:
|
||||
ind += 1
|
||||
if chapter:
|
||||
res = {
|
||||
'title': chapter_title,
|
||||
'contents': ["".join(chapter)],
|
||||
'sub_items': []
|
||||
"title": chapter_title,
|
||||
"contents": ["".join(chapter)],
|
||||
"sub_items": []
|
||||
}
|
||||
ch_num += 1
|
||||
|
||||
@@ -136,10 +136,10 @@ class LibreHTML2JSONConverter:
|
||||
json_strc.append(res)
|
||||
ch_amt += 1
|
||||
self.logger_object.log(
|
||||
f'Chapter {ch_amt} has been added to structure.')
|
||||
f"Chapter {ch_amt} has been added to structure.")
|
||||
except Exception as exc:
|
||||
self.logger_object.log(
|
||||
'Error has occurred while making json structure.', logging.ERROR)
|
||||
"Error has occurred while making json structure.", logging.ERROR)
|
||||
self.logger_object.log_error_to_main_log()
|
||||
if self.book_api_status:
|
||||
self.book_api_status.set_error()
|
||||
@@ -148,10 +148,10 @@ class LibreHTML2JSONConverter:
|
||||
# Add is_introduction field to json structure
|
||||
# after deleting content before toc, some chapters can be deleted
|
||||
if self.top_level_headers:
|
||||
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
|
||||
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
|
||||
same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
|
||||
is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
|
||||
|
||||
json_strc[0]['is_introduction'] = is_first_header_introduction
|
||||
json_strc[0]["is_introduction"] = is_first_header_introduction
|
||||
|
||||
self.content_dict = {
|
||||
"content": json_strc,
|
||||
|
||||
@@ -633,7 +633,7 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../epub/9780763774134.epub"
|
||||
epub_file_path = "../../books/epub/9780763774134.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
|
||||
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
|
||||
footnote_tag = expected_footnote_tags[0]
|
||||
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
|
||||
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
|
||||
footnote_tag = footnote_tag.parent
|
||||
new_noterefs_tags.append(
|
||||
_replace_with_livecarta_anchor_tag(noteref_tag, i))
|
||||
@@ -80,7 +80,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
# footnote_tag.decompose()
|
||||
footnotes.append(content)
|
||||
footnote_tag = footnote_tag.find(
|
||||
attrs={"role": "doc-backlink"}) or footnote_tag
|
||||
attrs={"role": "doc-backlink"}) or footnote_tag
|
||||
new_footnotes_tags.append(footnote_tag)
|
||||
|
||||
for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):
|
||||
|
||||
@@ -16,7 +16,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
|
||||
"""Function saves all images locally"""
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(
|
||||
folder_path, f"../json/img_{book_id}/"))
|
||||
folder_path, f"../books/json/img_{book_id}/"))
|
||||
new_path.mkdir(exist_ok=True)
|
||||
|
||||
new_img_path = new_path / os.path.basename(img_file_path)
|
||||
|
||||
Reference in New Issue
Block a user