forked from LiveCarta/BookConverter
epub converter: add headers and image processing
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
|
|
||||||
@@ -8,17 +9,31 @@ import ebooklib
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from ebooklib.epub import Link, Section
|
from ebooklib.epub import Link, Section
|
||||||
|
from ebooklib.utils import debug
|
||||||
|
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
|
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
||||||
|
preprocess_image
|
||||||
|
|
||||||
|
|
||||||
|
# todo: https://docs.python.org/3/howto/unicode.html
|
||||||
|
|
||||||
|
|
||||||
class EpubBookAdapter:
|
class EpubBookAdapter:
|
||||||
def __init__(self, file):
|
def __init__(self, file):
|
||||||
self.file = file
|
self.file = file
|
||||||
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
||||||
|
self.href2img_bytes = {}
|
||||||
|
|
||||||
|
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
|
||||||
|
debug(x)
|
||||||
|
file_name = x.file_name
|
||||||
|
content = x.content
|
||||||
|
# todo: check how file path is count in lib
|
||||||
|
self.href2img_bytes[file_name] = content
|
||||||
|
|
||||||
self.id_anchor_exist_in_nav_points = False
|
self.id_anchor_exist_in_nav_points = False
|
||||||
self.href2soup_html = self.build_href2soup_content()
|
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||||
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
|
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
|
||||||
# если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
|
# если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
|
||||||
self.href2ids = defaultdict(list)
|
self.href2ids = defaultdict(list)
|
||||||
@@ -164,11 +179,13 @@ class EpubBookAdapter:
|
|||||||
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
|
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
|
||||||
title = node.title
|
title = node.title
|
||||||
if node.id:
|
if node.id:
|
||||||
content = self.id_anchor2soup[(node.href, node.id)]
|
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
|
||||||
else:
|
else:
|
||||||
content = self.href2soup_html[node.href]
|
content: BeautifulSoup = self.href2soup_html[node.href]
|
||||||
content_preprocessed = str(content) # todo self.preprocess_html(content, node.id)
|
|
||||||
content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
|
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
|
||||||
|
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
|
||||||
|
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
# warning! not EpubHtmlItems won;t be added to chapter
|
# warning! non-EpubHtmlItems won't be added to chapter
|
||||||
if self.adjacency_list.get(node):
|
if self.adjacency_list.get(node):
|
||||||
@@ -196,9 +213,6 @@ if __name__ == "__main__":
|
|||||||
"content": l
|
"content": l
|
||||||
}
|
}
|
||||||
|
|
||||||
output_file = open('output.out', 'w')
|
|
||||||
output_file.write(str(tmp))
|
|
||||||
|
|
||||||
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
||||||
json.dump(tmp, f, ensure_ascii=False)
|
json.dump(tmp, f, ensure_ascii=False)
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,47 @@
|
|||||||
|
import os
|
||||||
|
import pathlib
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
|
||||||
|
from src.access import Access
|
||||||
|
|
||||||
def preprocess_image():
|
|
||||||
pass
|
def save_image_locally(img_file_path, img_content, book_id):
    """Write an image's bytes into the project-local ``json/img_<book_id>/`` folder.

    Parameters:
        img_file_path: path of the image inside the epub; only its basename is kept.
        img_content: raw image bytes.
        book_id: identifier used to name the per-book image folder.

    Returns:
        pathlib.Path of the file that was written.
    """
    # Project root = parent of the directory containing this module.
    project_root = pathlib.Path(__file__).resolve().parent.parent
    target_dir = project_root / 'json' / f'img_{book_id}'
    # parents=True also creates the intermediate 'json' folder when missing;
    # plain mkdir(exist_ok=True) raised FileNotFoundError in that case.
    target_dir.mkdir(parents=True, exist_ok=True)

    new_img_path = target_dir / os.path.basename(img_file_path)
    # write_bytes opens and closes the file itself — no leaked handle on error
    # (the original open(..., 'wb+') was never closed if write() raised).
    new_img_path.write_bytes(img_content)

    return new_img_path
|
||||||
|
|
||||||
|
|
||||||
|
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload an image's bytes through the given Access client.

    Returns the link produced by the upload backend.
    """
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_image(body_tag, href2img_content, path_to_html, access=None, book_id='book_id'):
    """Rewrite every ``<img src=...>`` in *body_tag* to point at a stored copy.

    For each image the epub-relative path is resolved against the HTML file's
    folder, the bytes are looked up in *href2img_content* (built from the epub
    manifest), and the image is saved either to AWS (when *access* is given)
    or locally; the tag's ``src`` attribute is updated in place.

    Parameters:
        body_tag: BeautifulSoup tag whose <img> descendants are rewritten (mutated).
        href2img_content: mapping of manifest-relative image path -> bytes.
        path_to_html: epub-relative path of the HTML file containing *body_tag*.
        access: optional upload client; None means save to the local filesystem.
        book_id: identifier forwarded to the storage helpers (defaults to the
            previously hard-coded 'book_id' placeholder for compatibility).

    Raises:
        AssertionError: when an <img> references a file absent from the manifest.
    """
    # Loop-invariant: the containing folder depends only on path_to_html.
    html_folder = os.path.dirname(path_to_html)

    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        # Image src is relative to the HTML file; normalize to a manifest key.
        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))

        # Explicit raise instead of `assert`, which disappears under `python -O`;
        # AssertionError is kept so existing callers' except clauses still match.
        if path_to_img_from_root not in href2img_content:
            raise AssertionError(
                f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.')

        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            new_location = save_image_to_aws(access, path_to_img_from_root, img_content, book_id)
        else:
            new_location = save_image_locally(path_to_img_from_root, img_content, book_id)

        img.attrs['src'] = str(new_location)
|
||||||
|
|
||||||
|
|
||||||
def preprocess_table():
|
def preprocess_table():
|
||||||
@@ -15,8 +52,12 @@ def preprocess_quote():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def clean_heading_in_content():
|
def clean_heading_in_content(content, title: str):
    # Rule #1 for heading removal: if the first non-whitespace top-level child
    # of the chapter body has exactly the navigation title as its text, drop it
    # so the heading is not rendered twice.
    # NOTE(review): structure reconstructed from a diff view — assumed the
    # `break` fires after inspecting the first "real" child; confirm in VCS.
    for child in content.contents:
        # Skip children whose text is empty once \n, \t and nbsp are stripped.
        if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
            if title == child.text:
                child.extract()  # remove the duplicated heading element
            # Only the first meaningful child is ever considered.
            break
|
||||||
|
|
||||||
|
|
||||||
def preprocess_footnotes():
|
def preprocess_footnotes():
|
||||||
@@ -28,8 +69,18 @@ def add_fonts():
|
|||||||
|
|
||||||
|
|
||||||
def unwrap_structural_tags(body_tag):
|
def unwrap_structural_tags(body_tag):
|
||||||
|
structural_tags_names = [
|
||||||
|
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
|
||||||
|
'figure', 'footer', 'iframe', 'span'
|
||||||
|
]
|
||||||
|
|
||||||
divs = body_tag.find_all("div")
|
divs = body_tag.find_all("div")
|
||||||
for div in divs:
|
for div in divs:
|
||||||
|
if div.contents:
|
||||||
|
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
|
||||||
|
if all(is_not_struct_tag):
|
||||||
|
div.name = 'p'
|
||||||
|
continue
|
||||||
div.unwrap()
|
div.unwrap()
|
||||||
|
|
||||||
secs = body_tag.find_all("section")
|
secs = body_tag.find_all("section")
|
||||||
@@ -48,19 +99,18 @@ def unwrap_structural_tags(body_tag):
|
|||||||
for s in articles:
|
for s in articles:
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
# articles = body_tag.find_all("html")
|
articles = body_tag.find_all("html")
|
||||||
# for s in articles:
|
for s in articles:
|
||||||
# s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
spans = body_tag.find_all("span")
|
spans = body_tag.find_all("span")
|
||||||
# not all cases, if span has <p>s and NavigableString, it won't unwrap
|
# not all cases, if span has <p>s and NavigableString, it won't unwrap
|
||||||
for s in spans:
|
for s in spans:
|
||||||
if not s.string and s.contents:
|
if not s.string and s.contents:
|
||||||
is_string = [isinstance(child, NavigableString) for child in s.contents]
|
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
|
||||||
if any(is_string):
|
if all(is_not_struct_tag):
|
||||||
pass
|
continue
|
||||||
else:
|
s.unwrap()
|
||||||
s.unwrap()
|
|
||||||
|
|
||||||
for node in body_tag:
|
for node in body_tag:
|
||||||
if isinstance(node, NavigableString):
|
if isinstance(node, NavigableString):
|
||||||
@@ -75,15 +125,6 @@ def unwrap_structural_tags(body_tag):
|
|||||||
return body_tag
|
return body_tag
|
||||||
|
|
||||||
|
|
||||||
def str2html_soup(html_text: str, element_id=None):
|
|
||||||
html_soup = BeautifulSoup(html_text, features='lxml')
|
|
||||||
if element_id:
|
|
||||||
x = html_soup.find(id=element_id)
|
|
||||||
return str(x)
|
|
||||||
else:
|
|
||||||
return str(html_text)
|
|
||||||
|
|
||||||
|
|
||||||
def get_tags_between_ids(first_id, href, html_soup):
|
def get_tags_between_ids(first_id, href, html_soup):
|
||||||
h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
|
h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
|
||||||
if h_marked:
|
if h_marked:
|
||||||
@@ -102,3 +143,22 @@ def get_tags_between_ids(first_id, href, html_soup):
|
|||||||
assert 0, f'Warning: no match for {first_id, href}'
|
assert 0, f'Warning: no match for {first_id, href}'
|
||||||
|
|
||||||
return tags
|
return tags
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_title_and_content(title, content: BeautifulSoup):
    """Normalise a chapter title and its content for export.

    Steps: strip whitespace-only text nodes from the top level of *content*,
    remove a leading heading that duplicates *title* (rule #1), then collapse
    newlines/tabs/non-breaking spaces into single spaces in both strings.

    Parameters:
        title: raw title markup; its plain string is extracted via BeautifulSoup.
        content: parsed chapter body — mutated in place.

    Returns:
        Tuple ``(title_str, content_str)`` of cleaned plain strings.
    """
    title_str = BeautifulSoup(title, features='lxml').string
    # NOTE(review): .string is None when the parsed title has no single string
    # child; the re.sub at the bottom would then raise TypeError — confirm
    # callers always pass a plain-text title.

    # 0. Collect top-level text nodes that contain only \n / \t / nbsp …
    to_remove = [
        child for child in content.contents
        if isinstance(child, NavigableString)
        and re.sub(r'([\n\t\xa0])', '', child.string) == ''
    ]
    # … and remove them with a plain loop (the original used a list
    # comprehension purely for its side effects, building a throwaway list).
    for node in to_remove:
        node.extract()

    # 1. rule#1 for heading removal
    clean_heading_in_content(content, title_str)

    content_str = re.sub(r'([\n\t\xa0])', ' ', str(content))
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    return title_str, content_str
|
||||||
|
|||||||
Reference in New Issue
Block a user