epub converter: add headers and image processing

This commit is contained in:
shirshasa
2021-04-15 14:45:41 +03:00
parent 880b045de0
commit 5e58cb3d92
2 changed files with 104 additions and 30 deletions

View File

@@ -1,6 +1,7 @@
import codecs import codecs
import json import json
import re import re
import os
from collections import defaultdict from collections import defaultdict
from typing import Dict, Union from typing import Dict, Union
@@ -8,17 +9,31 @@ import ebooklib
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ebooklib import epub from ebooklib import epub
from ebooklib.epub import Link, Section from ebooklib.epub import Link, Section
from ebooklib.utils import debug
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
preprocess_image
# todo: https://docs.python.org/3/howto/unicode.html
class EpubBookAdapter: class EpubBookAdapter:
def __init__(self, file): def __init__(self, file):
self.file = file self.file = file
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
self.href2img_bytes = {}
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
debug(x)
file_name = x.file_name
content = x.content
# todo: check how the file path is resolved in the library
self.href2img_bytes[file_name] = content
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.href2soup_html = self.build_href2soup_content() self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap
# если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo) # если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
self.href2ids = defaultdict(list) self.href2ids = defaultdict(list)
@@ -164,11 +179,13 @@ class EpubBookAdapter:
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem: def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
title = node.title title = node.title
if node.id: if node.id:
content = self.id_anchor2soup[(node.href, node.id)] content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
else: else:
content = self.href2soup_html[node.href] content: BeautifulSoup = self.href2soup_html[node.href]
content_preprocessed = str(content) # todo self.preprocess_html(content, node.id)
content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed) preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
sub_nodes = [] sub_nodes = []
# warning! items that are not EpubHtmlItem won't be added to chapter # warning! items that are not EpubHtmlItem won't be added to chapter
if self.adjacency_list.get(node): if self.adjacency_list.get(node):
@@ -196,9 +213,6 @@ if __name__ == "__main__":
"content": l "content": l
} }
output_file = open('output.out', 'w')
output_file.write(str(tmp))
with codecs.open('tmp.json', 'w', encoding='utf-8') as f: with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False) json.dump(tmp, f, ensure_ascii=False)

View File

@@ -1,10 +1,47 @@
import os
import pathlib
import re import re
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
from src.access import Access
def preprocess_image():
def save_image_locally(img_file_path, img_content, book_id):
    """Persist image bytes under <project_root>/json/img_<book_id>/.

    Args:
        img_file_path: path of the image inside the epub; only its basename
            is used for the saved file.
        img_content: raw image bytes.
        book_id: identifier used to name the per-book image folder.

    Returns:
        pathlib.Path of the written image file.
    """
    # project root: two levels up from this module's location
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
    # parents=True: a missing intermediate 'json' folder must not raise FileNotFoundError
    new_path.mkdir(parents=True, exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager guarantees the handle is closed even if the write fails
    with open(new_img_path, 'wb') as f:
        f.write(img_content)
    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload the image bytes through the Access client and return the resulting link."""
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
def preprocess_image(body_tag, href2img_content, path_to_html, access=None):
    """Rewrite every <img src> in body_tag to point at a persisted copy of the image.

    Args:
        body_tag: BeautifulSoup tag; its <img> descendants are mutated in place.
        href2img_content: mapping of manifest-relative image path -> raw bytes.
        path_to_html: path of the html document inside the epub; <img> srcs
            are resolved relative to its folder.
        access: optional Access client; when given, images are uploaded to AWS,
            otherwise they are written to the local json/ folder.
    """
    # invariant for the whole document — hoisted out of the loop
    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        if not path_to_img_from_html:
            # an <img> without a usable src cannot be resolved; leave it untouched
            continue
        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))
        assert path_to_img_from_root in href2img_content, f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
        img_content = href2img_content[path_to_img_from_root]
        # todo: 'book_id' is hard-coded — thread the real book id through
        if access is not None:
            new_location = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
        else:
            new_location = save_image_locally(path_to_img_from_root, img_content, 'book_id')
        img.attrs['src'] = str(new_location)
def preprocess_table(): def preprocess_table():
@@ -15,8 +52,12 @@ def preprocess_quote():
pass pass
def clean_heading_in_content(content, title: str):
    """Remove from `content` the first child whose text equals `title`.

    Drops a chapter heading that duplicates the chapter title, since the
    title is rendered separately. Children whose text is empty or consists
    only of \\n, \\t and \\xa0 are skipped.
    """
    for child in content.contents:
        # skip empty / whitespace-only nodes
        if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
            if title == child.text:
                child.extract()
                # only the first match is the duplicated heading; stop here
                break
def preprocess_footnotes(): def preprocess_footnotes():
@@ -28,8 +69,18 @@ def add_fonts():
def unwrap_structural_tags(body_tag): def unwrap_structural_tags(body_tag):
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span'
]
divs = body_tag.find_all("div") divs = body_tag.find_all("div")
for div in divs: for div in divs:
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
div.unwrap() div.unwrap()
secs = body_tag.find_all("section") secs = body_tag.find_all("section")
@@ -48,19 +99,18 @@ def unwrap_structural_tags(body_tag):
for s in articles: for s in articles:
s.unwrap() s.unwrap()
# articles = body_tag.find_all("html") articles = body_tag.find_all("html")
# for s in articles: for s in articles:
# s.unwrap() s.unwrap()
spans = body_tag.find_all("span") spans = body_tag.find_all("span")
# not all cases, if span has <p>s and NavigableString, it won't unwrap # not all cases, if span has <p>s and NavigableString, it won't unwrap
for s in spans: for s in spans:
if not s.string and s.contents: if not s.string and s.contents:
is_string = [isinstance(child, NavigableString) for child in s.contents] is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if any(is_string): if all(is_not_struct_tag):
pass continue
else: s.unwrap()
s.unwrap()
for node in body_tag: for node in body_tag:
if isinstance(node, NavigableString): if isinstance(node, NavigableString):
@@ -75,15 +125,6 @@ def unwrap_structural_tags(body_tag):
return body_tag return body_tag
def str2html_soup(html_text: str, element_id=None):
html_soup = BeautifulSoup(html_text, features='lxml')
if element_id:
x = html_soup.find(id=element_id)
return str(x)
else:
return str(html_text)
def get_tags_between_ids(first_id, href, html_soup): def get_tags_between_ids(first_id, href, html_soup):
h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'}) h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
if h_marked: if h_marked:
@@ -102,3 +143,22 @@ def get_tags_between_ids(first_id, href, html_soup):
assert 0, f'Warning: no match for {first_id, href}' assert 0, f'Warning: no match for {first_id, href}'
return tags return tags
def prepare_title_and_content(title, content: BeautifulSoup):
    """Normalize a chapter title and its content for serialization.

    Strips the title markup to plain text, removes whitespace-only text
    nodes from the content, drops a heading duplicated from the title and
    collapses \\n/\\t/\\xa0 into single spaces.

    Args:
        title: title as an html string.
        content: BeautifulSoup tree of the chapter body (mutated in place).

    Returns:
        (title_str, content_str) tuple of plain strings.
    """
    ws_pattern = re.compile(r'([\n\t\xa0])')
    title_soup = BeautifulSoup(title, features='lxml')
    # .string is None when the markup holds zero or several strings;
    # fall back to get_text() so the re.sub calls below never receive None
    title_str = title_soup.string
    if title_str is None:
        title_str = title_soup.get_text()
    # 0. drop text nodes that contain only whitespace characters
    to_remove = [child for child in content.contents
                 if isinstance(child, NavigableString)
                 and ws_pattern.sub('', child.string) == '']
    for node in to_remove:
        node.extract()
    # 1. rule#1 for heading removal
    clean_heading_in_content(content, title_str)
    content_str = ws_pattern.sub(' ', str(content))
    title_str = ws_pattern.sub(' ', title_str)
    return title_str, content_str