epub converter: add headers and image processing

This commit is contained in:
shirshasa
2021-04-15 14:45:41 +03:00
parent 880b045de0
commit 5e58cb3d92
2 changed files with 104 additions and 30 deletions

View File

@@ -1,10 +1,47 @@
import os
import pathlib
import re
from bs4 import BeautifulSoup, NavigableString
from src.access import Access
def preprocess_image():
    """Do nothing; zero-argument placeholder.

    A later ``preprocess_image`` definition in this file takes arguments and
    rebinds this name, so this stub carries no behaviour of its own.
    """
    return None
def save_image_locally(img_file_path, img_content, book_id):
    """Write image bytes into a per-book ``json/img_<book_id>/`` folder.

    The folder lives under the parent of the directory that contains this
    source file. Only the base name of ``img_file_path`` is kept, so any
    directory part of the original path is discarded.

    :param img_file_path: path of the image; only its base name is used.
    :param img_content: raw bytes to write.
    :param book_id: identifier interpolated into the folder name.
    :return: ``pathlib.Path`` of the file that was written.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
    # parents=True: the intermediate ``json`` folder may not exist yet.
    new_path.mkdir(parents=True, exist_ok=True)

    new_img_path = new_path / os.path.basename(img_file_path)
    # Context manager guarantees the handle is closed even if write() raises.
    with open(new_img_path, 'wb') as img_file:
        img_file.write(img_content)
    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Hand the image bytes to ``access.send_image_by_bytes`` and return its result.

    :param access: object exposing ``send_image_by_bytes``.
    :param img_file_path: path of the image, forwarded unchanged.
    :param img_content: raw image bytes, forwarded unchanged.
    :param book_id: book identifier, forwarded unchanged.
    :return: whatever ``send_image_by_bytes`` returns (used by callers as the new image link).
    """
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
def preprocess_image(body_tag, href2img_content, path_to_html, access=None, book_id='book_id'):
    """Store every ``<img>`` of ``body_tag`` and point its ``src`` at the stored copy.

    Each image ``src`` is resolved relative to the folder of ``path_to_html``,
    looked up in ``href2img_content``, saved (via ``save_image_to_aws`` when
    ``access`` is given, otherwise via ``save_image_locally``) and the tag's
    ``src`` attribute is overwritten with the string form of the result.
    ``body_tag`` is modified in place; nothing is returned.

    :param body_tag: parsed tag that is searched with ``find_all('img')``.
    :param href2img_content: mapping of normalised image path -> image bytes.
    :param path_to_html: path of the HTML file the tags come from.
    :param access: optional uploader; ``None`` means "save on local disk".
    :param book_id: identifier passed to the save helpers. Defaults to the
        literal ``'book_id'`` that was previously hard-coded.
    :raises AssertionError: if a referenced image is missing from
        ``href2img_content``.
    """
    # Same for every image of this document, so compute it once.
    html_folder = os.path.dirname(path_to_html)

    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        if path_to_img_from_html is None:
            # An <img> without src has nothing to resolve or store.
            continue

        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))
        if path_to_img_from_root not in href2img_content:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped under ``python -O``; the exception type is kept.
            raise AssertionError(
                f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
            )

        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            new_location = save_image_to_aws(access, path_to_img_from_root, img_content, book_id)
        else:
            new_location = save_image_locally(path_to_img_from_root, img_content, book_id)
        img.attrs['src'] = str(new_location)
def preprocess_table():
@@ -15,8 +52,12 @@ def preprocess_quote():
pass
def clean_heading_in_content():
    """Do nothing; zero-argument placeholder.

    The ``clean_heading_in_content(content, title)`` definition that follows
    in this file rebinds this name.
    """
    return None
def clean_heading_in_content(content, title: str):
    """Drop a leading heading from ``content`` when it duplicates ``title``.

    The direct children of ``content`` are scanned in order. Children whose
    text is empty or consists only of ``\\n``, ``\\t`` and ``\\xa0`` are
    skipped. The first child with real text is compared with ``title``: on an
    exact match it is extracted from the tree. The scan stops at that child
    either way, so only a heading at the very start is ever removed.

    :param content: parsed node exposing ``.contents``; modified in place.
    :param title: heading text to look for (exact comparison).
    """
    for child in content.contents:
        # Read the text once; nodes that expose no ``.text`` are treated as blank.
        text = getattr(child, 'text', None)
        if not text or not re.sub(r'([\n\t\xa0])', '', text):
            continue
        if title == text:
            child.extract()
        # Only the first non-blank child is a heading candidate.
        break
def preprocess_footnotes():
@@ -28,8 +69,18 @@ def add_fonts():
def unwrap_structural_tags(body_tag):
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span'
]
divs = body_tag.find_all("div")
for div in divs:
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
div.unwrap()
secs = body_tag.find_all("section")
@@ -48,19 +99,18 @@ def unwrap_structural_tags(body_tag):
for s in articles:
s.unwrap()
# articles = body_tag.find_all("html")
# for s in articles:
# s.unwrap()
articles = body_tag.find_all("html")
for s in articles:
s.unwrap()
spans = body_tag.find_all("span")
# not all cases, if span has <p>s and NavigableString, it won't unwrap
for s in spans:
if not s.string and s.contents:
is_string = [isinstance(child, NavigableString) for child in s.contents]
if any(is_string):
pass
else:
s.unwrap()
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if all(is_not_struct_tag):
continue
s.unwrap()
for node in body_tag:
if isinstance(node, NavigableString):
@@ -75,15 +125,6 @@ def unwrap_structural_tags(body_tag):
return body_tag
def str2html_soup(html_text: str, element_id=None):
    """Return ``html_text`` as a string, optionally narrowed to one element.

    :param html_text: HTML markup.
    :param element_id: value of the ``id`` attribute to look up. When falsy,
        the input is returned unchanged (as ``str``) and nothing is parsed.
    :return: string form of the element found by ``find(id=element_id)``, or
        ``str(html_text)`` when no ``element_id`` is given.

    NOTE(review): when no element carries ``element_id`` the lookup result is
    passed straight to ``str()``, exactly as before; callers should confirm
    that this is what they expect for a missing id.
    """
    if not element_id:
        # No lookup requested: skip building the parse tree altogether.
        return str(html_text)

    html_soup = BeautifulSoup(html_text, features='lxml')
    return str(html_soup.find(id=element_id))
def get_tags_between_ids(first_id, href, html_soup):
h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
if h_marked:
@@ -102,3 +143,22 @@ def get_tags_between_ids(first_id, href, html_soup):
assert 0, f'Warning: no match for {first_id, href}'
return tags
def prepare_title_and_content(title, content: BeautifulSoup):
    """Normalise a chapter title and its content into plain strings.

    Steps:
      0. remove top-level text nodes of ``content`` that contain nothing but
         ``\\n``, ``\\t`` or ``\\xa0``;
      1. remove a leading heading that duplicates the title
         (``clean_heading_in_content``);
      2. replace every ``\\n``, ``\\t`` and ``\\xa0`` in both results with a space.

    :param title: title markup or text; parsed with the ``lxml`` parser.
    :param content: parsed content; modified in place by steps 0 and 1.
    :return: ``(title_str, content_str)`` tuple of strings.
    """
    title_soup = BeautifulSoup(title, features='lxml')
    title_str = title_soup.string
    if title_str is None:
        # ``.string`` is None when the title has nested/multiple children;
        # fall back to the concatenated text so the re.sub below cannot fail.
        title_str = title_soup.get_text()

    # 0. cleaning \n
    # Collect first, extract afterwards: extracting while iterating over
    # ``content.contents`` would skip siblings.
    blank_nodes = [
        child for child in content.contents
        if isinstance(child, NavigableString)
        and re.sub(r'([\n\t\xa0])', '', str(child)) == ''
    ]
    for node in blank_nodes:
        node.extract()

    # 1. rule#1 for heading removal
    clean_heading_in_content(content, title_str)

    content_str = re.sub(r'([\n\t\xa0])', ' ', str(content))
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    return title_str, content_str