epub converter: add css processing

This commit is contained in:
shirshasa
2021-04-22 17:26:17 +03:00
parent 8f284651c4
commit e0e64a0c38
3 changed files with 229 additions and 31 deletions

View File

@@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
preprocess_image, preprocess_footnotes
update_src_links_in_images, preprocess_footnotes
# epub3 examples:
# https://github.com/IDPF/epub3-samples
@@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
# toc lookup in epublib:
# if the spine in content.opf has a toc attribute -> the ncx file can be located -> navMap is read from it
# if it is missing, it tries to find the nav tag in the manifest -> EpubNav.
from src.util.css_reader import clean_css, add_inline_style_to_html_soup
class EpubPostprocessor:
def __init__(self, file, access=None):
self.file = file
self.access = access
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
# read images
self.href2img_bytes = {}
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
file_name = x.file_name
@@ -42,21 +44,26 @@ class EpubPostprocessor:
# read html
self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
# read css
self.html_href2css_href = {}
self.css_href2content = {}
self.build_css_content()
# add css
self.add_css_styles2soup()
# read footnotes
self.footnotes = []
for href in self.href2soup_html:
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
noteref_attr_name='data-type'))
# if the spine in content.opf has a toc attribute -> the ncx file can be located -> navMap is read from it
# if it is missing, it tries to find the nav tag in the manifest -> EpubNav. that is the epub3 case (todo: not tested)
self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
# read toc
self.href2ids = defaultdict(list)
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
# build simple toc from spine if needed
if not self.is_toc_valid():
self.build_adjacency_list_from_spine()
# read anchored blocks, split html into separate block
self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed
self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
self.build_anchor2soup()
# if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
@@ -68,12 +75,37 @@ class EpubPostprocessor:
# todo: check if other chapters exist
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_text = item.get_body_content()
soup = BeautifulSoup(html_text, features='lxml')
html_body_text = item.get_body_content()
soup = BeautifulSoup(html_body_text, features='lxml')
nodes[item.file_name] = soup
return nodes
def build_css_content(self):
    """Collect the cleaned CSS that applies to each HTML document of the book.

    Fills two maps on the instance:

    * ``self.css_href2content`` -- stylesheet key -> CSS text after ``clean_css``;
    * ``self.html_href2css_href`` -- HTML ``file_name`` -> stylesheet key.

    Linked stylesheets (``<link type="text/css">``) are keyed by the
    ``file_name`` of the book item they resolve to, so a file shared by many
    documents is cleaned only once. Embedded ``<style>`` blocks are keyed by
    ``"<html file_name>#style<i>"`` so that blocks from different documents
    never overwrite each other.

    Only one stylesheet key is kept per document; when a document has
    several, the last one processed wins (embedded ``<style>`` blocks are
    processed after links).
    """
    # Local import keeps this method self-contained; EPUB internal paths are
    # always '/'-separated, hence posixpath rather than os.path.
    import posixpath

    book = self.ebooklib_book
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.content, features='lxml')
        html_dir = posixpath.dirname(item.file_name)

        for tag in soup.find_all('link', attrs={"type": "text/css"}):
            css_href = tag.attrs.get('href')
            if not css_href:
                continue
            # Try the href verbatim first, then resolved relative to the
            # HTML file (handles hrefs such as '../css/style.css').
            css_item = book.get_item_with_href(css_href)
            if css_item is None:
                resolved_href = posixpath.normpath(posixpath.join(html_dir, css_href))
                css_item = book.get_item_with_href(resolved_href)
            if css_item is None:
                # The stylesheet is not part of the book: nothing to apply.
                continue
            css_key = css_item.file_name
            if css_key not in self.css_href2content:
                css_content: str = css_item.get_content().decode()
                self.css_href2content[css_key] = clean_css(css_content)
            # Record the mapping only once the content is known to exist, so
            # later lookups in css_href2content cannot fail.
            self.html_href2css_href[item.file_name] = css_key

        for i, tag in enumerate(soup.find_all('style')):
            css_content = tag.string
            if not css_content:
                # Empty <style> (or one without a single text child).
                continue
            style_key = f'{item.file_name}#style{i}'
            self.css_href2content[style_key] = clean_css(css_content)
            self.html_href2css_href[item.file_name] = style_key
def add_css_styles2soup(self):
    """Inline the collected CSS into every parsed HTML document.

    For each entry of ``self.href2soup_html`` that has a stylesheet key in
    ``self.html_href2css_href``, the matching text from
    ``self.css_href2content`` is applied with
    ``add_inline_style_to_html_soup`` and the result is stored back under
    the same href. Documents without a stylesheet key are left untouched.
    """
    for html_href, soup in self.href2soup_html.items():
        css_key = self.html_href2css_href.get(html_href)
        if not css_key:
            continue
        stylesheet: str = self.css_href2content[css_key]
        # Only the value of an existing key is replaced, so the dict can be
        # updated while it is being iterated.
        self.href2soup_html[html_href] = add_inline_style_to_html_soup(soup, stylesheet)
def build_manifest_id2href(self):
links = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
@@ -193,7 +225,7 @@ class EpubPostprocessor:
else:
content: BeautifulSoup = self.href2soup_html[node.href]
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
sub_nodes = []

View File

@@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
return link
def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
img_tags = body_tag.find_all('img')
for img in img_tags:
@@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag):
'figure', 'footer', 'iframe', 'span', 'p'
]
divs = body_tag.find_all("div")
for div in divs:
for div in body_tag.find_all("div"):
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
@@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag):
continue
div.unwrap()
secs = body_tag.find_all("section")
for s in secs:
for s in body_tag.find_all("section"):
s.unwrap()
articles = body_tag.find_all("article")
for s in articles:
for s in body_tag.find_all("article"):
s.unwrap()
articles = body_tag.find_all("main")
for s in articles:
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
s.unwrap()
articles = body_tag.find_all("body")
for s in articles:
for s in body_tag.find_all("body"):
s.unwrap()
articles = body_tag.find_all("html")
for s in articles:
for s in body_tag.find_all("html"):
s.unwrap()
spans = body_tag.find_all("span")
# not all cases, if span has <p>s and NavigableString, it won't unwrap
for s in spans:
if not s.string and s.contents:
for s in body_tag.find_all("span"):
if s.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if all(is_not_struct_tag):
continue
s.unwrap()
_preprocessing_headings(body_tag)
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
@@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
_process_lists(content_tag)
_preprocessing_headings(content_tag)
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = clean_title_from_numbering(title_str)
return title_str, content_str
return title_str, str(content_tag)

168
src/util/css_reader.py Normal file
View File

@@ -0,0 +1,168 @@
import re
from itertools import takewhile
import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform
from src.config import LawCartaConfig
def convert_font_property(property):
    """Converter for the CSS ``font`` shorthand.

    The shorthand is not interpreted: whatever value is passed in, the
    result is an empty string.
    """
    replacement = ''
    return replacement
# Lookup tables used by convert_font_size: sizes_pr holds ascending relative
# sizes (em, or percent / 100) and sizes_px the pixel size at the same index.
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
            1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
            2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]

sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
            '22px',
            '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', '35px',
            '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px',
            '49px', '50px', '64px', '72px']


def convert_font_size(value):
    """Convert a CSS ``font-size`` value to a pixel size string.

    * ``pt`` values: an empty string when the number equals
      ``LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE``; otherwise the same text
      with the unit label ``pt`` replaced by ``px`` (the number itself is not
      rescaled).
    * ``100%``: an empty string.
    * other ``%`` / ``em`` values: the entry of ``sizes_px`` that sits at the
      index of the largest ``sizes_pr`` step strictly below the ratio.
    * anything else (other units, keywords, unparsable numbers, ratios above
      5, ratios not above the smallest table step): an empty string.

    :param value: raw CSS value, e.g. ``'12pt'``, ``'150%'``, ``'1.2em'``.
    :return: pixel size such as ``'25px'``, or ``''`` when the property
        should be dropped.
    """
    if 'pt' in value:
        # float() instead of int(): fractional sizes such as '10.5pt' are
        # valid CSS, and junk must not escape as ValueError.
        try:
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            ratio = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            ratio = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if ratio > 5:
        return ''

    # sizes_pr is ascending, so the number of steps strictly below the ratio
    # is also the position right after the step we want.
    steps_below = sum(1 for _ in takewhile(lambda step: ratio > step, sizes_pr))
    if not steps_below:
        # Ratio is not above even the smallest step (e.g. '-2em').
        return ''
    return sizes_px[steps_below - 1]
# CSS properties that clean_css keeps. Any property whose name is not a key
# here is removed. A non-empty list is a whitelist: the property is kept only
# if its value (double quotes stripped) is in the list. An empty list means
# the value is not checked.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],  # TODO(review): original author was unsure this entry is needed
    'font': [],  # TODO(review): original author was unsure this entry is needed
    'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
}

# Per-property converters: clean_css replaces the value of a kept property
# with the result of the function registered here (properties without an
# entry keep their value unchanged).
LIVECARTA_STYLE_ATTRS_MAPPING = {
    # The original indent value is ignored; the configured indent is used.
    'text-indent': lambda x: LawCartaConfig.INDENT,
    'font-variant': lambda x: x,
    'text-align': lambda x: x,
    'font': convert_font_property,
    # .get() yields None when the capitalized name is not in the table.
    # NOTE(review): str.capitalize() lower-cases everything after the first
    # character, so multi-word font names only match keys written that way --
    # confirm against font_correspondence_table.
    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()),
    'font-size': convert_font_size,
}

# Property values annotated with the HTML tag named in the trailing comment.
# Not referenced by the code visible here; presumably consumed by a later
# step that turns these styles into tags -- verify against the caller.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
}
'''
FONT -> <span>
font-size:14pt; pt->px
LATER:
vertical-align: sub; <span style="font-size:10px">o</span>
text-transform: uppercase;
text-decoration-color: red;
em, in, pt -> px
'''
def clean_css(css):
    """Reduce a stylesheet to the declarations listed in LIVECARTA_STYLE_ATTRS.

    For every style rule, each declaration is handled as follows:

    * property name not in ``LIVECARTA_STYLE_ATTRS`` -> dropped;
    * the property has a non-empty whitelist and the value (double quotes
      stripped) is not in it -> dropped;
    * otherwise, if a converter exists in ``LIVECARTA_STYLE_ATTRS_MAPPING``
      the value is replaced by the converter's result (an empty or ``None``
      result drops the declaration); without a converter the value is kept.

    Declarations are dropped by assigning an empty value, which is how this
    module relies on cssutils to remove a property.

    :param css: stylesheet text; ``None`` or an empty string is accepted.
    :return: the serialized, cleaned stylesheet (``''`` for empty input).
    """
    if not css:
        # e.g. an empty <style> tag whose ``.string`` is None.
        return ''

    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        # Work on a snapshot so that dropping declarations cannot interfere
        # with the iteration over the same declaration block.
        for property_ in list(rule.style):
            name = property_.name
            if name not in LIVECARTA_STYLE_ATTRS:
                rule.style[name] = ''
                continue

            value = property_.value.replace('"', '')
            allowed_values = LIVECARTA_STYLE_ATTRS[name]
            if allowed_values and value not in allowed_values:
                rule.style[name] = ''
            elif name in LIVECARTA_STYLE_ATTRS_MAPPING:
                converted = LIVECARTA_STYLE_ATTRS_MAPPING[name](value)
                # Converters may return None (unknown font family): treat it
                # like an empty value.
                rule.style[name] = converted or ''

    # Public accessor for the serialized sheet (bytes).
    return sheet.cssText.decode()
def style_property2livecarta_convention(style_str):
    """Hook for adapting an inline ``style`` string.

    Currently a pass-through: the argument is returned unchanged.
    """
    converted_style = style_str
    return converted_style
def add_inline_style_to_html_soup(soup1, css_text):
    """Copy the styles from ``css_text`` into ``style`` attributes of ``soup1``.

    Every ``p``, ``span``, ``li`` and ``ul`` tag is temporarily tagged with a
    ``livecarta_id`` attribute, the document is run through premailer's
    ``transform`` to inline the CSS, and the resulting ``style`` attribute of
    each tagged element (with a trailing ``;`` appended) is copied back onto
    the original tag. Only the ``style`` attribute of ``soup1`` is changed;
    the temporary markers are always removed, even if inlining fails.

    :param soup1: parsed HTML; modified in place.
    :param css_text: stylesheet text to inline.
    :return: ``soup1``.
    """
    marker = 'livecarta_id'

    # Keep direct references to the tagged elements so they do not have to be
    # searched for again, one full-tree scan per element.
    marked_tags = soup1.find_all(['p', 'span', 'li', 'ul'])
    for i, tag in enumerate(marked_tags):
        tag.attrs[marker] = i

    try:
        html_with_inline_style = transform(str(soup1), css_text=css_text, remove_classes=False,
                                           external_styles=False, disable_validation=True)
        soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

        # One pass over the inlined document; the first element per id wins.
        # After re-parsing, the marker values are strings.
        styled_by_id = {}
        for styled_tag in soup2.find_all(attrs={marker: True}):
            styled_by_id.setdefault(styled_tag.attrs[marker], styled_tag)

        for i, tag in enumerate(marked_tags):
            styled_tag = styled_by_id.get(str(i))
            if styled_tag is None:
                # Element did not survive the transform; leave it unstyled.
                continue
            style = styled_tag.attrs.get('style')
            if style:
                tag.attrs['style'] = style_property2livecarta_convention(style + ';')
    finally:
        for tag in marked_tags:
            tag.attrs.pop(marker, None)

    return soup1
if __name__ == '__main__':
    # Manual smoke test: clean one stylesheet of an EPUB and print one of its
    # documents with the styles inlined.
    #
    # Usage: css_reader.py [EPUB_PATH [CSS_HREF [HTML_HREF]]]
    # Omitted arguments fall back to the sample values used during development.
    import sys

    default_args = [
        '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub',
        'css/epub.css',
        'pr01s05.xhtml',
    ]
    cli_args = sys.argv[1:4]
    file, css_href_, html_href_ = cli_args + default_args[len(cli_args):]

    ebooklib_book = epub.read_epub(file)

    css_item_ = ebooklib_book.get_item_with_href(css_href_)
    html_item_ = ebooklib_book.get_item_with_href(html_href_)
    if css_item_ is None or html_item_ is None:
        sys.exit(f'{css_href_!r} or {html_href_!r} not found in {file!r}')

    css_ = css_item_.get_content().decode()
    css_cleaned = clean_css(css_)

    html_ = html_item_.get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

    print(add_inline_style_to_html_soup(html_soup, css_cleaned))