forked from LiveCarta/BookConverter
epub converter: add css processing
This commit is contained in:
168
src/util/css_reader.py
Normal file
168
src/util/css_reader.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import re
|
||||
|
||||
from itertools import takewhile
|
||||
|
||||
import cssutils
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import epub
|
||||
from premailer import transform
|
||||
|
||||
from src.config import LawCartaConfig
|
||||
|
||||
|
||||
def convert_font_property(property):
    """Convert the value of the CSS ``font`` shorthand property.

    Not implemented yet: every input maps to an empty string.

    :param property: raw value of the ``font`` declaration.
    :return: always ``''``.
    """
    return ''
|
||||
|
||||
|
||||
# Lookup tables used by convert_font_size(): sizes_pr[i] is a relative size
# threshold (an em value, or a percentage divided by 100) and sizes_px[i] is
# the pixel size emitted for it. Both lists must keep the same length and
# sizes_pr must stay sorted in ascending order.
# The leading -1 acts as a lower bound so small positive ratios map to '10px'.
sizes_pr = [
    -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19,
    1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94,
    2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
    2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0,
]

sizes_px = [
    '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px',
    '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px',
    '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', '35px',
    '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px',
    '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px',
]
|
||||
|
||||
|
||||
def convert_font_size(value):
    """Convert a CSS ``font-size`` value to a pixel size string.

    Rules:

    * ``pt`` values: ``''`` when the number equals
      ``LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE``, otherwise the same
      text with ``pt`` replaced by ``px``.
    * ``'100%'``: ``''``.
    * ``%`` / ``em`` values: the ratio (percent / 100, or the em number) is
      looked up in ``sizes_pr`` and the matching entry of ``sizes_px`` is
      returned; ratios above 5 give ``''``.
    * Anything else, or a value whose numeric part cannot be parsed: ``''``.

    :param value: raw ``font-size`` value, e.g. ``'14pt'``, ``'120%'``, ``'1.5em'``.
    :return: a ``'<n>px'`` string, or ``''`` when no size should be set.
    """
    if 'pt' in value:
        # Parse inside a try block and as float: fractional ('10.5pt') or
        # malformed pt values must not raise out of this function.
        try:
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            ratio = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            ratio = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if ratio > 5:
        return ''

    # sizes_pr is ascending, so the thresholds strictly below the ratio form
    # a prefix of the table; the last one selects the pixel size.
    smaller_thresholds = list(takewhile(lambda threshold: ratio > threshold, sizes_pr))
    if not smaller_thresholds:
        # Ratio is not above even the lowest threshold (e.g. a negative
        # size): there is nothing sensible to map it to.
        return ''
    return sizes_px[len(smaller_thresholds) - 1]
|
||||
|
||||
|
||||
# Whitelist of CSS properties kept by clean_css(): property name -> allowed
# values. An empty list means any value is accepted; a non-empty list means
# the declaration is dropped unless its value is one of the listed ones.
# Properties that are not keys of this dict are removed altogether.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],  # ???
    'font': [],  # ???
    'font-family': [x for x in LawCartaConfig.font_correspondence_table
                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
}
|
||||
# Per-property value converters applied by clean_css(): property name ->
# callable that takes the original value (double quotes stripped) and
# returns the value to store instead.
LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': lambda x: LawCartaConfig.INDENT,
    'font-variant': lambda x: x,
    'text-align': lambda x: x,
    'font': convert_font_property,
    # Try the name as written first: str.capitalize() lower-cases everything
    # after the first character, so a multi-word key would otherwise never
    # be found. The capitalized lookup is kept as a fallback.
    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(
        x, LawCartaConfig.font_correspondence_table.get(x.capitalize())),
    'font-size': convert_font_size,
}
|
||||
|
||||
# CSS property -> values that are meant to be expressed as an HTML tag
# (named in the trailing comment) rather than as an inline style.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
}
|
||||
# Conversion notes:
#
#   FONT -> <span>
#   font-size: 14pt;  pt -> px
#
#   LATER:
#     vertical-align: sub;  <span style="font-size:10px">o</span>
#     text-transform: uppercase;
#     text-decoration-color: red;
#
#     em, in, pt -> px
|
||||
|
||||
|
||||
def clean_css(css):
    """Reduce a stylesheet to the declarations listed in LIVECARTA_STYLE_ATTRS.

    For every style rule, each declaration is handled as follows:

    * property not in ``LIVECARTA_STYLE_ATTRS``: value is set to ``''``
      (cssutils is expected to treat an empty value as removal);
    * property has a non-empty list of allowed values and the value (with
      double quotes stripped) is not among them: value is set to ``''``;
    * otherwise, if the property has a converter in
      ``LIVECARTA_STYLE_ATTRS_MAPPING``, the value is replaced by the
      converter's result; without a converter it is left untouched.

    Rules of other types (e.g. at-rules) are not modified.

    :param css: stylesheet source text.
    :return: the serialized, cleaned stylesheet as ``str``.
    """
    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue

        # Work on a snapshot: declarations are removed or rewritten below.
        for property_ in list(rule.style):
            name = property_.name
            if name not in LIVECARTA_STYLE_ATTRS:
                rule.style[name] = ''
                continue

            value = property_.value.replace('"', '')
            allowed_values = LIVECARTA_STYLE_ATTRS[name]
            # An empty whitelist means "do not filter by value".
            if allowed_values and value not in allowed_values:
                rule.style[name] = ''
                continue

            if name in LIVECARTA_STYLE_ATTRS_MAPPING:
                convert = LIVECARTA_STYLE_ATTRS_MAPPING[name]
                rule.style[name] = convert(value)
                # NOTE(review): debug output kept from the original code.
                print(name, rule.style[name], )

    # cssText is the public accessor; cssutils returns it as bytes.
    return sheet.cssText.decode()
|
||||
|
||||
|
||||
def style_property2livecarta_convention(style_str):
    """Adapt an inline ``style`` attribute string to the LiveCarta convention.

    Currently a pass-through: the string is returned unchanged.

    :param style_str: content of a ``style`` attribute.
    :return: ``style_str`` itself.
    """
    return style_str
|
||||
|
||||
|
||||
def add_inline_style_to_html_soup(soup1, css_text):
    """Inline the rules of ``css_text`` into the tags of ``soup1``.

    Every ``<p>``, ``<span>``, ``<li>`` and ``<ul>`` tag is temporarily marked
    with a ``livecarta_id`` attribute. The document is then run through
    premailer's ``transform`` to inline the CSS, and the resulting ``style``
    attributes are copied back to the matching original tags.

    ``soup1`` is modified in place; the temporary attribute is removed again.

    :param soup1: BeautifulSoup document to decorate.
    :param css_text: stylesheet text to inline.
    :return: ``soup1`` with ``style`` attributes set where styles applied.
    """
    marker = 'livecarta_id'

    # Keep direct references to the marked tags so they do not have to be
    # searched for again after inlining.
    marked_tags = soup1.find_all(['p', 'span', 'li', 'ul'])
    for i, tag in enumerate(marked_tags):
        tag.attrs[marker] = str(i)

    html_with_inline_style = transform(str(soup1), css_text=css_text, remove_classes=False,
                                       external_styles=False, disable_validation=True)
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    # One pass over the inlined document instead of one tree search per id.
    styled_by_id = {}
    for styled_tag in soup2.find_all(attrs={marker: True}):
        styled_by_id.setdefault(styled_tag.attrs[marker], styled_tag)

    for i, tag in enumerate(marked_tags):
        tag_with_style = styled_by_id.get(str(i))
        # The element may be missing from the transformed document; in that
        # case there is simply no style to copy.
        style = tag_with_style.attrs.get('style') if tag_with_style is not None else None
        if style:
            tag.attrs['style'] = style_property2livecarta_convention(style + ';')
        del tag.attrs[marker]

    return soup1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import sys

    # Manual smoke test: clean the stylesheet of an EPUB and print one of its
    # documents with the styles inlined. The EPUB path may be given as the
    # first command-line argument; the original developer path is the fallback.
    default_file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
    file = sys.argv[1] if len(sys.argv) > 1 else default_file

    ebooklib_book = epub.read_epub(file)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    css_cleaned = clean_css(css_)
    html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

    print(add_inline_style_to_html_soup(html_soup, css_cleaned))
|
||||
Reference in New Issue
Block a user