forked from LiveCarta/BookConverter
282 lines
10 KiB
Python
282 lines
10 KiB
Python
import re
|
|
from typing import List
|
|
|
|
import cssutils
|
|
|
|
from bs4 import BeautifulSoup
|
|
from ebooklib import epub
|
|
from premailer import transform
|
|
from itertools import takewhile
|
|
from logging import CRITICAL
|
|
|
|
from livecarta_config import LawCartaConfig
|
|
from util.color_reader import str2hex
|
|
|
|
# Raise the cssutils logger threshold to CRITICAL so that lower-severity parser messages are not logged.
cssutils.log.setLevel(CRITICAL)
|
|
|
|
# Relative font sizes (an ``em`` value, or a percentage divided by 100) in
# ascending order.  The leading -1 acts as a sentinel so that every value
# below 0.5 still maps to the smallest pixel size.
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
            1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
            2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]

# Pixel sizes; sizes_px[i] is the result for the relative size sizes_pr[i].
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
            '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
            '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
            '48px', '49px', '50px', '64px', '72px']


def convert_font_size(value):
    """Convert a css ``font-size`` value into the value written to the output css.

    :param value: font-size as a string, e.g. ``'12pt'``, ``'150%'`` or ``'1.5em'``.
    :return: ``''`` when the size should be dropped (it equals the configured
        default, is ``'100%'``, uses an unsupported unit, cannot be parsed, or is
        outside the table range); otherwise a ``'<n>px'`` string.

    ``pt`` values keep their number and only have the unit renamed to ``px``.
    ``%`` and ``em`` values are mapped through ``sizes_pr`` / ``sizes_px``: the
    result is the pixel size of the largest table entry that is less than or
    equal to the given relative size.
    """
    if 'pt' in value:
        # Parse as float so that fractional sizes such as '10.5pt' do not raise.
        try:
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            relative = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            relative = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if relative > 5:
        return ''

    # Inclusive comparison: an exact table value maps to its own pixel size,
    # which also makes the last entry (5.0 -> '72px') reachable.
    matched = list(takewhile(lambda step: step <= relative, sizes_pr))
    if not matched:
        # Below the -1 sentinel (or NaN): there is no sensible pixel size.
        return ''
    return sizes_px[len(matched) - 1]
|
|
|
|
|
|
"""
|
|
LIVECARTA_STYLE_ATTRS = { css property: value }
|
|
|
|
Style properties that can be used to fit livecarta css style convention.
|
|
If property has empty list, it means that any value can be converted.
|
|
If property has not empty list, it means that only certain property-value combinations can be transformed.
|
|
"""
|
|
LIVECARTA_STYLE_ATTRS = {
|
|
'text-indent': [],
|
|
'font-variant': ['small-caps'],
|
|
'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
|
|
'align': [], # ???
|
|
'font': [], # ???
|
|
'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
|
|
if x != LawCartaConfig.DEFAULT_FONT_NAME],
|
|
'font-size': [],
|
|
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
|
|
'font-style': ['italic'], # <i>
|
|
'text-decoration': ['underline', 'line-through'], # <u> , <s>
|
|
'text-decoration-line': ['underline', 'line-through'], # <u> , <s>
|
|
'vertical-align': ['super'], # <sup>
|
|
'color': [],
|
|
'background-color': [],
|
|
'background': [],
|
|
'width': [],
|
|
'border-top-width': [],
|
|
'border-right-width': [],
|
|
'border-left-width': [],
|
|
'border-bottom-width': [],
|
|
'border': []
|
|
}
|
|
|
|
"""
|
|
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
|
|
|
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
|
|
to suit livecarta style convention.
|
|
"""
|
|
|
|
|
|
def get_bg_color(x):
    """Return the background colour for css value *x*, or '' when it is white.

    The value is first passed through ``str2hex``; a result of '#ffffff',
    '#fff' or 'white' is replaced by an empty string.
    """
    hex_color = str2hex(x)
    if hex_color in ('#ffffff', '#fff', 'white'):
        return ''
    return hex_color
|
|
|
|
|
|
def get_text_color(x):
    """Return the text colour for css value *x*, or '' when it is black.

    The value is first passed through ``str2hex``; a result of '#000000',
    '#000' or 'black' is replaced by an empty string.
    """
    hex_color = str2hex(x)
    if hex_color in ('#000000', '#000', 'black'):
        return ''
    return hex_color
|
|
|
|
|
|
def _drop_zero_border_width(x):
    """Return border width *x* unchanged, or '' when it is exactly the string '0'."""
    return '' if x == '0' else x


# Maps a css property name to the function that converts its (cleaned) value
# into the value written to the output css; an empty result drops the property.
LIVECARTA_STYLE_ATTRS_MAPPING = {
    # any indent other than the literal '0' becomes the configured indent
    'text-indent': lambda x: '' if x == '0' else LawCartaConfig.INDENT,
    'font-variant': lambda x: x,
    'text-align': lambda x: x,
    # the shorthand 'font' property is always dropped
    'font': lambda x: '',
    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()),
    'font-size': convert_font_size,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border-top-width': _drop_zero_border_width,
    'border-right-width': _drop_zero_border_width,
    'border-left-width': _drop_zero_border_width,
    'border-bottom-width': _drop_zero_border_width,
}
|
|
|
|
"""
|
|
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
|
"""
|
|
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
|
('font-weight', 'bold'): 'strong',
|
|
('font-weight', '600'): 'strong',
|
|
('font-weight', '700'): 'strong',
|
|
('font-weight', '800'): 'strong',
|
|
('font-weight', '900'): 'strong',
|
|
('font-style', 'italic'): 'i',
|
|
('text-decoration', 'underline'): 'u',
|
|
('text-decoration', 'line-through'): 's',
|
|
('text-decoration-line', 'underline'): 'u',
|
|
('text-decoration-line', 'line-through'): 's',
|
|
('vertical-align', 'super'): 'sup',
|
|
}
|
|
|
|
|
|
def check_style_to_be_tag(style) -> List[tuple]:
    """Collect the (property, value) pairs of *style* that are converted to tags.

    A pair from LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG is reported when the text
    ``property:value`` occurs anywhere in the style string.  The returned list
    (in table order) names the properties to be removed from the style string.
    """
    return [
        pair for pair in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if f'{pair[0]}:{pair[1]}' in style
    ]
|
|
|
|
|
|
def update_property_to_livecarta_convention(rule, property_):
    """Rewrite or remove one css property of *rule* in place.

    :param rule: css style rule whose ``style`` declaration is modified.
    :param property_: the property of that rule being processed; only its
        ``name`` and ``value`` are read.

    The property is set to '' when its name is not in LIVECARTA_STYLE_ATTRS,
    or when that table restricts the property to a list of values and the
    value (with double quotes stripped) is not one of them.  Otherwise, if
    LIVECARTA_STYLE_ATTRS_MAPPING has a function for the property, the value
    is replaced by that function's result.
    """
    name = property_.name
    if name not in LIVECARTA_STYLE_ATTRS:
        # unknown property: blank it out of the css
        rule.style[name] = ''
        return

    cleaned_value = property_.value.replace('\"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS[name]
    if allowed_values and cleaned_value not in allowed_values:
        # known property, but this value is not one of the allowed ones
        rule.style[name] = ''
        return

    converter = LIVECARTA_STYLE_ATTRS_MAPPING.get(name)
    if converter is not None:
        rule.style[name] = converter(cleaned_value)
|
|
|
|
|
|
def clean_css(css):
    """Parse *css* and return it with every style rule adapted to the livecarta convention.

    :param css: css source passed to ``cssutils.parseString`` (validation disabled).
    :return: the serialized stylesheet as ``str``.

    Every property of every style rule is passed to
    ``update_property_to_livecarta_convention``; rules of other types are
    left untouched.
    """
    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        for property_ in rule.style:
            update_property_to_livecarta_convention(rule, property_)

    # Use the public ``cssText`` property (a byte string) rather than the
    # private ``_getCssText`` accessor behind it.
    return sheet.cssText.decode()
|
|
|
|
|
|
def _remove_white_if_no_bgcolor(style_, tag):
    """Drop a white text colour from *style_* unless an ancestor of *tag* sets a background.

    Returns *style_* unchanged when it mentions 'background' itself, when it
    has no white text colour, or when some ancestor's style mentions
    'background'.  Otherwise every descendant whose style mentions
    'background' gets a white text colour appended, and the white colour is
    removed from the returned style.
    """
    if 'background' in style_:
        return style_

    # if text color is white, check that we have bg-color
    if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_):
        # if bg color is inherited, just return style as is
        for parent_tag in tag.parents:
            # white bg color does not need to be checked as 'white bg color' is never written
            parent_style = parent_tag.attrs.get('style')
            if parent_style and ('background' in parent_style):
                return style_

        for child in tag.find_all():
            child_style = child.attrs.get('style')
            if child_style and ('background' in child_style):
                child.attrs['style'] = child_style + '; color:#fff; '

        # children with a bg color received the white text color, so this tag does not need it
        style_ = style_.replace('color:#fff;', '')
        style_ = style_.replace('color:#ffffff;', '')
        style_ = style_.replace('color:white;', '')
    return style_


def _wrap_p_if_bg_color(t):
    """Turn a <p> whose style mentions 'background' into a <span> wrapped in a new <p>."""
    # ``or ''`` guards against a missing style attribute
    if (t.name == 'p') and ('background' in (t.attrs.get('style') or '')):
        t.name = 'span'
        t.wrap(BeautifulSoup(features='lxml').new_tag('p'))


def add_inline_style_to_html_soup(soup1, css_text):
    """Apply *css_text* to the tags of *soup1* as inline styles / formatting tags.

    :param soup1: BeautifulSoup document; it is modified in place.
    :param css_text: css passed to premailer's ``transform``.
    :return: the same *soup1* object.

    p, span, li, ul, ol, td, th and h1-h9 tags receive a temporary
    ``livecarta_id`` attribute, the document is inlined with premailer and
    re-parsed, and the computed style of each marked tag is copied back.
    Property/value pairs listed in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG are
    removed from the style and expressed as nested tags instead, wrapped in a
    tag carrying the original name and attributes.  The temporary attribute
    is removed from every tag before returning.
    """
    h_regex = '(^h[1-9]$)'
    could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
    elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
    for tmp_id, element in enumerate(elements_with_possible_style_attr):
        element.attrs['livecarta_id'] = tmp_id

    html_with_inline_style = transform(str(soup1), css_text=css_text,
                                       remove_classes=False,
                                       external_styles=False,
                                       disable_validation=True)
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    # Index the inlined document once instead of searching it for every id.
    styled_by_id = {}
    for styled_tag in soup2.find_all(attrs={'livecarta_id': True}):
        styled_by_id.setdefault(str(styled_tag.attrs['livecarta_id']), styled_tag)

    # The tags found above are reused directly; renaming or wrapping them
    # later does not invalidate the references.
    for tmp_id, tag in enumerate(elements_with_possible_style_attr):
        tag_initial_name = tag.name
        tag_with_style = styled_by_id.get(str(tmp_id))
        inlined_style = tag_with_style.attrs.get('style') if tag_with_style is not None else None
        if not inlined_style:
            del tag.attrs['livecarta_id']
            continue

        style = inlined_style + ';'
        style = _remove_white_if_no_bgcolor(style, tag_with_style)
        style = style.replace('background:', 'background-color:')
        to_remove = check_style_to_be_tag(style)

        if not to_remove:
            tag.attrs['style'] = style
            del tag.attrs['livecarta_id']
            _wrap_p_if_bg_color(tag)
            continue

        # The first pair renames the tag itself; every further pair wraps the
        # current outermost tag in a new formatting tag.
        outermost = tag
        for position, (p, v) in enumerate(to_remove):
            style = style.replace(f'{p}:{v};', '')
            formatting_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)]
            if position == 0:
                tag.name = formatting_name
            else:
                wrapper = BeautifulSoup(features='lxml').new_tag(formatting_name)
                outermost.wrap(wrapper)
                outermost = wrapper

        # Move the original attributes (and what is left of the style) onto a
        # wrapper that carries the tag's original name.
        style = style.strip()
        original_attrs = tag.attrs.copy()
        tag.attrs = {}

        container = BeautifulSoup(features='lxml').new_tag(tag_initial_name)
        container.attrs = original_attrs
        if style:
            container.attrs['style'] = style
        del container.attrs['livecarta_id']

        outermost.wrap(container)

    return soup1
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run: clean the book's css, inline it into one chapter and
    # print the resulting html.
    import sys

    # The epub path may be given as the first command-line argument; the
    # previous hard-coded location is kept as the default.
    default_epub_path = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
    epub_path = sys.argv[1] if len(sys.argv) > 1 else default_epub_path

    ebooklib_book = epub.read_epub(epub_path)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    css_cleaned = clean_css(css_)
    html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

    print(add_inline_style_to_html_soup(html_soup, css_cleaned))
|