This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/css_reader.py
2021-07-09 10:35:31 +03:00

386 lines
15 KiB
Python

import re
from typing import List
import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform
from itertools import takewhile
from logging import CRITICAL
from livecarta_config import LawCartaConfig
from util.color_reader import str2hex
cssutils.log.setLevel(CRITICAL)
# Relative font-size thresholds (em units, or percent / 100) in ascending order.
# Entry ``i`` is paired with ``sizes_px[i]``. The leading -1 is a sentinel so that
# every ratio greater than -1 has at least one threshold strictly below it.
sizes_pr = [
    -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
    1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
    2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0,
]

# Pixel sizes paired index-by-index with ``sizes_pr``.
sizes_px = [
    '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
    '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
    '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
    '48px', '49px', '50px', '64px', '72px',
]

# Values of ``list-style-type``; only referenced by the commented-out
# 'list-style-type' mapping further down in this file.
list_types = [
    'circle', 'disc', 'armenian', 'decimal',
    'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
    'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none',
]


def convert_font_size(value):
    """Convert a CSS ``font-size`` value to the pixel form used in the output styles.

    :param value: font-size value as a string, e.g. ``'12pt'``, ``'150%'``, ``'1.5em'``.
    :return:
        * ``''`` for the default size (``LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE``
          points, or ``'100%'``), for ratios above 5, for ratios of -1 or less,
          for unsupported units and for values whose number cannot be parsed;
        * the value with ``pt`` replaced by ``px`` for any other point size;
        * for ``%`` / ``em`` values, the ``sizes_px`` entry paired with the largest
          ``sizes_pr`` threshold that is strictly smaller than the ratio.
    """
    if 'pt' in value:
        try:
            # float() rather than int() so fractional sizes such as '10.5pt' do not raise.
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            ratio = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            ratio = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if ratio > 5:
        return ''

    smaller_thresholds = list(takewhile(lambda threshold: ratio > threshold, sizes_pr))
    if not smaller_thresholds:
        # Ratio of -1 or less (or NaN): no threshold lies below it, nothing to map to.
        return ''
    # ``smaller_thresholds`` is a prefix of ``sizes_pr``, so its length gives the index directly.
    return sizes_px[len(smaller_thresholds) - 1]
# LIVECARTA_STYLE_ATTRS = {css property: [accepted values]}
#
# CSS properties that can be carried over into the LiveCarta style convention.
# * An empty list means every value of the property is accepted.
# * A non-empty list means only the listed values are accepted; any other
#   property/value combination is dropped.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': [
        align_style
        for align_style in LawCartaConfig.ALIGN_STYLES
        if align_style != LawCartaConfig.DEFAULT_ALIGN_STYLE
    ],
    'align': [],  # ???
    'font': [],  # ???
    'font-family': [
        font_name
        for font_name in LawCartaConfig.font_correspondence_table
        if font_name != LawCartaConfig.DEFAULT_FONT_NAME
    ],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border': [],
    # 'list-style-type': [],
    # 'list-style-image': []
}
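"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Each mapping function receives the cleaned property value and returns the value to store for that property.
Warning: whenever LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING must be updated as well
so that it keeps following the LiveCarta style convention.
"""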
def get_bg_color(x):
    """Return ``str2hex(x)``, or ``''`` when the result denotes white.

    White is recognised as ``'#ffffff'``, ``'#fff'`` or ``'white'``.
    """
    hex_color = str2hex(x)
    if hex_color in ['#ffffff', '#fff', 'white']:
        return ''
    return hex_color
def get_text_color(x):
    """Return ``str2hex(x)``, or ``''`` when the result denotes black.

    Black is recognised as ``'#000000'``, ``'#000'`` or ``'black'``.
    """
    hex_color = str2hex(x)
    if hex_color in ['#000000', '#000', 'black']:
        return ''
    return hex_color
def _keep_value(x):
    """Return the value unchanged."""
    return x


def _drop_zero(x):
    """Return the value unchanged, except that ``'0'`` becomes ``''``."""
    if x == '0':
        return ''
    return x


# property name -> function(cleaned value) -> value written back to the rule
LIVECARTA_STYLE_ATTRS_MAPPING = {
    # any indent other than '0' is replaced by the configured indent
    'text-indent': lambda x: LawCartaConfig.INDENT if x != '0' else '',
    'font-variant': _keep_value,
    'text-align': _keep_value,
    # the 'font' shorthand always maps to an empty value
    'font': lambda x: '',
    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()),
    'font-size': convert_font_size,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _drop_zero,
    'border-top-width': _drop_zero,
    'border-right-width': _drop_zero,
    'border-left-width': _drop_zero,
    'border-bottom-width': _drop_zero,
    # 'list-style-type': lambda x: x if x in list_types else 'disc',
    # 'list-style-image': lambda x: 'disc'
}
# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {(property, value): tag}
#
# Property/value pairs that are expressed as an HTML tag instead of a style
# declaration. Insertion order is kept as written because the table is
# iterated in order when the tags are collected.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    **{
        ('font-weight', weight): 'strong'
        for weight in ('bold', '600', '700', '800', '900')
    },
    ('font-style', 'italic'): 'i',
    ('text-decoration', 'underline'): 'u',
    ('text-decoration', 'line-through'): 's',
    ('text-decoration-line', 'underline'): 'u',
    ('text-decoration-line', 'line-through'): 's',
    ('vertical-align', 'super'): 'sup',
}
def check_style_to_be_tag(style) -> List[tuple]:
    """Find the style declarations that are expressed as tags instead.

    :param style: inline style string.
    :return: every ``(property, value)`` key of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        whose ``property:value`` text occurs in ``style``, in table order.
        These are the declarations to be removed from the style string.
    """
    return [
        key
        for key in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if f'{key[0]}:{key[1]}' in style
    ]
def update_property_to_livecarta_convention(rule, property_):
    """Rewrite one declaration of ``rule`` according to the LiveCarta tables.

    :param rule: style rule whose ``style`` mapping is updated in place.
    :param property_: the declaration being processed (``name`` and ``value`` are read).
    """
    name = property_.name

    if name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        rule.style[name] = ''
        return

    # double quotes are stripped before the value is checked and converted
    cleaned_value = property_.value.replace('"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS[name]

    if allowed_values and cleaned_value not in allowed_values:
        # property + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        rule.style[name] = ''
        return

    # properties without a mapping function keep their value untouched
    if name in LIVECARTA_STYLE_ATTRS_MAPPING:
        convert = LIVECARTA_STYLE_ATTRS_MAPPING[name]
        rule.style[name] = convert(cleaned_value)
def clean_css(css):
    """Parse a stylesheet and rewrite every declaration to the LiveCarta convention.

    Only style rules are processed; every other rule type is left as parsed.

    :param css: stylesheet source text.
    :return: the serialized stylesheet as ``str``.
    """
    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        # Snapshot the declarations first: the helper rewrites ``rule.style``
        # while we are walking it.
        for property_ in list(rule.style):
            update_property_to_livecarta_convention(rule, property_)
    # ``cssText`` is the public accessor for the serialized (bytes) stylesheet.
    return sheet.cssText.decode()
class TagStyleConverter:
    """Apply the inline style computed for a tag to the tag in the source soup.

    ``tag`` is the element that gets modified; ``tag_with_style`` is the same
    element in the copy of the document whose CSS was inlined, and is only read
    for its ``style`` attribute (its descendants may get a text colour added,
    see :meth:`remove_white_if_no_bgcolor`).
    """

    # Declarations that may stay on the wrapping tag; compiled once for all calls.
    _TEXT_ALIGN_RE = re.compile(r'(text-align:(\w+);)')
    _TEXT_INDENT_RE = re.compile(r'(text-indent:(\w+);)')
    _LIST_STYLE_TYPE_RE = re.compile(r'(list-style-type:(\w+);)')

    def __init__(self, tag, tag_with_style):
        self.tag = tag
        # remembered because ``tag.name`` may be replaced by strong/i/u/... later
        self.tag_initial_name = tag.name
        self.tag_with_style = tag_with_style
        self.style = self.preprocess_style()

    @staticmethod
    def _move_declarations(style, patterns):
        """Cut the first declaration matched by each pattern out of ``style``.

        :param style: inline style string.
        :param patterns: compiled regexes whose group 1 is a whole
            ``property:value;`` declaration.
        :return: ``(moved, remaining)`` - the matched declarations concatenated
            in pattern order, and ``style`` with those declarations removed.
        """
        moved = ''
        for pattern in patterns:
            found = pattern.search(style)
            if found and found.group(1):
                moved += found.group(1)
                style = style.replace(found.group(1), '')
        return moved, style

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """Drop a white text colour from ``style_`` unless a background accompanies it.

        * If ``style_`` mentions a background, it is returned unchanged.
        * If the text is white and a parent span/td/tr/p carries a background
          in its style, ``style_`` is returned unchanged.
        * Otherwise every descendant whose style mentions a background gets
          ``color:#fff`` appended, and the white colour is removed from ``style_``.

        :param style_: inline style string of ``tag``.
        :param tag: the styled element; its parents and descendants are inspected.
        :return: the possibly shortened style string.
        """
        if 'background' in style_:
            return style_

        # if text color is white, check that we have bg-color
        white_text = ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_)
        if not white_text:
            return style_

        # white bg color not need to be checked as we do not write 'white bg color'
        tag_with_bg = ['span', 'td', 'tr', 'p']
        for parent_tag in tag.parents:
            tag_will_be_saved = parent_tag.name in tag_with_bg
            has_bg = parent_tag.attrs.get('style') and ('background' in parent_tag.attrs.get('style'))
            # if bg color is inherited, just return style as is
            if has_bg and tag_will_be_saved:
                return style_

        for child in tag.find_all():
            if child.attrs.get('style') and ('background' in child.attrs.get('style')):
                child.attrs['style'] = child.attrs['style'] + '; color:#fff; '

        # for child with bg color we added white text color, so this tag don't need white color
        for white_declaration in ('color:#fff;', 'color:#ffffff;', 'color:white;'):
            style_ = style_.replace(white_declaration, '')
        return style_

    def preprocess_style(self):
        """Return the inlined style of ``tag_with_style``, normalised for later matching.

        A trailing ``;`` is appended so that every declaration ends with one,
        unsupported white text is removed, and ``background:`` is renamed to
        ``background-color:``.
        """
        style = self.tag_with_style.attrs.get('style') + ';'
        style = self.remove_white_if_no_bgcolor(style, self.tag_with_style)
        style = style.replace('background:', 'background-color:')
        return style

    def change_attrs_with_corresponding_tags(self):
        """Turn tag-like declarations (bold, italic, ...) into nested tags.

        The first matching declaration renames ``self.tag`` itself; every
        further one wraps the previous tag. When at least one tag was created,
        a new tag with the initial name and a copy of the attributes is wrapped
        around the outermost one and receives the remaining style. Otherwise
        the remaining style is written to ``self.tag``.

        :return: the outermost tag.
        """
        # adds <b>, <u>, <sup>, etc
        to_remove = check_style_to_be_tag(self.style)
        new_tags = []
        for i, (p, v) in enumerate(to_remove):
            self.style = self.style.replace(f'{p}:{v};', '').strip()
            tag_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)]
            if i == 0:
                self.tag.name = tag_name
                new_tags.append(self.tag)
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(tag_name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag
        if not new_tags:
            top_tag.attrs['style'] = self.style
            return top_tag

        tmp_attrs = top_tag.attrs.copy()
        top_tag = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name)
        top_tag.attrs = tmp_attrs
        if self.style:
            top_tag.attrs['style'] = self.style
        new_tags[-1].wrap(top_tag)
        return top_tag

    @staticmethod
    def wrap_p_to_save_style_attrs(t):
        """Move styles a ``<p>`` cannot carry onto an inner ``<span>``.

        When the style of a ``<p>`` mentions any LIVECARTA_STYLE_ATTRS property
        other than text-align / text-indent, the tag is renamed to ``span`` and
        wrapped in a new ``<p>``. The ``text-align`` and ``text-indent``
        declarations (values made of word characters only) move to the new
        ``<p>``; everything else stays on the span.
        """
        styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                               if attr not in ['text-align', 'text-indent']]
        if not (t.name == 'p' and t.attrs.get('style')):
            return
        if not any(attr in t.attrs.get('style') for attr in styles_cant_be_in_p):
            return

        t.name = 'span'
        p_tag = BeautifulSoup(features='lxml').new_tag('p')
        # Each declaration is searched on its own: a single alternation regex
        # only ever reports the first of the two that occurs in the style.
        new_style, old_style = TagStyleConverter._move_declarations(
            t.attrs['style'],
            (TagStyleConverter._TEXT_ALIGN_RE, TagStyleConverter._TEXT_INDENT_RE),
        )
        p_tag.attrs['style'] = new_style
        t.attrs['style'] = old_style
        t.wrap(p_tag)

    @staticmethod
    def add_span_to_save_style_attrs_in_li(t):
        """Move styles an ``<li>`` cannot carry onto an inner ``<span>``.

        Same idea as :meth:`wrap_p_to_save_style_attrs`; only a ``text-align``
        declaration moves to the new ``<li>``, which gets a style attribute
        only when such a declaration was found.
        """
        styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['text-align', 'list-style-type']]
        if not (t.name == 'li' and t.attrs.get('style')):
            return
        if not any(attr in t.attrs.get('style') for attr in styles_cant_be_in_li):
            return

        t.name = 'span'
        li_tag = BeautifulSoup(features='lxml').new_tag('li')
        new_style, old_style = TagStyleConverter._move_declarations(
            t.attrs['style'], (TagStyleConverter._TEXT_ALIGN_RE,))
        if new_style:
            li_tag.attrs['style'] = new_style
        t.attrs['style'] = old_style
        t.wrap(li_tag)

    @staticmethod
    def add_span_to_save_style_attrs_in_ul(t):
        """Move styles a ``<ul>`` cannot carry onto an inner ``<span>``.

        Only a ``list-style-type`` declaration moves to the new ``<ul>``.
        Not called by :meth:`convert_initial_tag` (the call is commented out there).
        """
        styles_cant_be_in_ul = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
        if not (t.name == 'ul' and t.attrs.get('style')):
            return
        if not any(attr in t.attrs.get('style') for attr in styles_cant_be_in_ul):
            return

        t.name = 'span'
        ul_tag = BeautifulSoup(features='lxml').new_tag('ul')
        new_style, old_style = TagStyleConverter._move_declarations(
            t.attrs['style'], (TagStyleConverter._LIST_STYLE_TYPE_RE,))
        if new_style:
            ul_tag.attrs['style'] = new_style
        t.attrs['style'] = old_style
        t.wrap(ul_tag)

    @staticmethod
    def add_span_to_save_style_attrs(t):
        """Wrap a styled heading so that the style lives on an inner ``<span>``.

        The tag is renamed to ``span`` and wrapped in a new, attribute-less tag
        carrying the original name.
        """
        h_regex = '(^h[1-9]$)'
        # NOTE(review): '(^ol$)(^ul$)' has no '|' between the two groups, so that
        # alternative can never match and only h1-h9 are affected. Left unchanged
        # because enabling ol/ul would change the produced markup - confirm the
        # intent before "fixing" the pattern.
        no_style_in_livecarta_regexp = re.compile(r'(^ol$)(^ul$)|' + h_regex)
        if re.search(no_style_in_livecarta_regexp, t.name) and t.attrs.get('style'):
            new_tag = BeautifulSoup(features='lxml').new_tag(t.name)
            t.name = 'span'
            t.wrap(new_tag)

    def convert_initial_tag(self):
        """Run the whole conversion for ``self.tag`` and return the resulting tag.

        The temporary ``livecarta_id`` attribute is removed first (it must be
        present), then the tag-like declarations are converted and the
        p / li / heading wrappers are applied.
        """
        del self.tag.attrs['livecarta_id']
        self.tag = self.change_attrs_with_corresponding_tags()
        self.wrap_p_to_save_style_attrs(self.tag)
        self.add_span_to_save_style_attrs_in_li(self.tag)
        # self.add_span_to_save_style_attrs_in_ul(self.tag)
        self.add_span_to_save_style_attrs(self.tag)
        return self.tag
def add_inline_style_to_html_soup(soup1, css_text):
    """Inline ``css_text`` into the document and convert the result in ``soup1``.

    Every p/span/li/ul/ol/td/th/h1-h9 element of ``soup1`` is tagged with a
    temporary ``livecarta_id`` attribute, the CSS is inlined into a serialized
    copy of the document, and each element whose copy received a ``style``
    attribute is converted by ``TagStyleConverter``. The temporary attribute
    is removed from every element again.

    :param soup1: parsed document; modified in place.
    :param css_text: stylesheet text to inline.
    :return: ``soup1``.
    """
    css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')

    h_regex = '(^h[1-9]$)'
    could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
    elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
    for i, element in enumerate(elements_with_possible_style_attr):
        element.attrs['livecarta_id'] = i

    html_with_inline_style = transform(str(soup1), css_text=css_text,
                                       remove_classes=False,
                                       external_styles=False,
                                       allow_network=False,
                                       disable_validation=True)
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    # Index the styled copy once instead of searching the whole tree per id.
    # Ids come back as strings after the serialize/parse round trip.
    styled_by_id = {}
    for styled in soup2.find_all(attrs={'livecarta_id': True}):
        styled_by_id.setdefault(styled.attrs['livecarta_id'], styled)

    # The elements collected above are the very objects carrying the ids,
    # so there is no need to look them up in ``soup1`` again.
    for i, tag in enumerate(elements_with_possible_style_attr):
        tag_with_style = styled_by_id.get(str(i))
        if tag_with_style is not None and tag_with_style.attrs.get('style'):
            TagStyleConverter(tag, tag_with_style).convert_initial_tag()
        else:
            # no style was inlined (or the element is missing from the copy):
            # only the temporary marker has to go
            del tag.attrs['livecarta_id']

    return soup1
if __name__ == '__main__':
    # Manual smoke test: clean the stylesheet of an EPUB, inline it into one
    # chapter and print the converted markup.
    # Usage: python css_reader.py [path/to/book.epub]
    import sys

    default_file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
    # the path may be given on the command line; the original location is the fallback
    file = sys.argv[1] if len(sys.argv) > 1 else default_file

    ebooklib_book = epub.read_epub(file)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    css_cleaned = clean_css(css_)
    html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')
    print(add_inline_style_to_html_soup(html_soup, css_cleaned))