# This repository was archived on 2026-04-06. You can view and clone its files, but you cannot open issues or pull requests or push commits.
# Files
# BookConverter/src/css_reader.py
# 2021-05-28 19:49:59 +03:00

# 244 lines
# 9.1 KiB
# Python

import re
from typing import List
import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform
from itertools import takewhile
from logging import CRITICAL
from livecarta_config import LawCartaConfig
from util.color_reader import str2color_name
# Raise the threshold of cssutils' logger so that only CRITICAL messages are emitted.
cssutils.log.setLevel(CRITICAL)
# Lookup tables used by convert_font_size(): sizes_pr[i] is a relative font size
# (an ``em`` value, or a percentage divided by 100) and sizes_px[i] is the pixel
# size returned for it. Both lists must stay the same length and sizes_pr must
# stay sorted in ascending order.
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
            1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
            2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
            '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
            '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
            '48px', '49px', '50px', '64px', '72px']


def convert_font_size(value):
    """Convert a CSS ``font-size`` value to a pixel size string.

    * ``pt`` values: an empty string when the number equals
      ``LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE``, otherwise the same text
      with ``pt`` replaced by ``px``. Fractional values such as ``10.5pt`` are
      accepted.
    * ``%`` and ``em`` values: the ``sizes_px`` entry paired with the largest
      ``sizes_pr`` entry that does not exceed the relative size, so a value
      that is exactly in the table maps to its own pixel size
      (``1em`` -> ``18px``, ``5em`` -> ``72px``).
    * ``100%``, relative sizes above 5 or below the first table entry,
      unsupported units and unparsable numbers: an empty string.

    :param value: the CSS value as a string, e.g. ``'120%'`` or ``'1.5em'``.
    :return: a pixel size such as ``'26px'``, or ``''`` when nothing should be set.
    """
    if 'pt' in value:
        # Parse as float inside a try so fractional or malformed values cannot
        # escape as an uncaught ValueError.
        try:
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            ratio = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            ratio = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if ratio > 5:
        return ''

    # Count the table entries that are <= ratio; the last of them is the match.
    # ``>=`` (not ``>``) keeps exact table values on their own row.
    matched = sum(1 for _ in takewhile(lambda bound: ratio >= bound, sizes_pr))
    if not matched:
        # The ratio is below the smallest table entry: nothing sensible to map to.
        return ''
    return sizes_px[matched - 1]
# LIVECARTA_STYLE_ATTRS = { css property: allowed values }
#
# CSS properties that can be carried over to the livecarta style convention.
#   * an empty list     -> any value of the property can be converted;
#   * a non-empty list  -> only the listed values of the property can be converted.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    # every alignment except the default one
    'text-align': [
        alignment
        for alignment in LawCartaConfig.ALIGN_STYLES
        if alignment != LawCartaConfig.DEFAULT_ALIGN_STYLE
    ],
    'align': [],  # ???
    'font': [],  # ???
    # every known font except the default one
    'font-family': [
        font_name
        for font_name in LawCartaConfig.font_correspondence_table
        if font_name != LawCartaConfig.DEFAULT_FONT_NAME
    ],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border': [],
}
# LIVECARTA_STYLE_ATTRS_MAPPING = { css property: mapping function }
#
# Each function receives the property's value and returns the value to store
# instead. Warning: whenever LIVECARTA_STYLE_ATTRS is changed, this table should
# be updated too so that it keeps following the livecarta style convention.


def _unchanged(value):
    """Return the value as is."""
    return value


def _livecarta_color(value):
    """Look the colour up in the livecarta colour table; '' when it is unknown."""
    return LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(value), '')


def _empty_if_zero(value):
    """Return '' for a '0' value, otherwise the value itself."""
    if value == '0':
        return ''
    return value


def _livecarta_indent(value):
    """Return the configured indent for any value other than '0', '' otherwise."""
    if value == '0':
        return ''
    return LawCartaConfig.INDENT


def _livecarta_font(value):
    """Look the capitalised font name up in the font correspondence table."""
    return LawCartaConfig.font_correspondence_table.get(value.capitalize())


def _always_empty(value):
    """Return '' regardless of the value."""
    return ''


LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': _livecarta_indent,
    'font-variant': _unchanged,
    'text-align': _unchanged,
    'font': _always_empty,
    'font-family': _livecarta_font,
    'font-size': convert_font_size,
    'color': _livecarta_color,
    'background-color': _livecarta_color,
    'background': _livecarta_color,
    'border-top-width': _empty_if_zero,
    'border-right-width': _empty_if_zero,
    'border-left-width': _empty_if_zero,
    'border-bottom-width': _empty_if_zero,
}
# LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (css property, value): html tag }
#
# Property/value pairs that are expressed with a dedicated tag instead of a
# style declaration. The insertion order is the order in which
# check_style_to_be_tag() reports the pairs.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    **{('font-weight', weight): 'strong' for weight in ('bold', '600', '700', '800', '900')},
    ('font-style', 'italic'): 'i',
    **{
        (decoration_property, decoration): tag_name
        for decoration_property in ('text-decoration', 'text-decoration-line')
        for decoration, tag_name in (('underline', 'u'), ('line-through', 's'))
    },
    ('vertical-align', 'super'): 'sup',
}


def check_style_to_be_tag(style) -> List[tuple]:
    """Find the declarations of a style string that have to become tags.

    A pair from LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG is reported when the text
    ``property:value`` (no whitespace around the colon) occurs anywhere in
    ``style``.

    :param style: content of a ``style`` attribute.
    :return: the matching ``(property, value)`` pairs, i.e. the declarations
        that should be removed from the style string.
    """
    return [
        pair
        for pair in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if f'{pair[0]}:{pair[1]}' in style
    ]
def update_property_to_livecarta_convention(rule, property_):
    """Rewrite or drop one declaration of ``rule`` following LIVECARTA_STYLE_ATTRS.

    * A property that is not listed in LIVECARTA_STYLE_ATTRS is set to ``''``.
    * A property with a list of allowed values is set to ``''`` unless its value
      (with double quotes stripped) is in that list.
    * A surviving value is passed through the function registered for the
      property in LIVECARTA_STYLE_ATTRS_MAPPING, if there is one; otherwise the
      declaration is left as it is.

    :param rule: object whose ``style`` mapping is updated in place.
    :param property_: the declaration to process; its ``name`` and ``value``
        are read.
    """
    name = property_.name

    if name not in LIVECARTA_STYLE_ATTRS:
        # unknown property: remove it from the css
        rule.style[name] = ''
        return

    allowed_values = LIVECARTA_STYLE_ATTRS.get(name)
    if allowed_values:
        # only certain values are supported for this property
        value = property_.value.replace('\"', '')
        if value not in allowed_values:
            # unsupported property/value combination: remove it from the css
            rule.style[name] = ''
            return
        if name in LIVECARTA_STYLE_ATTRS_MAPPING:
            rule.style[name] = LIVECARTA_STYLE_ATTRS_MAPPING[name](value)
        return

    # any value is supported: just convert it when a converter is registered
    if name in LIVECARTA_STYLE_ATTRS_MAPPING:
        convert = LIVECARTA_STYLE_ATTRS_MAPPING[name]
        rule.style[name] = convert(property_.value.replace('\"', ''))
def clean_css(css):
    """Return ``css`` with every declaration adapted to the livecarta convention.

    The stylesheet is parsed with cssutils (validation disabled) and each
    declaration of each style rule yielded by iterating the sheet is passed to
    update_property_to_livecarta_convention(). Rules of any other type are
    left as parsed.

    :param css: stylesheet source text.
    :return: the serialized, cleaned stylesheet as ``str``.
    """
    sheet = cssutils.parseString(css, validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        # Take a snapshot of the declarations first: the helper assigns to
        # rule.style while we are walking over it.
        for property_ in list(rule.style):
            update_property_to_livecarta_convention(rule, property_)
    # cssText is the public accessor for the serialized sheet (bytes); decode it
    # with the sheet's own encoding rather than assuming UTF-8.
    return sheet.cssText.decode(sheet.encoding)
def add_inline_style_to_html_soup(soup1, css_text):
    """Inline ``css_text`` into the tags of ``soup1`` and turn tag-like styles into tags.

    Only ``p``, ``span``, ``li``, ``ul``, ``ol``, ``td`` and the heading levels
    from ``LawCartaConfig.SUPPORTED_LEVELS + 1`` up to ``h9`` are processed.
    For each of them the style computed by premailer is taken and then:

    * if it holds declarations listed in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG,
      the tag itself is renamed to the first matching tag (e.g. ``strong``),
      wrapped into one more tag per further match, and finally wrapped into a
      new tag that has the original name, the original attributes and the
      remaining style;
    * otherwise the style is stored in the tag's ``style`` attribute, and a
      ``p`` whose style mentions ``background-color`` is renamed to ``span``.

    A tag for which premailer produced no style (or that cannot be found in
    premailer's output) is left unchanged.

    :param soup1: parsed html document; it is modified in place.
    :param css_text: css to apply, passed to premailer's ``transform``.
    :return: ``soup1``.
    """
    h_regex = f'(^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$)'
    tag_name_regex = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|' + h_regex)

    # Mark every candidate with a temporary id: it is the only way to pair a tag
    # with its counterpart in the document produced by premailer.
    marked_tags = list(soup1.find_all(tag_name_regex))
    for tmp_id, marked_tag in enumerate(marked_tags):
        marked_tag.attrs['livecarta_id'] = tmp_id

    html_with_inline_style = transform(str(soup1), css_text=css_text,
                                       remove_classes=False,
                                       external_styles=False,
                                       disable_validation=True)
    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')

    # Index the styled copies once instead of searching the whole tree per id.
    # After the round trip through text the ids are strings; the first tag
    # carrying an id wins.
    styled_tags = {}
    for styled_tag in soup2.find_all(attrs={'livecarta_id': True}):
        styled_tags.setdefault(styled_tag.attrs['livecarta_id'], styled_tag)

    for tmp_id, tag in enumerate(marked_tags):
        tag_initial_name = tag.name
        tag_with_style = styled_tags.get(str(tmp_id))
        inline_style = tag_with_style.attrs.get('style') if tag_with_style is not None else None
        if not inline_style:
            # nothing to apply, just drop the temporary marker
            del tag.attrs['livecarta_id']
            continue

        # The trailing ';' lets every declaration be removed as 'property:value;'.
        style = inline_style + ';'
        style = style.replace('background:', 'background-color:')
        to_remove = check_style_to_be_tag(style)

        if not to_remove:
            tag.attrs['style'] = style
            del tag.attrs['livecarta_id']
            if tag.name == 'p' and 'background-color' in style:
                tag.name = 'span'
            continue

        # Build the chain of formatting tags from the inside out: the original
        # tag becomes the innermost one, every further match wraps the previous.
        outermost = tag
        for position, (p, v) in enumerate(to_remove):
            style = style.replace(f'{p}:{v};', '')
            replacement_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)]
            if position == 0:
                tag.name = replacement_name
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(replacement_name)
                outermost.wrap(new_tag)
                outermost = new_tag

        # Recreate the original element around the chain; it takes over the
        # attributes (minus the temporary id) and whatever style is left.
        style = style.strip()
        container = BeautifulSoup(features='lxml').new_tag(tag_initial_name)
        container.attrs = tag.attrs.copy()
        tag.attrs = {}
        if style:
            container.attrs['style'] = style
        del container.attrs['livecarta_id']
        outermost.wrap(container)

    return soup1
if __name__ == '__main__':
    # Manual check: clean the stylesheet of an epub and print one of its pages
    # with the styles inlined.
    # Usage: css_reader.py [EPUB_PATH [CSS_HREF [HTML_HREF]]]
    # Every argument is optional and falls back to the value used during development.
    import sys

    cli_args = sys.argv[1:]
    epub_path = cli_args[0] if len(cli_args) > 0 else \
        '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
    css_href = cli_args[1] if len(cli_args) > 1 else 'css/epub.css'
    html_href = cli_args[2] if len(cli_args) > 2 else 'pr01s05.xhtml'

    ebooklib_book = epub.read_epub(epub_path)
    css_ = ebooklib_book.get_item_with_href(css_href).get_content().decode()
    css_cleaned = clean_css(css_)
    html_ = ebooklib_book.get_item_with_href(html_href).get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')
    print(add_inline_style_to_html_soup(html_soup, css_cleaned))