This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/epub_converter/css_reader.py
2021-11-03 12:23:13 +03:00

537 lines
22 KiB
Python

import re
import cssutils
from typing import List
from ebooklib import epub
from logging import CRITICAL
from bs4 import BeautifulSoup
from premailer import transform
from itertools import takewhile
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL)
# Relative font sizes (em multipliers, or percent / 100) in ascending order.
# The leading -1 guarantees that convert_font_size() finds at least one
# smaller entry for every size above -1.
sizes_pr = [
    -1,
    0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94,
    1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44,
    1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94,
    2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44,
    2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94,
    3.0, 4.0, 5.0,
]

# Pixel sizes paired index-by-index with sizes_pr:
# '10px' for the sentinel, then 10px..50px in 1px steps, then 64px and 72px.
sizes_px = ['10px'] + [f'{px}px' for px in range(10, 51)] + ['64px', '72px']

# list-style-type values that are kept as they are
# (LIVECARTA_STYLE_ATTRS_MAPPING replaces every other value with 'disc').
list_types = [
    'circle',
    'disc',
    'armenian',
    'decimal',
    'decimal-leading-zero',
    'georgian',
    'lower-alpha',
    'lower-latin',
    'lower-roman',
    'upper-alpha',
    'upper-latin',
    'upper-roman',
    'none',
]
def convert_font_size(value):
    """Convert a css font-size value into the value LiveCarta should store.

    :param value: font-size as written in css, e.g. '12pt', '120%' or '1.5em'
    :return: a px size string, or '' when no size should be written
             (default size, '100%', unsupported unit, unparsable number,
             or a relative size outside the sizes_pr table)

    'pt' values keep their number and only get the unit renamed to 'px'.
    '%' and 'em' values are mapped through the sizes_pr / sizes_px tables.
    """
    if 'pt' in value:
        try:
            # float() instead of int(): fractional sizes such as '10.5pt' are
            # valid css and used to raise an uncaught ValueError here
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            size = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            size = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if size > 5:
        return ''

    # entries of the ascending table that are strictly smaller than the size
    smaller_sizes = list(takewhile(lambda step: size > step, sizes_pr))
    if not smaller_sizes:
        # size is -1 or below (or NaN): nothing in the table fits it
        return ''
    # sizes_pr and sizes_px are paired by index
    return sizes_px[len(smaller_sizes) - 1]
# Pixels produced per unit of the source length. These are the factors the
# converter has always used; 'rem' is treated like 'em'.
_INDENT_PX_PER_UNIT = {'%': 6, 'em': 30, 'rem': 30, 'pt': 1}

# A (possibly signed, possibly fractional) number directly followed by one of
# the units above. The number is captured as a whole so '3.2%' is read as 3.2
# and not as '3.' followed by '2%'.
_INDENT_LENGTH_RE = re.compile(r'-*(\d+(?:\.\d*)?|\.\d+)(%|rem|em|pt)(?!\w)')


def convert_indents(value):
    """Rewrite %, em, rem and pt lengths inside an indent value as whole pixels.

    :param value: css value of text-indent / margin-left, e.g. '1.5em' or '-10pt'
    :return: the value with every such length replaced by '<n>px'

    The sign is discarded, so '-2em' and '2em' both give '60px'.
    Anything that is not a number with one of the handled units
    ('30px', 'auto', '0', ...) is returned unchanged.
    """
    # original note on the intended scale: 30px = 3.2% = 1.25em = 23pt

    def to_px(match):
        pixels = round(float(match.group(1)) * _INDENT_PX_PER_UNIT[match.group(2)])
        return f'{pixels}px'

    return _INDENT_LENGTH_RE.sub(to_px, value)
"""
LIVECARTA_STYLE_ATTRS = { css property: [allowed values] }
Style properties that can be carried over into the LiveCarta css style convention.
An empty list means that any value of the property can be converted.
A non-empty list means that only the listed values of the property can be converted.
"""
# text-align values other than the default one (the default is not listed)
_non_default_alignments = [
    alignment for alignment in LiveCartaConfig.ALIGN_STYLES
    if alignment != LiveCartaConfig.DEFAULT_ALIGN_STYLE
]

# font names of the correspondence table other than the default font
_non_default_fonts = [
    font_name for font_name in LiveCartaConfig.font_correspondence_table.keys()
    if font_name != LiveCartaConfig.DEFAULT_FONT_NAME
]

# property -> list of accepted values; an empty list accepts every value
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': _non_default_alignments,
    'align': [],
    'font': [],
    'font-family': _non_default_fonts,
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border-top': [],
    'border-bottom': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
}
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning: if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING must be updated as well
so that it keeps following the LiveCarta style convention.
"""
def get_bg_color(x):
    """Return str2hex(x), or '' when that result is one of the spellings of white.

    A white background is not written at all, so it is reported as ''.
    """
    background = str2hex(x)
    if background in ['#ffffff', '#fff', 'white']:
        return ''
    return background
def get_text_color(x):
    """Return str2hex(x), or '' when that result is one of the spellings of black.

    Black text is not written at all, so it is reported as ''.
    """
    text_color = str2hex(x)
    if text_color in ['#000000', '#000', 'black']:
        return ''
    return text_color
def _keep_value(x):
    """Mapping for properties whose value is stored unchanged."""
    return x


def _blank_if_zero(x):
    """Mapping for border properties: a plain '0' means 'no border' and is dropped."""
    if x == '0':
        return ''
    return x


def _livecarta_font(x):
    """Look the font up in the correspondence table, retrying with x.capitalize()."""
    fonts = LiveCartaConfig.font_correspondence_table
    return fonts.get(x) or fonts.get(x.capitalize())


def _known_list_type(x):
    """Keep list-style-type values found in list_types, fall back to 'disc' otherwise."""
    if x in list_types:
        return x
    return 'disc'


# property -> function that turns the css value into the LiveCarta value
LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_indents,
    'font-variant': _keep_value,
    'text-align': _keep_value,
    'font': lambda x: '',  # the 'font' shorthand is always dropped
    'font-family': _livecarta_font,
    'font-size': convert_font_size,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _blank_if_zero,
    'border-top-width': _blank_if_zero,
    'border-right-width': _blank_if_zero,
    'border-left-width': _blank_if_zero,
    'border-bottom-width': _blank_if_zero,
    'border-top': _blank_if_zero,
    'border-bottom': _blank_if_zero,
    'list-style-type': _known_list_type,
    'list-style-image': lambda x: 'disc',  # any bullet image becomes a plain disc
    'margin-left': convert_indents,
}
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600"> foo </p> -> <p><strong> foo </strong></p>
"""
# (property, value) -> tag that replaces the declaration.
# Key order is kept as it was: check_style_to_be_tag() walks the keys in this
# order and the order decides how the generated tags are nested.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    **{
        ('font-weight', weight): 'strong'
        for weight in ('bold', '600', '700', '800', '900')
    },
    ('font-style', 'italic'): 'i',
    **{
        (decoration_property, line): line_tag
        for decoration_property in ('text-decoration', 'text-decoration-line')
        for line, line_tag in (('underline', 'u'), ('line-through', 's'))
    },
    ('vertical-align', 'super'): 'sup',
}
def check_style_to_be_tag(style) -> List[tuple]:
    """Collect the (property, value) pairs of *style* that are expressed as tags.

    A pair is reported when the text 'property:value' occurs in the style
    string. The result follows the key order of
    LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG and lists what has to be removed from
    the style string once the tags are created.
    """
    return [
        pair for pair in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if f'{pair[0]}:{pair[1]}' in style
    ]
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
    """Drop or convert one property of a css rule, in place.

    :param css_rule: rule whose ``style`` declaration is updated
    :param style_type: the property of that rule being looked at
                       (its ``name`` and ``value`` are read)

    The property is blanked when it is not listed in LIVECARTA_STYLE_ATTRS or
    when its value is not among the values allowed for it. Otherwise the value
    is passed through the property's function in LIVECARTA_STYLE_ATTRS_MAPPING,
    if there is one.
    """
    property_name = style_type.name
    if property_name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[property_name] = ''
        return

    value = style_type.value.replace('\"', '')  # value without double quotes
    allowed_values = LIVECARTA_STYLE_ATTRS.get(property_name)
    # an empty list of allowed values means that every value is accepted
    if allowed_values and value not in allowed_values:
        # property + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[property_name] = ''
        return

    if property_name in LIVECARTA_STYLE_ATTRS_MAPPING:
        convert = LIVECARTA_STYLE_ATTRS_MAPPING[property_name]
        css_rule.style[property_name] = convert(value)
def build_css_content(css_content):
    """Parse a css text and reduce it to what the LiveCarta convention supports.

    :param css_content: css source text
    :return: the serialised css after every property of every top-level style
             rule went through update_css_style_types_to_livecarta_convention()

    Only rules of type STYLE_RULE found directly in the sheet are processed;
    other rule types are left as parsed.
    """
    sheet = cssutils.parseString(css_content, validate=False)

    for css_rule in sheet:
        if css_rule.type != css_rule.STYLE_RULE:
            continue
        # Iterate over a snapshot: the helper blanks properties of this very
        # declaration, so it must not be modified while it is being walked.
        for style_type in list(css_rule.style):
            update_css_style_types_to_livecarta_convention(css_rule, style_type)

    # public accessor of the serialised sheet (bytes), instead of _getCssText()
    return sheet.cssText.decode()
class TagStyleConverter:
    """Rewrite one tag of the original soup according to its resolved style.

    Two views of the same tag are needed: the tag from the original soup
    (which is the one being modified) and the same tag after the css file was
    inlined into its ``style`` attribute. The resolved style is normalised to
    the compact ``property:value;`` form, which is the form that
    check_style_to_be_tag() and the wrapping helpers below look for.
    """

    # first (optionally fractional) number inside a css length such as '30px'
    _NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
        # tag with inline style to be updated with style attribute
        self.tag_with_inline_style = tag_with_inline_style
        self.tag_initial_name = tag_with_inline_style.name

        # tag with inline style + style parsed from css file
        self.tag_with_ultimate_style = tag_with_ultimate_style
        self.style = self.preprocess_style()

    @staticmethod
    def _split_declarations(style: str) -> list:
        """Split a style string into 'property:value' items without extra spaces.

        Chunks without a colon, without a property name or without a value are
        skipped. Only the first colon separates name and value, so values such
        as 'url(http://...)' stay intact.
        """
        declarations = []
        for chunk in style.split(';'):
            name, colon, value = chunk.partition(':')
            name, value = name.strip(), value.strip()
            if colon and name and value:
                declarations.append(f'{name}:{value}')
        return declarations

    @staticmethod
    def _partition_style(style: str, names) -> tuple:
        """Return (declarations whose property is in *names*, all the others).

        Both parts are compact style strings ('a:b;c:d;'); document order is kept.
        """
        picked, rest = '', ''
        for declaration in TagStyleConverter._split_declarations(style):
            if declaration.partition(':')[0] in names:
                picked += declaration + ';'
            else:
                rest += declaration + ';'
        return picked, rest

    @staticmethod
    def _has_livecarta_property(style: str) -> bool:
        """Tell whether *style* declares a property listed in LIVECARTA_STYLE_ATTRS.

        Property names are compared as a whole, so 'align' is not found inside
        'text-align' and 'border' is not found inside 'border-top'.
        """
        return any(
            declaration.partition(':')[0] in LIVECARTA_STYLE_ATTRS
            for declaration in TagStyleConverter._split_declarations(style)
        )

    @staticmethod
    def _rename_properties(style: str) -> str:
        """Apply the two textual property renames used before a style is split."""
        style = style.replace('background:', 'background-color:')
        return style.replace('list-style-image', 'list-style-type')

    @staticmethod
    def _fold_indents(declarations: list) -> list:
        """Merge margin-left and text-indent of (name, value) pairs into text-indent.

        formula_of_indent: indent = abs(margin_left - text_indent), where both
        numbers are taken without their sign. A lone margin-left becomes a
        text-indent of the same size; a lone text-indent loses its sign.
        A value without any digits (e.g. 'auto') is left untouched and does not
        take part in the calculation.
        """
        def pixels_of(property_name):
            for name, value in declarations:
                if name == property_name:
                    number = TagStyleConverter._NUMBER_RE.search(value)
                    return round(float(number.group())) if number else None
            return None

        margin_left = pixels_of('margin-left')
        text_indent = pixels_of('text-indent')

        folded = []
        for name, value in declarations:
            if name == 'margin-left' and margin_left is not None:
                if text_indent is not None:
                    # already accounted for in the rewritten text-indent
                    continue
                name, value = 'text-indent', f'{margin_left}px'
            elif name == 'text-indent' and text_indent is not None:
                if margin_left is None:
                    value = f'{text_indent}px'
                else:
                    value = f'{abs(margin_left - text_indent)}px'
            folded.append((name, value))
        return folded

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """Drop white text colour from *style_* unless a background makes it visible.

        :param style_: style string of *tag*
        :param tag: tag the style belongs to; its parents and descendants are inspected
        :return: *style_*, possibly without its white 'color' declaration

        The style is returned unchanged when it mentions a background itself,
        when it has no white text colour, or when a span/td/tr/p ancestor has a
        background in its style. Otherwise white text colour is appended to the
        style of every descendant that has a background, and removed from
        *style_*.
        """
        if 'background' in style_:
            return style_

        has_white_text = ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_)
        if not has_white_text:
            return style_

        # if bg color is inherited, just return style as is
        # (white bg color need not be checked as we do not write 'white bg color')
        tags_with_bg = ['span', 'td', 'tr', 'p']
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get('style')
            if parent_style and 'background' in parent_style and parent_tag.name in tags_with_bg:
                return style_

        for child in tag.find_all():
            child_style = child.attrs.get('style')
            if child_style and 'background' in child_style:
                child.attrs['style'] = child_style + '; color:#fff; '

        # children with a bg color got the white text color, so this tag does not need it
        for white in ('color:#fff;', 'color:#ffffff;', 'color:white;'):
            style_ = style_.replace(white, '')
        return style_

    @staticmethod
    def process_indents_to_px(split_style: list) -> str:
        """Build a compact style string from declarations, with indents in pixels.

        :param split_style: list of 'property:value' strings
        :return: 'property:value;' items joined without spaces

        text-indent and margin-left values go through convert_indents() and are
        then merged into a single text-indent (see _fold_indents). Items without
        a colon, a property name or a value are skipped instead of raising.
        """
        declarations = []
        for item in split_style:
            # partition keeps colons that belong to the value, e.g. 'url(http://...)'
            name, colon, value = item.partition(':')
            name, value = name.strip(), value.strip()
            if not (colon and name and value):
                continue
            if name in ('text-indent', 'margin-left'):
                value = convert_indents(value)
            declarations.append((name, value))

        return ''.join(
            f'{name}:{value};'
            for name, value in TagStyleConverter._fold_indents(declarations)
        )

    def preprocess_style(self):
        """Compute the style string for this tag.

        The resolved style is cleaned (white text, property renames, indents)
        and normalised. Declarations of the original inline style that are not
        part of the resolved style are converted the same way and appended.
        """
        merged_style = (self.tag_with_ultimate_style.attrs.get('style') or '') + ';'
        # Taken before the white-text clean-up, so a colour that is removed on
        # purpose is not added back from the inline style below.
        already_merged = set(self._split_declarations(self._rename_properties(merged_style)))

        ultimate_style = self.remove_white_if_no_bgcolor(
            merged_style, self.tag_with_ultimate_style)
        ultimate_style = self._rename_properties(ultimate_style)
        style = self.process_indents_to_px(self._split_declarations(ultimate_style))

        inline_style = self.tag_with_inline_style.attrs.get('style')
        if inline_style:
            # repetition check - only inline declarations that are not in the
            # resolved style are added to it
            inline_only = [
                declaration
                for declaration in self._split_declarations(self._rename_properties(inline_style))
                if declaration not in already_merged
            ]
            if inline_only:
                style += self.process_indents_to_px(inline_only)
        return style

    def change_attrs_with_corresponding_tags(self):
        """Turn declarations such as font-weight:bold into tags (<strong>, <i>, ...).

        The first matching declaration renames the tag itself, every further
        one wraps it in a new tag, and finally a new tag carrying the initial
        name and attributes is wrapped around all of them. Without any match
        the tag only receives the computed style.

        :return: the original tag object (renamed when a declaration matched)
        """
        to_remove = check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            self.style = self.style.replace(f'{attr}:{value};', '').strip()
            tag_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
            if i == 0:
                self.tag_with_inline_style.name = tag_name
                new_tags.append(self.tag_with_inline_style)
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(tag_name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_with_inline_style
        if new_tags:
            outer_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            # The copied attributes still hold the style the tag had before the
            # conversion; drop it so it cannot survive when nothing is left.
            outer_attrs.pop('style', None)
            if self.style:
                outer_attrs['style'] = self.style
            outer_tag = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name)
            outer_tag.attrs = outer_attrs
            new_tags[-1].wrap(outer_tag)
        else:
            top_tag.attrs['style'] = self.style
        # NOTE(review): after a conversion this is the renamed inner tag, not the
        # new outer one, so the wrap_span_* helpers skip it - confirm this is intended.
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in p -> span.

        text-align, text-indent, border-bottom and border-top stay on the <p>.
        When any other LiveCarta property is present, the tag becomes a <span>
        holding the remaining style (without list-style-type) inside a new <p>;
        otherwise the <p> keeps only the four properties above.
        """
        if tag.name != 'p' or not tag.attrs.get('style'):
            return

        p_style, other_style = TagStyleConverter._partition_style(
            tag.attrs['style'], ('text-align', 'text-indent', 'border-bottom', 'border-top'))
        if not TagStyleConverter._has_livecarta_property(other_style):
            tag.attrs['style'] = p_style
            return

        # found styles that cannot be in <p> -> wrap them in span
        _, span_style = TagStyleConverter._partition_style(other_style, ('list-style-type',))
        tag.name = 'span'
        p_tag = BeautifulSoup(features='lxml').new_tag('p')
        p_tag.attrs['style'] = p_style
        tag.attrs['style'] = span_style
        tag.wrap(p_tag)

    @staticmethod
    def wrap_span_in_li_to_save_style_attrs(tag):
        """Keep text-align and list-style-type on <li>, move the rest to an inner <span>.

        Nothing happens unless the style holds another LiveCarta property.
        """
        if tag.name != 'li' or not tag.attrs.get('style'):
            return

        li_style, span_style = TagStyleConverter._partition_style(
            tag.attrs['style'], ('text-align', 'list-style-type'))
        if not TagStyleConverter._has_livecarta_property(span_style):
            return

        tag.name = 'span'
        li_tag = BeautifulSoup(features='lxml').new_tag('li')
        li_tag.attrs['style'] = li_style
        tag.attrs['style'] = span_style
        tag.wrap(li_tag)

    @staticmethod
    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
        """Keep list-style-type on the list tag, move the rest to an inner <span>.

        The wrapper gets the name of the original tag, so an <ol> stays an <ol>.
        Nothing happens unless the style holds another LiveCarta property.
        """
        if tag.name not in ['ul', 'ol'] or not tag.attrs.get('style'):
            return

        list_style, span_style = TagStyleConverter._partition_style(
            tag.attrs['style'], ('list-style-type',))
        if not TagStyleConverter._has_livecarta_property(span_style):
            return

        list_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
        tag.name = 'span'
        if list_style:
            list_tag.attrs['style'] = list_style
        tag.attrs['style'] = span_style
        tag.wrap(list_tag)

    @staticmethod
    def wrap_span_in_h_to_save_style_attrs(tag):
        """Turn a styled heading into a <span> inside a new heading of the same name.

        The span keeps the style, except for list-style-type declarations.
        """
        h_regexp = re.compile('(^h[1-9]$)')
        if not (re.search(h_regexp, tag.name) and tag.attrs.get('style')):
            return

        h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
        tag.name = 'span'
        tag.wrap(h_tag)
        _, span_style = TagStyleConverter._partition_style(
            tag.attrs['style'], ('list-style-type',))
        tag.attrs['style'] = span_style

    def convert_initial_tag(self):
        """Run the whole conversion on the original tag and return it."""
        self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_p_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_li_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_with_inline_style)
        self.wrap_span_in_h_to_save_style_attrs(self.tag_with_inline_style)
        return self.tag_with_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
    """Apply the styles of a css text to the tags of *html_soup*, in place.

    :param html_soup: parsed html; its div/p/span/li/ul/ol/td/th/h1-h9 tags are converted
    :param css_text: css to inline (the epub @namespace line is removed first)
    :return: the same *html_soup* object

    Every candidate tag is marked with a temporary ``livecarta_id`` attribute,
    the css is inlined into a copy of the document with premailer, and each
    original tag is then converted using the style resolved for its copy.
    """
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    tags_with_possible_style_attr = html_soup.find_all(
        could_have_style_in_livecarta_regexp)
    for i, tag in enumerate(tags_with_possible_style_attr):
        tag.attrs['livecarta_id'] = i

    # here we add css styles to inline style
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')

    # Index the styled copies once instead of searching the whole tree for
    # every id. Parsed attribute values are strings, hence the str(i) below.
    ultimate_tags = {}
    for styled_tag in inline_soup.find_all(attrs={'livecarta_id': True}):
        ultimate_tags.setdefault(styled_tag.attrs['livecarta_id'], styled_tag)

    # go through tags with possible style attrs
    for i, tag_with_initial_style in enumerate(tags_with_possible_style_attr):
        del tag_with_initial_style.attrs['livecarta_id']
        tag_with_ultimate_style = ultimate_tags.get(str(i))
        if tag_with_ultimate_style is None:
            # the tag did not survive inlining / re-parsing; nothing to apply
            continue
        if tag_with_ultimate_style.attrs.get('style'):
            style_converter = TagStyleConverter(
                tag_with_initial_style, tag_with_ultimate_style)
            style_converter.convert_initial_tag()
    return html_soup
if __name__ == '__main__':
    # Manual check: clean the stylesheet of one book and convert one chapter.
    epub_path = '../../epub/9781627222174.epub'
    book = epub.read_epub(epub_path)

    raw_css = book.get_item_with_href('css/epub.css').get_content().decode()
    cleaned_css = build_css_content(raw_css)

    chapter = book.get_item_with_href('pr01s05.xhtml')
    chapter_soup = BeautifulSoup(
        chapter.get_body_content().decode(), features='lxml')

    print(convert_html_soup_with_css_style(chapter_soup, cleaned_css))