forked from LiveCarta/BookConverter
Make preprocessing of inline css
This commit is contained in:
238
src/epub_converter/css_preprocessing.py
Normal file
238
src/epub_converter/css_preprocessing.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
import re
|
||||||
|
import cssutils
|
||||||
|
|
||||||
|
from ebooklib import epub
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from itertools import takewhile
|
||||||
|
|
||||||
|
from src.util.color_reader import str2hex
|
||||||
|
from src.livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
|
|
||||||
|
def get_text_color(x):
    """Convert a CSS colour with ``str2hex`` and blank out black.

    Returns the converted colour, or '' when the converted value is one of
    '#000000', '#000' or 'black' (presumably because black is the default
    text colour and needs no explicit style -- confirm with callers).
    """
    hex_color = str2hex(x)
    if hex_color in ['#000000', '#000', 'black']:
        return ''
    return hex_color
|
||||||
|
|
||||||
|
|
||||||
|
def get_bg_color(x):
    """Convert a CSS colour with ``str2hex`` and blank out white.

    Returns the converted colour, or '' when the converted value is one of
    '#ffffff', '#fff' or 'white' (presumably because white is the default
    background and needs no explicit style -- confirm with callers).
    """
    hex_color = str2hex(x)
    if hex_color in ['#ffffff', '#fff', 'white']:
        return ''
    return hex_color
|
||||||
|
|
||||||
|
|
||||||
|
def convert_tag_style_values(size_value: str) -> str:
    """
    Function
        - converts values of tags from em/%/pt to px
        - find closest font-size px
    Parameters
    ----------
    size_value: str
        a single CSS length token such as '1.5em', '120%', '12pt' or '10px'

    Returns
    -------
    size_value: str
        - for em/% values: the entry of LiveCartaConfig.sizes_px that belongs
          to the largest LiveCartaConfig.sizes_pr threshold strictly below
          the value (the first entry when no threshold is below it)
        - for pt values: the same number with the unit replaced by 'px'
        - anything else (including malformed numbers): returned unchanged

    """
    def find_closest_size(style_value):
        thresholds = LiveCartaConfig.sizes_pr
        smaller = list(takewhile(lambda x: style_value > x, thresholds))
        if not smaller:
            # Value is not above any threshold: clamp to the first bucket
            # instead of failing on smaller[-1].
            return LiveCartaConfig.sizes_px[0]
        return LiveCartaConfig.sizes_px[thresholds.index(smaller[-1])]

    # One optional sign and at most one decimal point, so that everything
    # which matches is guaranteed to be parseable by float().
    font_size_regexp = re.compile(r'^-?(\d*\.?\d+)(%|em|pt)$')
    has_style_attrs = font_size_regexp.search(size_value)
    if not has_style_attrs:
        return size_value

    unit = has_style_attrs.group(2)
    if unit == 'pt':
        return size_value.replace('pt', 'px')

    # group(0) excludes a possible trailing newline tolerated by '$'.
    number = float(has_style_attrs.group(0)[:-len(unit)])
    if unit == '%':
        number /= 100.0
    return find_closest_size(number)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_indents_tag_values(size_value: str) -> str:
    """
    Function converts values of ['text-indent', 'margin-left', 'margin']
    Parameters
    ----------
    size_value: str
        one or more whitespace-separated CSS length tokens

    Returns
    -------
    size_value: str
        the converted middle token when exactly three tokens are given,
        otherwise the converted last token

    """
    # split() without an argument collapses runs of spaces/tabs; split(' ')
    # produced empty tokens that were miscounted as values.
    tokens = size_value.split()
    if not tokens:
        return convert_tag_style_values('')
    chosen = tokens[-2] if len(tokens) == 3 else tokens[-1]
    return convert_tag_style_values(chosen)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||||
|
Style properties that can be used to fit livecarta css style convention.
|
||||||
|
If property has empty list, it means that any value can be converted.
|
||||||
|
If property has not empty list, it means that only certain property-value combinations can be transformed.
|
||||||
|
"""
|
||||||
|
# Whitelist of CSS properties. A property that is not a key here is blanked
# by update_inline_styles_to_livecarta_convention and
# update_css_styles_to_livecarta_convention. An empty list accepts any value;
# a non-empty list accepts only the listed values.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    # every alignment except the configured default one
    'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],
    'font': [],
    # every known font except the configured default one
    'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys()
                    if x != LiveCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border-top': [],
    'border-bottom': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
    'margin-top': [],
    'margin': [],
}
|
||||||
|
|
||||||
|
"""
|
||||||
|
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||||
|
|
||||||
|
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
|
||||||
|
to suit livecarta style convention.
|
||||||
|
"""
|
||||||
|
def _keep_value(value):
    """Return the value unchanged."""
    return value


def _drop_zero(value):
    """Return '' for the literal '0', otherwise the value unchanged."""
    return '' if value == '0' else value


def _map_font_family(name):
    """Look the font up as given, then capitalized, in the config table."""
    table = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE
    return table.get(name) or table.get(name.capitalize())


def _map_list_style_type(value):
    """Keep supported list markers, fall back to 'disc' for the rest."""
    return value if value in LiveCartaConfig.list_types else 'disc'


# property -> function that receives the cleaned value and returns the
# value to write back.
LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_indents_tag_values,
    'font-variant': _keep_value,
    'text-align': _keep_value,
    'font': lambda x: '',
    'font-family': _map_font_family,
    'font-size': convert_tag_style_values,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _drop_zero,
    'border-top-width': _drop_zero,
    'border-right-width': _drop_zero,
    'border-left-width': _drop_zero,
    'border-bottom-width': _drop_zero,
    'border-top': _drop_zero,
    'border-bottom': _drop_zero,
    'list-style-type': _map_list_style_type,
    'list-style-image': lambda x: 'disc',
    'margin-left': convert_indents_tag_values,
    'margin-top': convert_tag_style_values,
    'margin': convert_indents_tag_values,
}
|
||||||
|
|
||||||
|
|
||||||
|
def update_inline_styles_to_livecarta_convention(split_style: list):
    """Rewrite 'name:value' declarations in place to the livecarta convention.

    Parameters
    ----------
    split_style: list
        declarations of one inline style, each formatted as 'name:value'

    Returns
    -------
    split_style: list
        the same list object; every entry is either the converted
        'name:value' string or '' when the declaration has to be dropped

    """
    for i, style in enumerate(split_style):
        # partition() splits on the first ':' only, so values that contain
        # a colon themselves (e.g. url(http://...)) do not break unpacking.
        style_name, _, style_value = style.partition(":")
        if style_name not in LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove it and go on
            # with the remaining declarations
            split_style[i] = ''
            continue

        value_tokens = style_value.replace('\"', '').split()
        cleaned_value = value_tokens[-1] if value_tokens else ''
        allowed_values = LIVECARTA_STYLE_ATTRS[style_name]
        if allowed_values and cleaned_value not in allowed_values:
            # there are constraints + value not in LIVECARTA_STYLE_ATTRS
            split_style[i] = ''
            continue

        if style_name in LIVECARTA_STYLE_ATTRS_MAPPING:
            # function that converts our data; a mapper may yield None
            # (dict lookups), which is treated as an empty value
            func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
            style_value = func(cleaned_value) or ''
        split_style[i] = style_name + ":" + style_value
    return split_style
|
||||||
|
|
||||||
|
|
||||||
|
def build_inline_style_content(style: str) -> str:
    """Build inline style with livecarta convention

    Parameters
    ----------
    style: str
        content of a style="..." attribute

    Returns
    -------
    style: str
        converted declarations formatted as 'name:value' and joined by '; ';
        declarations that were dropped by the conversion are left out

    """
    declarations = []
    for chunk in style.split(';'):
        name, separator, value = chunk.partition(':')
        if not separator:
            # empty chunk (trailing ';') or text without 'name:value' shape
            continue
        # normalise the whitespace around the property separator
        declarations.append(name.strip() + ':' + value.strip())

    converted = update_inline_styles_to_livecarta_convention(declarations)
    # dropped declarations come back as '', skip them so the result does
    # not contain empty '; ; ' segments
    return "; ".join(item for item in converted if item)
|
||||||
|
|
||||||
|
|
||||||
|
def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule,
                                              style_type: cssutils.css.property.Property):
    """Convert or blank one property of ``css_rule`` in place.

    The property is set to '' when its name is not a key of
    LIVECARTA_STYLE_ATTRS, or when that key lists allowed values and the
    cleaned value is not among them. Otherwise the cleaned value is passed
    through the function registered in LIVECARTA_STYLE_ATTRS_MAPPING, if any.
    """
    prop_name = style_type.name
    if prop_name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
        return

    # only the part after the last ', ' is considered, quotes removed
    cleaned_value = style_type.value.replace('\"', '').split(', ')[-1]
    allowed_values = LIVECARTA_STYLE_ATTRS[prop_name]
    if allowed_values and cleaned_value not in allowed_values:
        # there are constraints + value not in LIVECARTA_STYLE_ATTRS
        css_rule.style[prop_name] = ''
    elif prop_name in LIVECARTA_STYLE_ATTRS_MAPPING:
        converter = LIVECARTA_STYLE_ATTRS_MAPPING[prop_name]
        css_rule.style[prop_name] = converter(cleaned_value)
|
||||||
|
|
||||||
|
|
||||||
|
def build_css_file_content(css_content: str) -> str:
    """Build css content with livecarta convention

    Parameters
    ----------
    css_content: str
        text of a css file

    Returns
    -------
    css_text: str
        serialized stylesheet after every property of every style rule went
        through update_css_styles_to_livecarta_convention

    """
    sheet = cssutils.parseString(css_content, validate=False)

    for css_rule in sheet:
        if css_rule.type == css_rule.STYLE_RULE:
            # iterate over a snapshot: the callee rewrites/removes
            # properties of css_rule.style while we walk them
            for style_type in list(css_rule.style):
                update_css_styles_to_livecarta_convention(
                    css_rule, style_type)

    # cssText is the public accessor for the serialized bytes; they are
    # encoded with the sheet encoding (@charset or utf-8), so decode with it
    css_text: str = sheet.cssText.decode(sheet.encoding)
    return css_text
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke run: clean the css of a sample book and parse one of its
    # chapters; the results are only bound to names, nothing is written out.
    epub_path = '../../epub/9781627222174.epub'
    ebooklib_book = epub.read_epub(epub_path)

    css_item = ebooklib_book.get_item_with_href('css/epub.css')
    css_cleaned = build_css_file_content(css_item.get_content().decode())

    chapter_item = ebooklib_book.get_item_with_href('pr01s05.xhtml')
    chapter_html = chapter_item.get_body_content().decode()
    html_soup = BeautifulSoup(chapter_html, features='lxml')
|
||||||
@@ -1,557 +0,0 @@
|
|||||||
import re
|
|
||||||
import cssutils
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from ebooklib import epub
|
|
||||||
from logging import CRITICAL
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from premailer import transform
|
|
||||||
from itertools import takewhile
|
|
||||||
|
|
||||||
from src.util.color_reader import str2hex
|
|
||||||
from src.livecarta_config import LiveCartaConfig
|
|
||||||
|
|
||||||
# only CRITICAL messages of the cssutils logger are emitted
cssutils.log.setLevel(CRITICAL)


# Thresholds for relative sizes (em values, or percentages divided by 100).
# convert_tag_style_values picks the last threshold that is strictly below
# the value and uses its index to read sizes_px.
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
            1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
            1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
            2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]

# px value for the sizes_pr threshold at the same index
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px',
            '17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px',
            '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
            '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px',
            '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']

# list-style-type values that are kept as is; the 'list-style-type' mapping
# replaces every other value with 'disc'
list_types = ['circle', 'disc', 'armenian', 'decimal',
              'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
              'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
|
||||||
|
|
||||||
|
|
||||||
def convert_tag_style_values(value: str) -> str:
    """
    Function
        - converts values of tags from em/%/pt to px
        - find closest font-size px
    Parameters
    ----------
    value: str
        a single CSS length token such as '1.5em', '120%', '12pt' or '10px'

    Returns
    -------
    value: str
        - for em/% values: the entry of sizes_px that belongs to the largest
          sizes_pr threshold strictly below the value (the first entry when
          no threshold is below it)
        - for pt values: the same number with the unit replaced by 'px'
        - anything else (including malformed numbers): returned unchanged

    """
    def find_closest_size(size_value):
        smaller = list(takewhile(lambda x: size_value > x, sizes_pr))
        if not smaller:
            # Value is not above any threshold: clamp to the first bucket
            # instead of failing on smaller[-1].
            return sizes_px[0]
        return sizes_px[sizes_pr.index(smaller[-1])]

    # One optional sign and at most one decimal point, so that everything
    # which matches is guaranteed to be parseable by float().
    font_size_regexp = re.compile(r'^-?(\d*\.?\d+)(%|em|pt)$')
    has_style_attrs = font_size_regexp.search(value)
    if not has_style_attrs:
        return value

    unit = has_style_attrs.group(2)
    if unit == 'pt':
        return value.replace('pt', 'px')

    # group(0) excludes a possible trailing newline tolerated by '$'.
    number = float(has_style_attrs.group(0)[:-len(unit)])
    if unit == '%':
        number /= 100.0
    return find_closest_size(number)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
|
||||||
Style properties that can be used to fit livecarta css style convention.
|
|
||||||
If property has empty list, it means that any value can be converted.
|
|
||||||
If property has not empty list, it means that only certain property-value combinations can be transformed.
|
|
||||||
"""
|
|
||||||
# Whitelist of CSS properties. A property that is not a key here is blanked
# by update_css_style_types_to_livecarta_convention. An empty list accepts
# any value; a non-empty list accepts only the listed values.
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    # every alignment except the configured default one
    'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
    'align': [],
    'font': [],
    # every known font except the configured default one
    'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
                    if x != LiveCartaConfig.DEFAULT_FONT_NAME],
    'font-size': [],
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border-top': [],
    'border-bottom': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
    'margin-top': [],
    'margin': [],
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_bg_color(x):
    """Convert a CSS colour with ``str2hex`` and blank out white.

    Returns the converted colour, or '' when the converted value is one of
    '#ffffff', '#fff' or 'white' (presumably because white is the default
    background and needs no explicit style -- confirm with callers).
    """
    hex_color = str2hex(x)
    if hex_color in ['#ffffff', '#fff', 'white']:
        return ''
    return hex_color
|
|
||||||
|
|
||||||
|
|
||||||
def get_text_color(x):
    """Convert a CSS colour with ``str2hex`` and blank out black.

    Returns the converted colour, or '' when the converted value is one of
    '#000000', '#000' or 'black' (presumably because black is the default
    text colour and needs no explicit style -- confirm with callers).
    """
    hex_color = str2hex(x)
    if hex_color in ['#000000', '#000', 'black']:
        return ''
    return hex_color
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
|
||||||
|
|
||||||
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
|
|
||||||
to suit livecarta style convention.
|
|
||||||
"""
|
|
||||||
def _keep_value(value):
    """Return the value unchanged."""
    return value


def _drop_zero(value):
    """Return '' for the literal '0', otherwise the value unchanged."""
    return '' if value == '0' else value


def _map_font_family(name):
    """Look the font up as given, then capitalized, in the config table."""
    table = LiveCartaConfig.font_correspondence_table
    return table.get(name) or table.get(name.capitalize())


def _map_list_style_type(value):
    """Keep supported list markers, fall back to 'disc' for the rest."""
    return value if value in list_types else 'disc'


# property -> function that receives the cleaned value and returns the
# value to write back.
LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_tag_style_values,
    'font-variant': _keep_value,
    'text-align': _keep_value,
    'font': lambda x: '',
    'font-family': _map_font_family,
    'font-size': convert_tag_style_values,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _drop_zero,
    'border-top-width': _drop_zero,
    'border-right-width': _drop_zero,
    'border-left-width': _drop_zero,
    'border-bottom-width': _drop_zero,
    'border-top': _drop_zero,
    'border-bottom': _drop_zero,
    'list-style-type': _map_list_style_type,
    'list-style-image': lambda x: 'disc',
    'margin-left': convert_tag_style_values,
    'margin-top': convert_tag_style_values,
    'margin': convert_tag_style_values,
}
|
|
||||||
|
|
||||||
"""
|
|
||||||
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
|
||||||
|
|
||||||
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
|
|
||||||
"""
|
|
||||||
# (property, value) -> name of the tag that replaces the declaration.
# Insertion order is kept: check_style_to_be_tag iterates this dict and the
# first hit decides which tag ends up innermost.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    **{('font-weight', weight): 'strong'
       for weight in ('bold', '600', '700', '800', '900')},
    ('font-style', 'italic'): 'i',
    **{(prop, line): tag
       for prop in ('text-decoration', 'text-decoration-line')
       for line, tag in (('underline', 'u'), ('line-through', 's'))},
    ('vertical-align', 'super'): 'sup',
}
|
|
||||||
|
|
||||||
|
|
||||||
def check_style_to_be_tag(style: str) -> List[tuple]:
    """
    Function searches style properties that can be converted to tags.
    Parameters
    ----------
    style: str
        <tag style="...">

    Returns
    -------
    to_remove: list
        (property, value) keys of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG whose
        'property:value' text occurs in style, in dictionary order

    """
    return [prop_value for prop_value in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
            if f'{prop_value[0]}:{prop_value[1]}' in style]
|
|
||||||
|
|
||||||
|
|
||||||
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
    """Convert or blank one property of ``css_rule`` in place.

    The property is set to '' when its name is not a key of
    LIVECARTA_STYLE_ATTRS, or when that key lists allowed values and the
    cleaned value is not among them. Otherwise the cleaned value is passed
    through the function registered in LIVECARTA_STYLE_ATTRS_MAPPING, if any.
    """
    prop_name = style_type.name
    if prop_name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
        return

    cleaned_value = style_type.value.replace('\"', '')  # value of style
    allowed_values = LIVECARTA_STYLE_ATTRS[prop_name]
    if allowed_values and cleaned_value not in allowed_values:
        # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[prop_name] = ''
    elif prop_name in LIVECARTA_STYLE_ATTRS_MAPPING:
        converter = LIVECARTA_STYLE_ATTRS_MAPPING[prop_name]
        css_rule.style[prop_name] = converter(cleaned_value)
|
|
||||||
|
|
||||||
|
|
||||||
def build_css_content(css_content):
    """Build css content with livecarta convention

    Parameters
    ----------
    css_content: str
        text of a css file

    Returns
    -------
    css_text: str
        serialized stylesheet after every property of every style rule went
        through update_css_style_types_to_livecarta_convention

    """
    sheet = cssutils.parseString(css_content, validate=False)

    for css_rule in sheet:
        if css_rule.type == css_rule.STYLE_RULE:
            # iterate over a snapshot: the callee rewrites/removes
            # properties of css_rule.style while we walk them
            for style_type in list(css_rule.style):
                update_css_style_types_to_livecarta_convention(
                    css_rule, style_type)

    # cssText is the public accessor for the serialized bytes; they are
    # encoded with the sheet encoding (@charset or utf-8), so decode with it
    css_text = sheet.cssText.decode(sheet.encoding)
    return css_text
|
|
||||||
|
|
||||||
|
|
||||||
class TagStyleConverter:
|
|
||||||
def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
|
|
||||||
# tag with inline style to be updated with style attribute
|
|
||||||
self.tag_with_inline_style = tag_with_inline_style
|
|
||||||
self.tag_initial_name = tag_with_inline_style.name
|
|
||||||
# tag with inline style + style parsed from css file
|
|
||||||
self.tag_with_ultimate_style = tag_with_ultimate_style
|
|
||||||
self.style = self.preprocess_style()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def remove_white_if_no_bgcolor(style_, tag):
|
|
||||||
"""Function remove text white color if there is no bg color"""
|
|
||||||
if 'background' in style_:
|
|
||||||
return style_
|
|
||||||
|
|
||||||
# if text color is white, check that we have bg-color
|
|
||||||
if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_):
|
|
||||||
# if bg color is inherited, just return style as is
|
|
||||||
for parent_tag in tag.parents:
|
|
||||||
# white bg color not need to be checked as we do not write 'white bg color'
|
|
||||||
tag_with_bg = ['span', 'td', 'tr', 'p']
|
|
||||||
tag_will_be_saved = parent_tag.name in tag_with_bg
|
|
||||||
has_bg = parent_tag.attrs.get('style') and (
|
|
||||||
'background' in parent_tag.attrs.get('style'))
|
|
||||||
if has_bg and tag_will_be_saved:
|
|
||||||
return style_
|
|
||||||
|
|
||||||
children = tag.find_all()
|
|
||||||
for child in children:
|
|
||||||
if child.attrs.get('style') and ('background' in child.attrs.get('style')):
|
|
||||||
tmp_style = child.attrs['style'] + '; color:#fff; '
|
|
||||||
child.attrs['style'] = tmp_style
|
|
||||||
|
|
||||||
# for child with bg color we added white text color, so this tag don't need white color
|
|
||||||
style_ = style_.replace('color:#fff;', '')
|
|
||||||
style_ = style_.replace('color:#ffffff;', '')
|
|
||||||
style_ = style_.replace('color:white;', '')
|
|
||||||
return style_
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def process_indents_to_px(split_style: dict) -> str:
|
|
||||||
"""Function cleans style string using convert_tag_values() and returns new clean_style"""
|
|
||||||
split_style = [k + ":" + v for k, v in split_style.items()]
|
|
||||||
clean_style = ''
|
|
||||||
for item in split_style:
|
|
||||||
item = item.split(':')
|
|
||||||
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
|
||||||
if len(item[1].split(' ')) == 3:
|
|
||||||
item[1] = convert_tag_style_values(item[1].split(
|
|
||||||
' ')[-2]) # split returns middle value
|
|
||||||
else:
|
|
||||||
item[1] = convert_tag_style_values(item[1].split(
|
|
||||||
' ')[-1]) # split returns last value
|
|
||||||
clean_style += item[0] + ': ' + item[1] + '; '
|
|
||||||
|
|
||||||
margin_left_regexp = re.compile(
|
|
||||||
r'((margin-left|margin): *(-*\w+);*)')
|
|
||||||
text_indent_regexp = re.compile(
|
|
||||||
r'(text-indent: *(-*\w+);*)')
|
|
||||||
|
|
||||||
has_margin = re.search(margin_left_regexp, clean_style)
|
|
||||||
has_text_indent = re.search(text_indent_regexp, clean_style)
|
|
||||||
# formula_of_indent: indent = abs(margin - text_indent)
|
|
||||||
if has_margin:
|
|
||||||
num_m = abs(int("0" + "".join(
|
|
||||||
filter(str.isdigit, str(has_margin.group(3))))))
|
|
||||||
|
|
||||||
if has_text_indent:
|
|
||||||
num_ti = abs(int("0" + "".join(
|
|
||||||
filter(str.isdigit, str(has_text_indent.group(2))))))
|
|
||||||
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
|
||||||
str(abs(num_m - num_ti)) + 'px; ')
|
|
||||||
clean_style = clean_style.replace(has_margin.group(1), '')
|
|
||||||
return clean_style
|
|
||||||
|
|
||||||
clean_style = clean_style.replace(has_margin.group(1), 'text-indent: ' +
|
|
||||||
str(abs(num_m)) + 'px; ')
|
|
||||||
return clean_style
|
|
||||||
|
|
||||||
elif has_text_indent:
|
|
||||||
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
|
||||||
str(abs(int("0" + "".join(
|
|
||||||
filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ')
|
|
||||||
return clean_style
|
|
||||||
return clean_style
|
|
||||||
|
|
||||||
def preprocess_style(self):
|
|
||||||
def remove_extra_spaces(style: str) -> dict:
|
|
||||||
"""Function to remove extra spaces in style to process clean_style"""
|
|
||||||
# replace all spaces between '; & letter' to ';'
|
|
||||||
style = re.sub(r"; *", ";", style)
|
|
||||||
split_style: List = style.split(';')
|
|
||||||
|
|
||||||
# when we split style by ; and we have at the end ; that's why we have '' in list
|
|
||||||
while '' in split_style:
|
|
||||||
split_style.remove('')
|
|
||||||
|
|
||||||
# replace all spaces between ': & letter' to ':'
|
|
||||||
split_style = [el.replace(
|
|
||||||
re.search(r'(:\s*)', el).group(1), ':') for el in split_style]
|
|
||||||
dict = {}
|
|
||||||
for list_item in split_style:
|
|
||||||
key, val = list_item.split(":")
|
|
||||||
dict[key] = val
|
|
||||||
return dict
|
|
||||||
|
|
||||||
ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';'
|
|
||||||
ultimate_style = self.remove_white_if_no_bgcolor(
|
|
||||||
ultimate_style, self.tag_with_ultimate_style)
|
|
||||||
ultimate_style = ultimate_style.replace(
|
|
||||||
'background:', 'background-color:')
|
|
||||||
ultimate_style = ultimate_style.replace(
|
|
||||||
'list-style-image', 'list-style-type')
|
|
||||||
|
|
||||||
split_ultimate_style: dict = remove_extra_spaces(ultimate_style)
|
|
||||||
ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
|
|
||||||
|
|
||||||
if self.tag_with_inline_style.attrs.get('style'):
|
|
||||||
inline_style = self.tag_with_inline_style.attrs['style']
|
|
||||||
|
|
||||||
split_inline_style: dict = remove_extra_spaces(inline_style)
|
|
||||||
|
|
||||||
# repetition check - if the tag had already had inline style
|
|
||||||
# that isn't in the css styles, add this to style parsed from css
|
|
||||||
repeat_styles = list(set(split_ultimate_style.keys())
|
|
||||||
& set(split_inline_style.keys()))
|
|
||||||
|
|
||||||
# remove styles(css) that are in css and inline
|
|
||||||
[split_inline_style.pop(item) for item in repeat_styles]
|
|
||||||
|
|
||||||
if split_inline_style:
|
|
||||||
# if split_inline_style is not empty - start convert and add to ultimate style
|
|
||||||
print('we enter repetition check', '\n')
|
|
||||||
inline_style: str = self.process_indents_to_px(
|
|
||||||
split_inline_style)
|
|
||||||
ultimate_style += inline_style
|
|
||||||
|
|
||||||
return ultimate_style
|
|
||||||
|
|
||||||
def change_attrs_with_corresponding_tags(self):
|
|
||||||
# adds <b>, <u>, <sup>, etc
|
|
||||||
to_remove = check_style_to_be_tag(self.style)
|
|
||||||
new_tags = []
|
|
||||||
for i, (attr, value) in enumerate(to_remove):
|
|
||||||
s = f'{attr}:{value};'
|
|
||||||
self.style = self.style.replace(s, '')
|
|
||||||
self.style = self.style.strip()
|
|
||||||
if not i:
|
|
||||||
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
|
|
||||||
attr, value)]
|
|
||||||
new_tags.append(self.tag_with_inline_style)
|
|
||||||
else:
|
|
||||||
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag(name)
|
|
||||||
new_tags[-1].wrap(new_tag)
|
|
||||||
new_tags.append(new_tag)
|
|
||||||
|
|
||||||
top_tag = self.tag_with_inline_style
|
|
||||||
|
|
||||||
if new_tags:
|
|
||||||
tmp_attrs = top_tag.attrs.copy()
|
|
||||||
top_tag.attrs = {}
|
|
||||||
top_tag2 = BeautifulSoup(features='lxml').new_tag(
|
|
||||||
self.tag_initial_name)
|
|
||||||
top_tag2.attrs = tmp_attrs
|
|
||||||
if self.style:
|
|
||||||
top_tag2.attrs['style'] = self.style
|
|
||||||
new_tags[-1].wrap(top_tag2)
|
|
||||||
else:
|
|
||||||
top_tag.attrs['style'] = self.style
|
|
||||||
|
|
||||||
return top_tag
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def wrap_span_in_p_to_save_style_attrs(tag):
|
|
||||||
"""Function designed to save style attrs that cannot be in p -> span"""
|
|
||||||
if tag.name == 'p' and tag.attrs.get('style'):
|
|
||||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
|
||||||
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
|
|
||||||
p_style = ''
|
|
||||||
initial_style = tag.attrs['style']
|
|
||||||
split_style = initial_style.replace('; ', ';').split(';')
|
|
||||||
possible_p_attrs_regexp = re.compile(
|
|
||||||
r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
|
|
||||||
for item in split_style:
|
|
||||||
has_p_style_attrs = re.search(possible_p_attrs_regexp, item)
|
|
||||||
if has_p_style_attrs:
|
|
||||||
p_style += item + ';'
|
|
||||||
initial_style = initial_style.replace(item + ';', '')
|
|
||||||
# here check that this style i exactly the same.
|
|
||||||
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
|
|
||||||
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
|
|
||||||
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
|
|
||||||
if any(styles_to_be_saved_in_span):
|
|
||||||
# if find styles that cannot be in <p> -> wrap them in span
|
|
||||||
tag.name = 'span'
|
|
||||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
|
||||||
p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
|
||||||
has_p_style_attr = re.search(p_attrs_regexp, initial_style)
|
|
||||||
span_style = initial_style if not has_p_style_attr else initial_style.replace(
|
|
||||||
has_p_style_attr.group(1), '')
|
|
||||||
p_tag.attrs['style'] = p_style
|
|
||||||
tag.attrs['style'] = span_style
|
|
||||||
tag.wrap(p_tag)
|
|
||||||
else:
|
|
||||||
tag.attrs['style'] = p_style
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def wrap_span_in_li_to_save_style_attrs(tag):
|
|
||||||
"""Function designed to save style attrs that cannot be in li -> span"""
|
|
||||||
if tag.name == 'li' and tag.attrs.get('style'):
|
|
||||||
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
|
||||||
attr not in ['text-align', 'list-style-type']]
|
|
||||||
|
|
||||||
styles_to_be_saved_in_span = [attr in tag.attrs.get(
|
|
||||||
'style') for attr in styles_cant_be_in_li]
|
|
||||||
if any(styles_to_be_saved_in_span):
|
|
||||||
tag.name = 'span'
|
|
||||||
li_tag = BeautifulSoup(features='lxml').new_tag('li')
|
|
||||||
span_style = tag.attrs['style']
|
|
||||||
li_style = ''
|
|
||||||
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
|
|
||||||
re.compile(r'(list-style-type:(\w+);)')]:
|
|
||||||
has_li_style_attrs = re.search(
|
|
||||||
possible_li_attrs_regexp, span_style)
|
|
||||||
if has_li_style_attrs and has_li_style_attrs.group(1):
|
|
||||||
li_style += has_li_style_attrs.group(1)
|
|
||||||
span_style = span_style.replace(
|
|
||||||
has_li_style_attrs.group(1), '')
|
|
||||||
li_tag.attrs['style'] = li_style
|
|
||||||
tag.attrs['style'] = span_style
|
|
||||||
tag.wrap(li_tag)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
|
|
||||||
"""Function designed to save style attrs that cannot be in ul/ol -> span"""
|
|
||||||
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
|
|
||||||
styles_cant_be_in_ul_ol = [
|
|
||||||
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
|
||||||
|
|
||||||
styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
|
|
||||||
for attr in styles_cant_be_in_ul_ol]
|
|
||||||
if any(styles_to_be_saved_in_span):
|
|
||||||
tag.name = 'span'
|
|
||||||
oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
|
|
||||||
span_style = tag.attrs['style']
|
|
||||||
|
|
||||||
possible_uol_attrs_regexp = re.compile(
|
|
||||||
r'(list-style-type:(\w+);)')
|
|
||||||
has_uol_style_attrs = re.search(
|
|
||||||
possible_uol_attrs_regexp, span_style)
|
|
||||||
if has_uol_style_attrs and has_uol_style_attrs.group(1):
|
|
||||||
oul_style = has_uol_style_attrs.group(1)
|
|
||||||
span_style = span_style.replace(oul_style, '')
|
|
||||||
oul_tag.attrs['style'] = oul_style
|
|
||||||
tag.attrs['style'] = span_style
|
|
||||||
tag.wrap(oul_tag)
|
|
||||||
|
|
||||||
@staticmethod
def wrap_span_in_h_to_save_style_attrs(tag):
    """
    Function designed to save style attrs that cannot be in h -> span

    A styled heading is renamed to span and wrapped in a new heading tag of
    the same level; 'list-style-type' is cut from the style left on the span.
    """
    heading_name_regexp = re.compile('(^h[1-9]$)')
    is_heading = re.search(heading_name_regexp, tag.name)
    if not (is_heading and tag.attrs.get('style')):
        return

    # the wrapper is created while the tag still has its heading name
    heading_wrapper = BeautifulSoup(features='lxml').new_tag(tag.name)
    tag.name = 'span'
    tag.wrap(heading_wrapper)

    span_style = tag.attrs['style']
    list_style_regexp = re.compile(r'(list-style-type:(\w+);)')
    list_style_match = re.search(list_style_regexp, span_style)
    if list_style_match:
        span_style = span_style.replace(list_style_match.group(1), '')
    tag.attrs['style'] = span_style
|
|
||||||
|
|
||||||
def convert_initial_tag(self):
    """
    Function converts the tag: style attrs are changed to corresponding tags,
    then each 'wrap span' step (p, li, ul/ol, h) is applied to the result

    Returns
    -------
    the tag returned by change_attrs_with_corresponding_tags

    """
    converted_tag = self.change_attrs_with_corresponding_tags()
    self.tag_with_inline_style = converted_tag
    # order of the steps is kept as it was: p -> li -> ul/ol -> h
    wrap_steps = (
        self.wrap_span_in_p_to_save_style_attrs,
        self.wrap_span_in_li_to_save_style_attrs,
        self.wrap_span_in_ul_ol_to_save_style_attrs,
        self.wrap_span_in_h_to_save_style_attrs,
    )
    for wrap_step in wrap_steps:
        wrap_step(self.tag_with_inline_style)
    return self.tag_with_inline_style
|
|
||||||
|
|
||||||
|
|
||||||
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style

    Every tag that may carry a style gets a temporary 'livecarta_id' attribute,
    so the same tag can be found both in the initial soup and in the soup built
    from the html with inlined css.
    Parameters
    ----------
    html_soup: BeautifulSoup
        soup of the html document, it is modified in place
    css_text: str
        css rules to be inlined

    Returns
    -------
    html_soup: BeautifulSoup
        the same soup object with converted tags

    """
    # remove this specification because it causes problems
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    tags_with_possible_style_attr = html_soup.find_all(
        could_have_style_in_livecarta_regexp)
    # mark tags with their position in the list, ids are 0..len-1
    for i, x in enumerate(tags_with_possible_style_attr):
        x.attrs['livecarta_id'] = i

    # here we add css styles to inline style
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )

    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')

    # go through tags with possible style attrs
    for i in range(len(tags_with_possible_style_attr)):
        tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i})
        tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})
        # temporary id must not get into the converted html
        del tag_with_initial_style.attrs['livecarta_id']
        if tag_with_ultimate_style.attrs.get('style'):
            style_converter = TagStyleConverter(
                tag_with_initial_style, tag_with_ultimate_style)
            style_converter.convert_initial_tag()

    return html_soup
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # manual check: inline the css of one chapter of a local epub and print the result
    file = '../../epub/9781627222174.epub'
    ebooklib_book = epub.read_epub(file)

    css_item = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_item.get_content().decode()
    css_cleaned = build_css_content(css_)

    html_item = ebooklib_book.get_item_with_href('pr01s05.xhtml')
    html_ = html_item.get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

    converted_soup = convert_html_soup_with_css_style(html_soup, css_cleaned)
    print(converted_soup)
|
|
||||||
@@ -17,7 +17,8 @@ from bs4 import BeautifulSoup, Tag
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
|
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
|
||||||
|
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
|
||||||
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
|
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
|
||||||
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
|
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
|
||||||
|
|
||||||
@@ -68,6 +69,8 @@ class EpubConverter:
|
|||||||
BeautifulSoup] = self.build_href2soup_content()
|
BeautifulSoup] = self.build_href2soup_content()
|
||||||
# TODO Presets
|
# TODO Presets
|
||||||
|
|
||||||
|
self.logger.log('Process CSS inline styles.')
|
||||||
|
self.process_inline_styles_in_html_soup()
|
||||||
self.logger.log('CSS files processing.')
|
self.logger.log('CSS files processing.')
|
||||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||||
self.logger.log('CSS styles adding.')
|
self.logger.log('CSS styles adding.')
|
||||||
@@ -106,7 +109,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||||
# using EpubElements
|
# using EpubElements
|
||||||
# for now just for HTML objects, as it is simplest chapter
|
# for now just for HTML objects, as it is the simplest chapter
|
||||||
|
|
||||||
nodes = dict()
|
nodes = dict()
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
@@ -122,6 +125,7 @@ class EpubConverter:
|
|||||||
path_to_css_from_root = normpath(
|
path_to_css_from_root = normpath(
|
||||||
join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
||||||
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
||||||
|
# if in css file we import another css
|
||||||
if "@import" in str(css_obj.content):
|
if "@import" in str(css_obj.content):
|
||||||
path_to_css_from_root = "css/" + \
|
path_to_css_from_root = "css/" + \
|
||||||
re.search('"(.*)"', str(css_obj.content)).group(1)
|
re.search('"(.*)"', str(css_obj.content)).group(1)
|
||||||
@@ -131,12 +135,26 @@ class EpubConverter:
|
|||||||
css_content: str = css_obj.get_content().decode()
|
css_content: str = css_obj.get_content().decode()
|
||||||
return css_content
|
return css_content
|
||||||
|
|
||||||
|
def process_inline_styles_in_html_soup(self):
    """
    This function is designed to convert inline html styles

    For every html soup in self.html_href2html_body_soup the 'style' attribute
    of tags that may carry a style is replaced with the result of
    build_inline_style_content. Soups are modified in place.
    """
    # patterns do not depend on the document, so they are compiled once
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    has_style_attr_regexp = re.compile('.*')

    for html_content in self.html_href2html_body_soup.values():
        tags_with_inline_style = html_content.find_all(could_have_style_in_livecarta_regexp,
                                                       attrs={'style': has_style_attr_regexp})

        for tag_initial_inline_style in tags_with_inline_style:
            inline_style = tag_initial_inline_style.attrs['style']
            tag_initial_inline_style.attrs['style'] = \
                build_inline_style_content(inline_style)
|
||||||
|
|
||||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||||
"""
|
"""
|
||||||
Function is designed to get 2 dictionaries:
|
Function is designed to get 2 dictionaries:
|
||||||
The first is css_href2css_content. It is created to connect href of css to content of css
|
The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them
|
||||||
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them
|
|
||||||
) which are used on this html
|
) which are used on this html
|
||||||
|
The second is css_href2css_content. It is created to connect href of css to content of css
|
||||||
...2... = key2value
|
...2... = key2value
|
||||||
Returns
|
Returns
|
||||||
----------
|
----------
|
||||||
@@ -154,26 +172,27 @@ class EpubConverter:
|
|||||||
soup_html_content = BeautifulSoup(html_content, features='lxml')
|
soup_html_content = BeautifulSoup(html_content, features='lxml')
|
||||||
# check if file links to css file
|
# check if file links to css file
|
||||||
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
|
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
|
||||||
|
# alternate page of original page (e.g. another language)
|
||||||
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
||||||
continue
|
continue
|
||||||
css_href = tag.attrs.get('href')
|
css_href = tag.attrs.get('href')
|
||||||
html_href2css_href[html_href].append(css_href)
|
html_href2css_href[html_href].append(css_href)
|
||||||
if css_href not in css_href2css_content:
|
if css_href not in css_href2css_content:
|
||||||
# css_href not in css_href2css_content, add to this dict
|
# css_href not in css_href2css_content, add to this dict
|
||||||
css_href2css_content[css_href] = build_css_content(
|
css_href2css_content[css_href] = build_css_file_content(
|
||||||
self.get_css_content(css_href, html_href))
|
self.get_css_content(css_href, html_href))
|
||||||
|
|
||||||
for i, tag in enumerate(soup_html_content.find_all('style')):
|
for i, tag in enumerate(soup_html_content.find_all('style')):
|
||||||
css_content = tag.string
|
css_content = tag.string
|
||||||
html_href2css_href[html_href].append(f'href{i}')
|
html_href2css_href[html_href].append(f'href{i}')
|
||||||
css_href2css_content[f'href{i}'] = build_css_content(
|
css_href2css_content[f'href{i}'] = build_css_file_content(
|
||||||
css_content)
|
css_content)
|
||||||
return html_href2css_href, css_href2css_content
|
return html_href2css_href, css_href2css_content
|
||||||
|
|
||||||
def add_css_styles_to_html_soup(self):
|
def add_css_styles_to_html_soup(self):
|
||||||
"""
|
"""
|
||||||
This function is designed to update html_href2html_body_soup
|
This function is designed to update html_href2html_body_soup
|
||||||
And add to html_inline_style css_style_content
|
- add to html_inline_style css_style_content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for html_href in self.html_href2html_body_soup:
|
for html_href in self.html_href2html_body_soup:
|
||||||
@@ -181,9 +200,9 @@ class EpubConverter:
|
|||||||
css = ''
|
css = ''
|
||||||
for css_href in self.html_href2css_href[html_href]:
|
for css_href in self.html_href2css_href[html_href]:
|
||||||
css += self.css_href2css_content[css_href]
|
css += self.css_href2css_content[css_href]
|
||||||
content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||||
content = convert_html_soup_with_css_style(content, css)
|
html_content = convert_html_soup_with_css_style(html_content, css)
|
||||||
self.html_href2html_body_soup[html_href] = content
|
self.html_href2html_body_soup[html_href] = html_content
|
||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||||
"""
|
"""
|
||||||
@@ -191,7 +210,7 @@ class EpubConverter:
|
|||||||
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
|
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
|
||||||
|
|
||||||
key = -1 if root(top chapters),
|
key = -1 if root(top chapters),
|
||||||
value = None if leaf(least chapters)
|
value = None if leaf(the least chapters)
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
element: [Link, tuple, list]
|
element: [Link, tuple, list]
|
||||||
@@ -299,8 +318,7 @@ class EpubConverter:
|
|||||||
# go to line structure
|
# go to line structure
|
||||||
for html_href in self.html_href2html_body_soup:
|
for html_href in self.html_href2html_body_soup:
|
||||||
soup = self.html_href2html_body_soup[html_href]
|
soup = self.html_href2html_body_soup[html_href]
|
||||||
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(
|
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
|
||||||
soup)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_unique_id(href, id_):
|
def create_unique_id(href, id_):
|
||||||
@@ -314,7 +332,7 @@ class EpubConverter:
|
|||||||
new_anchor_span.string = "\xa0"
|
new_anchor_span.string = "\xa0"
|
||||||
return new_anchor_span
|
return new_anchor_span
|
||||||
|
|
||||||
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
|
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
|
||||||
"""
|
"""
|
||||||
Function used to find full path to file that is parsed from tag link
|
Function used to find full path to file that is parsed from tag link
|
||||||
TOC: a/b/c.xhtml
|
TOC: a/b/c.xhtml
|
||||||
@@ -327,7 +345,7 @@ class EpubConverter:
|
|||||||
href_in_link: str
|
href_in_link: str
|
||||||
filename got from tag link, like file1.xhtml
|
filename got from tag link, like file1.xhtml
|
||||||
internal_link_tag: Tag
|
internal_link_tag: Tag
|
||||||
tag object that is parsed now
|
object that is parsed now
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -362,6 +380,10 @@ class EpubConverter:
|
|||||||
1. rebuild ids to be unique in all documents
|
1. rebuild ids to be unique in all documents
|
||||||
2a. process anchor which is a whole xhtml file
|
2a. process anchor which is a whole xhtml file
|
||||||
2b. process anchor which is an element in xhtml file
|
2b. process anchor which is an element in xhtml file
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
process links in html
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# 1. rebuild ids to be unique in all documents
|
# 1. rebuild ids to be unique in all documents
|
||||||
@@ -393,14 +415,14 @@ class EpubConverter:
|
|||||||
if new_id not in self.internal_anchors:
|
if new_id not in self.internal_anchors:
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||||
# insert a new span to the begin of the file
|
# insert a new span to the beginning of the file
|
||||||
anchor_soup.insert(0, new_anchor_span)
|
anchor_soup.insert(0, new_anchor_span)
|
||||||
self.internal_anchors.add(new_id)
|
self.internal_anchors.add(new_id)
|
||||||
|
|
||||||
del internal_link_tag.attrs['href']
|
del internal_link_tag.attrs['href']
|
||||||
|
|
||||||
# 2b. process anchor which is an element in xhtml file
|
# 2b. process anchor which is an element in xhtml file
|
||||||
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
||||||
@@ -459,7 +481,7 @@ class EpubConverter:
|
|||||||
id wraps chapter's content + subchapters' content
|
id wraps chapter's content + subchapters' content
|
||||||
id points to the start of title of a chapter
|
id points to the start of title of a chapter
|
||||||
|
|
||||||
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
|
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
|
||||||
and id of the next chapter/subchapter
|
and id of the next chapter/subchapter
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -504,7 +526,8 @@ class EpubConverter:
|
|||||||
path_to_html=nav_point.href,
|
path_to_html=nav_point.href,
|
||||||
access=self.access,
|
access=self.access,
|
||||||
path2aws_path=self.book_image_src_path2aws_path,
|
path2aws_path=self.book_image_src_path2aws_path,
|
||||||
book_id=self.file_path.stem if hasattr(self.file_path, 'stem') else 'book_id')
|
book_id=self.file_path.stem
|
||||||
|
if hasattr(self.file_path, 'stem') else 'book_id')
|
||||||
|
|
||||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
title_preprocessed = prepare_title(title)
|
title_preprocessed = prepare_title(title)
|
||||||
|
|||||||
340
src/epub_converter/tag_css_style_converter.py
Normal file
340
src/epub_converter/tag_css_style_converter.py
Normal file
@@ -0,0 +1,340 @@
|
|||||||
|
import re
|
||||||
|
import cssutils
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from logging import CRITICAL
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from premailer import transform
|
||||||
|
|
||||||
|
from src.livecarta_config import LiveCartaConfig
|
||||||
|
from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS
|
||||||
|
|
||||||
|
cssutils.log.setLevel(CRITICAL)
|
||||||
|
|
||||||
|
|
||||||
|
class TagStyleConverter:
    """
    Class converts one tag whose inline style already contains styles parsed from css file

    The style is processed once on creation (see process_inline_style),
    convert_initial_tag then changes style attrs to tags and wraps the tag
    where a style cannot stay on the tag itself.
    """

    def __init__(self, tag_inline_style):
        # tag with inline style + style parsed from css file
        self.tag_inline_style = tag_inline_style
        # processed style string, consumed by change_attrs_with_corresponding_tags
        self.style = self.process_inline_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """
        Function remove text white color if there is no bg color
        Parameters
        ----------
        style_: str
            inline style of the tag
        tag: Tag
            tag that owns the style; it is touched only when the text is white
            and the style has no background

        Returns
        -------
        style_: str
            style with 'background:' renamed to 'background-color:', or with
            white text color removed

        """
        if 'background' in style_:
            style_ = style_.replace(
                'background:', 'background-color:')
            return style_

        # if text color is white, check that we have bg-color
        if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_):
            # if bg color is inherited, just return style as is
            for parent_tag in tag.parents:
                # white bg color not need to be checked as we do not write 'white bg color'
                tag_with_bg = ['span', 'td', 'tr', 'p']
                tag_will_be_saved = parent_tag.name in tag_with_bg
                has_bg = parent_tag.attrs.get('style') and (
                    'background' in parent_tag.attrs.get('style'))
                if has_bg and tag_will_be_saved:
                    return style_

            children = tag.find_all()
            for child in children:
                if child.attrs.get('style') and ('background' in child.attrs.get('style')):
                    tmp_style = child.attrs['style'] + '; color:#fff; '
                    child.attrs['style'] = tmp_style

            # for child with bg color we added white text color, so this tag don't need white color
            style_ = style_.replace('color:#fff;', '')
            style_ = style_.replace('color:#ffffff;', '')
            style_ = style_.replace('color:white;', '')
        return style_

    @staticmethod
    def duplicate_styles_check(split_style: list) -> list:
        """
        Function removes duplicate style declarations
        - the last declaration of a property wins, position of the first one is kept
        - value may contain ':' (e.g. url(http://...)), only the first ':' splits
        - items without ':' are not declarations and are skipped
        Parameters
        ----------
        split_style: list
            list of 'name:value' styles split by ';'

        Returns
        -------
        split_style: list
            list of 'name:value' styles, one per property name

        """
        style_name2style_value = {}
        for list_item in split_style:
            key, separator, val = list_item.partition(":")
            if not separator:
                continue
            style_name2style_value[key] = val
        return [k + ":" + v for k, v in style_name2style_value.items()]

    @staticmethod
    def _digits_to_int(css_value: str) -> int:
        """Function returns number built from all digits of css_value (0 if there are no digits)"""
        return int("0" + "".join(filter(str.isdigit, css_value)))

    @staticmethod
    def indents_processing(split_style: list) -> str:
        """
        Function process indents from left using
        formula_of_indent: indent = abs(margin - text_indent)
        Parameters
        ----------
        split_style: list
            list of styles split by ';'

        Returns
        ----------
        processed_style:str
            processed style with counted indent

        """
        processed_style = ";".join(split_style)

        margin_left_regexp = re.compile(
            r'((margin-left|margin): *(-*\w+);*)')
        text_indent_regexp = re.compile(
            r'(text-indent: *(-*\w+);*)')

        has_margin = re.search(margin_left_regexp, processed_style)
        has_text_indent = re.search(text_indent_regexp, processed_style)
        if has_margin:
            num_m = TagStyleConverter._digits_to_int(has_margin.group(3))

            if has_text_indent:
                num_ti = TagStyleConverter._digits_to_int(
                    has_text_indent.group(2))
                # margin is merged into text-indent, then removed from the style
                processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' +
                                                          str(abs(num_m - num_ti)) + 'px; ')
                processed_style = processed_style.replace(
                    has_margin.group(1), '')
                return processed_style

            # only margin: it is written as text-indent
            return processed_style.replace(has_margin.group(1), 'text-indent: ' +
                                           str(num_m) + 'px; ')

        if has_text_indent:
            num_ti = TagStyleConverter._digits_to_int(has_text_indent.group(2))
            return processed_style.replace(has_text_indent.group(1), 'text-indent: ' +
                                           str(num_ti) + 'px; ')
        return processed_style

    def process_inline_style(self):
        """
        Function processes final(css+initial inline) inline style
        Steps
        ----------
        1. Remove white color if tag doesn't have background color in style
        2. Create list of styles from inline style
        3. Duplicate styles check - if the tag had duplicate styles
        4. Processing indents
        Returns
        -------
        inline_style: str
            processed inline style

        """
        # tag is expected to have 'style' attribute (callers select only such tags)
        inline_style = self.tag_inline_style.attrs.get('style') + ';'
        # 1. Remove white color if tag doesn't have background color in style
        inline_style = self.remove_white_if_no_bgcolor(
            inline_style, self.tag_inline_style)
        inline_style = inline_style.replace(
            'list-style-image', 'list-style-type')

        # 2. Create list of styles from inline style
        # replace all spaces between '; & letter' to ';'
        style = re.sub(r"; *", ";", inline_style)
        # when we split style by ';', last element of the list is '' - None (remove it)
        split_inline_style: list = list(filter(None, style.split(';')))

        # 3. Duplicate styles check - if the tag had duplicate styles
        split_inline_style = self.duplicate_styles_check(split_inline_style)

        # 4. Processing indents
        inline_style: str = self.indents_processing(split_inline_style)
        return inline_style

    @staticmethod
    def check_style_to_be_tag(style: str) -> List[tuple]:
        """
        Function searches style properties that can be converted to tag.
        It searches for them and prepare list of properties to be removed from style string
        Parameters
        ----------
        style: str
            <tag style="...">

        Returns
        -------
        to_remove: list
            properties to remove

        """
        return [k for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
                if f'{k[0]}:{k[1]}' in style]

    def change_attrs_with_corresponding_tags(self, tag_initial_name: str):
        """
        Function adds <strong>, <u>, <sup> etc. instead of styles
        Parameters
        ----------
        tag_initial_name: str
            name of the tag before conversion, it is used for the outer wrapper

        Returns
        -------
        top_tag: Tag
            the initial tag object (renamed if some style became a tag)

        """
        to_remove = self.check_style_to_be_tag(self.style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            declaration = f'{attr}:{value}'
            self.style = self.style.replace(declaration + ';', '').strip()
            # processed style has no ';' after the last declaration,
            # so declaration in the last position is cut separately
            if self.style.endswith(declaration):
                self.style = self.style[:-len(declaration)].strip()

            name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
                attr, value)]
            if not i:
                # the first style renames the tag itself
                self.tag_inline_style.name = name
                new_tags.append(self.tag_inline_style)
            else:
                # every next style wraps the previous tag
                new_tag = BeautifulSoup(features='lxml').new_tag(name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_inline_style

        if new_tags:
            # attrs move from the renamed tag to a wrapper with the initial name
            tmp_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            top_tag2 = BeautifulSoup(features='lxml').new_tag(tag_initial_name)
            top_tag2.attrs = tmp_attrs
            if self.style:
                top_tag2.attrs['style'] = self.style
            else:
                # all styles became tags: don't keep the copied unprocessed style
                top_tag2.attrs.pop('style', None)
            new_tags[-1].wrap(top_tag2)
        else:
            top_tag.attrs['style'] = self.style
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in p -> span"""
        if tag.name == 'p' and tag.attrs.get('style'):
            styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                                   if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
            p_style = ''
            initial_style = tag.attrs['style']
            split_style = initial_style.replace('; ', ';').split(';')
            possible_p_attrs_regexp = re.compile(
                r'(text-align:)|(text-indent:)|(border-bottom:)|(border-top:)')
            # styles allowed in <p> are moved from initial_style to p_style
            for item in split_style:
                has_p_style_attrs = re.search(possible_p_attrs_regexp, item)
                if has_p_style_attrs:
                    p_style += item + ';'
                    initial_style = initial_style.replace(item + ';', '')
            # here check that this style is exactly the same.
            # Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
            styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
                '-' + attr not in initial_style) for attr in styles_cant_be_in_p]
            if any(styles_to_be_saved_in_span):
                # if we find styles that cannot be in <p> -> wrap them in span
                tag.name = 'span'
                p_tag = BeautifulSoup(features='lxml').new_tag('p')
                p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
                has_p_style_attr = re.search(p_attrs_regexp, initial_style)
                span_style = initial_style if not has_p_style_attr else initial_style.replace(
                    has_p_style_attr.group(1), '')
                p_tag.attrs['style'] = p_style
                tag.attrs['style'] = span_style
                tag.wrap(p_tag)
            else:
                tag.attrs['style'] = p_style

    @staticmethod
    def wrap_span_in_li_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in li -> span"""
        if tag.name == 'li' and tag.attrs.get('style'):
            styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
                                    attr not in ['text-align', 'list-style-type']]

            styles_to_be_saved_in_span = [attr in tag.attrs.get(
                'style') for attr in styles_cant_be_in_li]
            if any(styles_to_be_saved_in_span):
                tag.name = 'span'
                li_tag = BeautifulSoup(features='lxml').new_tag('li')
                span_style = tag.attrs['style']
                li_style = ''
                # styles allowed in <li> are moved from the span to the new li
                for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
                                                 re.compile(r'(list-style-type:(\w+);)')]:
                    has_li_style_attrs = re.search(
                        possible_li_attrs_regexp, span_style)
                    if has_li_style_attrs and has_li_style_attrs.group(1):
                        li_style += has_li_style_attrs.group(1)
                        span_style = span_style.replace(
                            has_li_style_attrs.group(1), '')
                li_tag.attrs['style'] = li_style
                tag.attrs['style'] = span_style
                tag.wrap(li_tag)

    @staticmethod
    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in ul/ol -> span"""
        if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
            styles_cant_be_in_ul_ol = [
                attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]

            styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
                                          for attr in styles_cant_be_in_ul_ol]
            if any(styles_to_be_saved_in_span):
                # remember the list kind BEFORE renaming, otherwise the wrapper would be a span too
                list_tag_name = tag.name
                tag.name = 'span'
                oul_tag = BeautifulSoup(features='lxml').new_tag(list_tag_name)
                span_style = tag.attrs['style']

                # empty by default, so a style without list-style-type doesn't leave the name unbound
                oul_style = ''
                possible_uol_attrs_regexp = re.compile(
                    r'(list-style-type:(\w+);)')
                has_uol_style_attrs = re.search(
                    possible_uol_attrs_regexp, span_style)
                if has_uol_style_attrs and has_uol_style_attrs.group(1):
                    oul_style = has_uol_style_attrs.group(1)
                    span_style = span_style.replace(oul_style, '')
                oul_tag.attrs['style'] = oul_style
                tag.attrs['style'] = span_style
                tag.wrap(oul_tag)

    @staticmethod
    def wrap_span_in_h_to_save_style_attrs(tag):
        """Function designed to save style attrs that cannot be in h -> span"""
        h_regexp = re.compile('(^h[1-9]$)')

        if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
            h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
            tag.name = 'span'
            tag.wrap(h_tag)
            style = tag.attrs['style']
            h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
            has_h_style_attr = re.search(h_attrs_regexp, style)
            tag.attrs['style'] = style if not has_h_style_attr else style.replace(
                has_h_style_attr.group(1), '')

    def convert_initial_tag(self):
        """
        Function converts the tag: style attrs are changed to corresponding tags,
        then 'wrap span' steps for p, li, ul/ol and h are applied to the result

        Returns
        -------
        tag returned by change_attrs_with_corresponding_tags

        """
        self.tag_inline_style = self.change_attrs_with_corresponding_tags(
            self.tag_inline_style.name)
        self.wrap_span_in_p_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_li_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_inline_style)
        self.wrap_span_in_h_to_save_style_attrs(self.tag_inline_style)
        return self.tag_inline_style
|
||||||
|
|
||||||
|
|
||||||
|
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style

    Parameters
    ----------
    html_soup: BeautifulSoup
        document whose markup (str(html_soup)) is passed to transform
    css_text: str
        stylesheet text to be inlined

    Returns
    -------
    inline_soup: BeautifulSoup
        new soup parsed from the inlined markup, with every styled
        div/p/span/code/kbd/var/li/ul/ol/td/th/h1-h9 tag processed by
        TagStyleConverter

    """
    # the epub namespace declaration is dropped because it causes problems
    namespace_rule = '@namespace epub "http://www.idpf.org/2007/ops";'
    cleaned_css = css_text.replace(namespace_rule, '')

    # css rules are merged into the style attribute of the matching tags
    inlined_markup: str = transform(
        str(html_soup),
        css_text=cleaned_css,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    inline_soup = BeautifulSoup(inlined_markup, features='lxml')

    # tag names whose inline style is converted, limited to tags that have a style attribute
    styleable_names = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    any_style = re.compile('.*')

    # inline style now holds both the original inline rules and the ones from css
    for styled_tag in inline_soup.find_all(styleable_names, attrs={'style': any_style}):
        TagStyleConverter(styled_tag).convert_initial_tag()
    return inline_soup
|
||||||
@@ -1,17 +1,26 @@
|
|||||||
class LiveCartaConfig:
|
class LiveCartaConfig:
|
||||||
"""Class of values that LiveCarta platform using and supports"""
|
"""Class of values that LiveCarta platform using and supports"""
|
||||||
|
# tag with inline style to be updated with style attribute
|
||||||
SUPPORTED_LEVELS = 5
|
SUPPORTED_LEVELS = 5
|
||||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
|
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
|
||||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
HEADERS_LEVELS = {"h1", "h2", "h3",
|
||||||
|
"h4", "h5", "h6", "h7", "h8", "h9"}
|
||||||
|
|
||||||
|
DEFAULT_ALIGN_STYLE = 'left'
|
||||||
|
|
||||||
|
ALIGN_STYLES = ['justify', 'right', 'center', 'left']
|
||||||
|
|
||||||
# Main constant values
|
# Main constant values
|
||||||
DEFAULT_FONT_NAME = 'Times New Roman'
|
DEFAULT_FONT_NAME = 'Times New Roman'
|
||||||
DEFAULT_ALIGN_STYLE = 'left'
|
|
||||||
ALIGN_STYLES = ['justify', 'right', 'center', 'left']
|
|
||||||
WORD_DEFAULT_FONT_SIZE = 11
|
WORD_DEFAULT_FONT_SIZE = 11
|
||||||
|
|
||||||
LIVECARTA_DEFAULT_FONT_SIZE = 18
|
LIVECARTA_DEFAULT_FONT_SIZE = 18
|
||||||
FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE / WORD_DEFAULT_FONT_SIZE
|
|
||||||
font_correspondence_table = {
|
FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\
|
||||||
|
WORD_DEFAULT_FONT_SIZE
|
||||||
|
|
||||||
|
FONT_CORRESPONDANCE_TABLE = {
|
||||||
"Arial": "arial,helvetica,sans-serif",
|
"Arial": "arial,helvetica,sans-serif",
|
||||||
"Comic Sans MS": "comic sans ms,cursive",
|
"Comic Sans MS": "comic sans ms,cursive",
|
||||||
"Courier New": "courier new,courier,monospace",
|
"Courier New": "courier new,courier,monospace",
|
||||||
@@ -61,4 +70,39 @@ class LiveCartaConfig:
|
|||||||
'gray': 'darkGray',
|
'gray': 'darkGray',
|
||||||
'grey': 'darkGray',
|
'grey': 'darkGray',
|
||||||
}
|
}
|
||||||
|
|
||||||
INDENT = '30px'
|
INDENT = '30px'
|
||||||
|
|
||||||
|
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
|
||||||
|
1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
|
||||||
|
1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
|
||||||
|
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
|
||||||
|
|
||||||
|
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px',
|
||||||
|
'17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px',
|
||||||
|
'26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
|
||||||
|
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px',
|
||||||
|
'44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
|
||||||
|
|
||||||
|
list_types = ['circle', 'disc', 'armenian', 'decimal',
|
||||||
|
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
|
||||||
|
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||||
|
|
||||||
|
"""
|
||||||
|
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
||||||
|
|
||||||
|
<p style="font-weight:600"> foo </p> -> <p><strong>foo</strong></p>
|
||||||
|
"""
|
||||||
|
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
||||||
|
('font-weight', 'bold'): 'strong',
|
||||||
|
('font-weight', '600'): 'strong',
|
||||||
|
('font-weight', '700'): 'strong',
|
||||||
|
('font-weight', '800'): 'strong',
|
||||||
|
('font-weight', '900'): 'strong',
|
||||||
|
('font-style', 'italic'): 'i',
|
||||||
|
('text-decoration', 'underline'): 'u',
|
||||||
|
('text-decoration', 'line-through'): 's',
|
||||||
|
('text-decoration-line', 'underline'): 'u',
|
||||||
|
('text-decoration-line', 'line-through'): 's',
|
||||||
|
('vertical-align', 'super'): 'sup'
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user