epub converter: update css_reader.py

This commit is contained in:
shirshasa
2021-09-13 18:53:06 +03:00
parent e3a4c14256
commit 7c0e4d1af2

View File

@@ -23,9 +23,9 @@ sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17p
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
'48px', '49px', '50px', '64px', '72px'] '48px', '49px', '50px', '64px', '72px']
list_types = ['circle', 'disc', 'armenian','decimal', list_types = ['circle', 'disc', 'armenian', 'decimal',
'decimal-leading-zero', 'georgian', 'lower-alpha','lower-latin', 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none' ] 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_font_size(value): def convert_font_size(value):
@@ -132,6 +132,8 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600"> foo </p> -> <p><strong>foo</strong></p>
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong', ('font-weight', 'bold'): 'strong',
@@ -231,22 +233,22 @@ class TagStyleConverter:
# if tag had already had inline style, add this to style parsed from css # if tag had already had inline style, add this to style parsed from css
if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style: if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style:
style += self.tag.attrs['style'] style += self.tag.attrs['style']
print(style)
return style return style
def change_attrs_with_corresponding_tags(self): def change_attrs_with_corresponding_tags(self):
# adds <b>, <u>, <sup>, etc # adds <b>, <u>, <sup>, etc
to_remove = check_style_to_be_tag(self.style) to_remove = check_style_to_be_tag(self.style)
new_tags = [] new_tags = []
for i, (p, v) in enumerate(to_remove): for i, (attr, value) in enumerate(to_remove):
s = f'{p}:{v};' s = f'{attr}:{value};'
self.style = self.style.replace(s, '') self.style = self.style.replace(s, '')
self.style = self.style.strip() self.style = self.style.strip()
if i == 0: if i == 0:
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)] self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tags.append(self.tag) new_tags.append(self.tag)
else: else:
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)] name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tag = BeautifulSoup(features='lxml').new_tag(name) new_tag = BeautifulSoup(features='lxml').new_tag(name)
new_tags[-1].wrap(new_tag) new_tags[-1].wrap(new_tag)
new_tags.append(new_tag) new_tags.append(new_tag)
@@ -267,34 +269,34 @@ class TagStyleConverter:
return top_tag return top_tag
@staticmethod @staticmethod
def wrap_p_to_save_style_attrs(t): def wrap_span_in_p_to_save_style_attrs(tag):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent']] if attr not in ['text-align', 'text-indent']]
if t.name == 'p' and t.attrs.get('style'): if tag.name == 'p' and tag.attrs.get('style'):
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_p] styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
if any(check): if any(styles_to_be_saved):
t.name = 'span' tag.name = 'span'
p_tag = BeautifulSoup(features='lxml').new_tag('p') p_tag = BeautifulSoup(features='lxml').new_tag('p')
old_style = t.attrs['style'] span_style = tag.attrs['style']
new_style = '' p_style = ''
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)') possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
has_p_style_attrs = re.search(possible_p_attrs_regexp, old_style) has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
if has_p_style_attrs: if has_p_style_attrs:
if has_p_style_attrs.group(1): if has_p_style_attrs.group(1):
new_style += has_p_style_attrs.group(1) p_style += has_p_style_attrs.group(1)
old_style = old_style.replace(has_p_style_attrs.group(1), '') span_style = span_style.replace(has_p_style_attrs.group(1), '')
if has_p_style_attrs.group(3): if has_p_style_attrs.group(3):
new_style += has_p_style_attrs.group(3) p_style += has_p_style_attrs.group(3)
old_style = old_style.replace(has_p_style_attrs.group(3), '') span_style = span_style.replace(has_p_style_attrs.group(3), '')
p_tag.attrs['style'] = new_style p_tag.attrs['style'] = p_style
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, old_style) has_li_style_attr = re.search(li_attrs_regexp, span_style)
old_style = old_style if not has_li_style_attr else old_style.replace(has_li_style_attr.group(1), '') span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '')
t.attrs['style'] = old_style tag.attrs['style'] = span_style
t.wrap(p_tag) tag.wrap(p_tag)
@staticmethod @staticmethod
def add_span_to_save_style_attrs_in_li(t): def add_span_to_save_style_attrs_in_li(t):
@@ -354,29 +356,28 @@ class TagStyleConverter:
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
def convert_initial_tag(self): def convert_initial_tag(self):
del self.tag.attrs['livecarta_id']
self.tag = self.change_attrs_with_corresponding_tags() self.tag = self.change_attrs_with_corresponding_tags()
self.wrap_p_to_save_style_attrs(self.tag) self.wrap_span_in_p_to_save_style_attrs(self.tag)
self.add_span_to_save_style_attrs_in_li(self.tag) self.add_span_to_save_style_attrs_in_li(self.tag)
self.add_span_to_save_style_attrs_in_ul_ol(self.tag) self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
self.add_span_to_save_style_attrs(self.tag) self.add_span_to_save_style_attrs(self.tag)
return self.tag return self.tag
def add_inline_style_to_html_soup(soup1, css_text): def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str):
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []
h_regex = f'(^h[1-9]$)' h_regex = f'(^h[1-9]$)'
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp) tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
for i, x in enumerate(elements_with_possible_style_attr): for i, x in enumerate(tags_with_possible_style_attr):
x.attrs['livecarta_id'] = i x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i) livecarta_tmp_ids.append(i)
html_with_inline_style = transform(str(soup1), css_text=css_text, html_with_inline_style: str = transform(str(soup1), css_text=css_text,
remove_classes=False, remove_classes=False,
external_styles=False, external_styles=False,
allow_network=False, allow_network=False,
disable_validation=True) disable_validation=True)
soup2 = BeautifulSoup(html_with_inline_style, features='lxml') soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
for i in livecarta_tmp_ids: for i in livecarta_tmp_ids:
@@ -385,8 +386,7 @@ def add_inline_style_to_html_soup(soup1, css_text):
if tag_with_style.attrs.get('style'): if tag_with_style.attrs.get('style'):
style_converter = TagStyleConverter(tag, tag_with_style) style_converter = TagStyleConverter(tag, tag_with_style)
style_converter.convert_initial_tag() style_converter.convert_initial_tag()
else: del tag.attrs['livecarta_id']
del tag.attrs['livecarta_id']
return soup1 return soup1