epub converter: update css_reader.py

This commit is contained in:
shirshasa
2021-09-13 18:53:06 +03:00
parent e3a4c14256
commit 7c0e4d1af2

View File

@@ -132,6 +132,8 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600"> foo </p> -> <p><strong>foo</strong></p>
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong', ('font-weight', 'bold'): 'strong',
@@ -231,22 +233,22 @@ class TagStyleConverter:
# if tag had already had inline style, add this to style parsed from css # if tag had already had inline style, add this to style parsed from css
if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style: if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style:
style += self.tag.attrs['style'] style += self.tag.attrs['style']
print(style)
return style return style
def change_attrs_with_corresponding_tags(self): def change_attrs_with_corresponding_tags(self):
# adds <b>, <u>, <sup>, etc # adds <b>, <u>, <sup>, etc
to_remove = check_style_to_be_tag(self.style) to_remove = check_style_to_be_tag(self.style)
new_tags = [] new_tags = []
for i, (p, v) in enumerate(to_remove): for i, (attr, value) in enumerate(to_remove):
s = f'{p}:{v};' s = f'{attr}:{value};'
self.style = self.style.replace(s, '') self.style = self.style.replace(s, '')
self.style = self.style.strip() self.style = self.style.strip()
if i == 0: if i == 0:
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)] self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tags.append(self.tag) new_tags.append(self.tag)
else: else:
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)] name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tag = BeautifulSoup(features='lxml').new_tag(name) new_tag = BeautifulSoup(features='lxml').new_tag(name)
new_tags[-1].wrap(new_tag) new_tags[-1].wrap(new_tag)
new_tags.append(new_tag) new_tags.append(new_tag)
@@ -267,34 +269,34 @@ class TagStyleConverter:
return top_tag return top_tag
@staticmethod @staticmethod
def wrap_p_to_save_style_attrs(t): def wrap_span_in_p_to_save_style_attrs(tag):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent']] if attr not in ['text-align', 'text-indent']]
if t.name == 'p' and t.attrs.get('style'): if tag.name == 'p' and tag.attrs.get('style'):
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_p] styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
if any(check): if any(styles_to_be_saved):
t.name = 'span' tag.name = 'span'
p_tag = BeautifulSoup(features='lxml').new_tag('p') p_tag = BeautifulSoup(features='lxml').new_tag('p')
old_style = t.attrs['style'] span_style = tag.attrs['style']
new_style = '' p_style = ''
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)') possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
has_p_style_attrs = re.search(possible_p_attrs_regexp, old_style) has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
if has_p_style_attrs: if has_p_style_attrs:
if has_p_style_attrs.group(1): if has_p_style_attrs.group(1):
new_style += has_p_style_attrs.group(1) p_style += has_p_style_attrs.group(1)
old_style = old_style.replace(has_p_style_attrs.group(1), '') span_style = span_style.replace(has_p_style_attrs.group(1), '')
if has_p_style_attrs.group(3): if has_p_style_attrs.group(3):
new_style += has_p_style_attrs.group(3) p_style += has_p_style_attrs.group(3)
old_style = old_style.replace(has_p_style_attrs.group(3), '') span_style = span_style.replace(has_p_style_attrs.group(3), '')
p_tag.attrs['style'] = new_style p_tag.attrs['style'] = p_style
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, old_style) has_li_style_attr = re.search(li_attrs_regexp, span_style)
old_style = old_style if not has_li_style_attr else old_style.replace(has_li_style_attr.group(1), '') span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '')
t.attrs['style'] = old_style tag.attrs['style'] = span_style
t.wrap(p_tag) tag.wrap(p_tag)
@staticmethod @staticmethod
def add_span_to_save_style_attrs_in_li(t): def add_span_to_save_style_attrs_in_li(t):
@@ -354,25 +356,24 @@ class TagStyleConverter:
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
def convert_initial_tag(self): def convert_initial_tag(self):
del self.tag.attrs['livecarta_id']
self.tag = self.change_attrs_with_corresponding_tags() self.tag = self.change_attrs_with_corresponding_tags()
self.wrap_p_to_save_style_attrs(self.tag) self.wrap_span_in_p_to_save_style_attrs(self.tag)
self.add_span_to_save_style_attrs_in_li(self.tag) self.add_span_to_save_style_attrs_in_li(self.tag)
self.add_span_to_save_style_attrs_in_ul_ol(self.tag) self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
self.add_span_to_save_style_attrs(self.tag) self.add_span_to_save_style_attrs(self.tag)
return self.tag return self.tag
def add_inline_style_to_html_soup(soup1, css_text): def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str):
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []
h_regex = f'(^h[1-9]$)' h_regex = f'(^h[1-9]$)'
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp) tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
for i, x in enumerate(elements_with_possible_style_attr): for i, x in enumerate(tags_with_possible_style_attr):
x.attrs['livecarta_id'] = i x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i) livecarta_tmp_ids.append(i)
html_with_inline_style = transform(str(soup1), css_text=css_text, html_with_inline_style: str = transform(str(soup1), css_text=css_text,
remove_classes=False, remove_classes=False,
external_styles=False, external_styles=False,
allow_network=False, allow_network=False,
@@ -385,7 +386,6 @@ def add_inline_style_to_html_soup(soup1, css_text):
if tag_with_style.attrs.get('style'): if tag_with_style.attrs.get('style'):
style_converter = TagStyleConverter(tag, tag_with_style) style_converter = TagStyleConverter(tag, tag_with_style)
style_converter.convert_initial_tag() style_converter.convert_initial_tag()
else:
del tag.attrs['livecarta_id'] del tag.attrs['livecarta_id']
return soup1 return soup1