forked from LiveCarta/BookConverter
Make todos & refactor code
This commit is contained in:
@@ -8,8 +8,9 @@ from bs4 import BeautifulSoup
|
||||
from premailer import transform
|
||||
from itertools import takewhile
|
||||
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.util.color_reader import str2hex
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
|
||||
cssutils.log.setLevel(CRITICAL)
|
||||
@@ -211,9 +212,9 @@ def build_css_content(css_content):
|
||||
|
||||
|
||||
class TagStyleConverter:
|
||||
def __init__(self, tag_with_initial_style, tag_with_ultimate_style):
|
||||
self.tag_with_initial_style = tag_with_initial_style # tag with inline style to be updated with style attribute
|
||||
self.tag_initial_name = tag_with_initial_style.name
|
||||
def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
|
||||
self.tag_with_inline_style = tag_with_inline_style # tag with inline style to be updated with style attribute
|
||||
self.tag_initial_name = tag_with_inline_style.name
|
||||
self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file
|
||||
self.style = self.preprocess_style()
|
||||
|
||||
@@ -293,32 +294,39 @@ class TagStyleConverter:
|
||||
ultimate_style = ultimate_style.replace('background:', 'background-color:')
|
||||
ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
|
||||
|
||||
split_ultimate_style = ultimate_style.split(';') # make for repetition check and convert to px
|
||||
split_ultimate_style = ultimate_style.replace('; ',';').split(';')
|
||||
|
||||
# check for another ; in style string in preprocess_style()
|
||||
# splitting on ';' leaves '' entries when the style string ends with ';' - drop them
|
||||
while '' in split_ultimate_style:
|
||||
split_ultimate_style.remove('')
|
||||
ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
|
||||
|
||||
if self.tag_with_initial_style.attrs.get('style'):
|
||||
# normalize 'attr:   value' to 'attr:value' by dropping the whitespace that follows ':'
|
||||
split_ultimate_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style]
|
||||
|
||||
initial_style = self.tag_with_initial_style.attrs['style']
|
||||
split_initial_style = initial_style.split(';')
|
||||
if self.tag_with_inline_style.attrs.get('style'):
|
||||
inline_style = self.tag_with_inline_style.attrs['style']
|
||||
|
||||
# check for another ; in style string in preprocess_style()
|
||||
while '' in split_initial_style:
|
||||
split_initial_style.remove('')
|
||||
split_inline_style = inline_style.replace('; ',';').split(';')
|
||||
|
||||
# repetition check - if tag had already had inline style, add this to style parsed from css
|
||||
repeat_styles = list(set(split_ultimate_style) & set(split_initial_style))
|
||||
# splitting on ';' leaves '' entries when the style string ends with ';' - drop them
|
||||
while '' in split_inline_style:
|
||||
split_inline_style.remove('')
|
||||
|
||||
# normalize 'attr:   value' to 'attr:value' by dropping the whitespace that follows ':'
|
||||
split_inline_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style]
|
||||
|
||||
# repetition check: drop inline declarations that already appear in the css-derived style; whatever is left is appended to it
|
||||
repeat_styles = list(set(split_ultimate_style) & set(split_inline_style))
|
||||
for item in repeat_styles:
|
||||
split_initial_style.remove(item)
|
||||
split_inline_style.remove(item)
|
||||
|
||||
if split_initial_style:
|
||||
# if initial style is not empty - start convert and add to ultimate style
|
||||
if split_inline_style:
|
||||
# if inline style is not empty - start convert and add to ultimate style
|
||||
print('we enter repetition check', '\n')
|
||||
initial_style: str = self.process_indents_to_px(split_initial_style)
|
||||
ultimate_style += initial_style
|
||||
inline_style: str = self.process_indents_to_px(split_inline_style)
|
||||
ultimate_style += inline_style
|
||||
|
||||
ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
|
||||
return ultimate_style
|
||||
|
||||
def change_attrs_with_corresponding_tags(self):
|
||||
@@ -330,15 +338,15 @@ class TagStyleConverter:
|
||||
self.style = self.style.replace(s, '')
|
||||
self.style = self.style.strip()
|
||||
if i == 0:
|
||||
self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||
new_tags.append(self.tag_with_initial_style)
|
||||
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||
new_tags.append(self.tag_with_inline_style)
|
||||
else:
|
||||
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag(name)
|
||||
new_tags[-1].wrap(new_tag)
|
||||
new_tags.append(new_tag)
|
||||
|
||||
top_tag = self.tag_with_initial_style
|
||||
top_tag = self.tag_with_inline_style
|
||||
|
||||
if new_tags:
|
||||
tmp_attrs = top_tag.attrs.copy()
|
||||
@@ -355,10 +363,12 @@ class TagStyleConverter:
|
||||
|
||||
@staticmethod
|
||||
def wrap_span_in_p_to_save_style_attrs(tag):
|
||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||
if attr not in ['text-align', 'text-indent', 'border-bottom']]
|
||||
|
||||
'''Function designed to save style attrs that cannot be in p -> span
|
||||
that cannot be in span -> p'''
|
||||
if tag.name == 'p' and tag.attrs.get('style'):
|
||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||
if attr not in ['text-align', 'text-indent', 'border-bottom']]
|
||||
|
||||
styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
|
||||
if any(styles_to_be_saved):
|
||||
tag.name = 'span'
|
||||
@@ -388,83 +398,81 @@ class TagStyleConverter:
|
||||
tag.wrap(p_tag)
|
||||
|
||||
@staticmethod
def wrap_span_in_li_to_save_style_attrs(tag):
    """Split the inline style of an ``li`` between a new outer ``li`` and an inner ``span``.

    Nothing happens unless ``tag`` is an ``li`` with a non-empty ``style``
    attribute that mentions at least one LIVECARTA_STYLE_ATTRS entry other
    than ``text-align`` / ``list-style-type``. In that case ``tag`` is renamed
    to ``span`` and wrapped in a freshly created ``li``; the ``text-align`` and
    ``list-style-type`` declarations are moved to the wrapper, everything else
    stays on the span.

    :param tag: tag to inspect; modified in place.
    :return: None
    """
    style = tag.attrs.get('style')
    if tag.name != 'li' or not style:
        return

    attrs_kept_on_li = ('text-align', 'list-style-type')
    styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS
                            if attr not in attrs_kept_on_li]
    # plain substring test against the raw style string, as before
    if not any(attr in style for attr in styles_cant_be_in_li):
        return

    tag.name = 'span'
    li_tag = BeautifulSoup(features='lxml').new_tag('li')
    span_style = style
    li_style = ''

    for li_attr in attrs_kept_on_li:
        # [\w-]+ instead of \w+: values such as 'lower-alpha' or 'upper-roman'
        # contain a hyphen and were never matched, so they stayed on the span
        found = re.search(r'(%s:([\w-]+);)' % li_attr, span_style)
        if found:
            declaration = found.group(1)
            li_style += declaration
            span_style = span_style.replace(declaration, '')

    li_tag.attrs['style'] = li_style
    tag.attrs['style'] = span_style
    tag.wrap(li_tag)
|
||||
|
||||
@staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
    """Split the inline style of a ``ul``/``ol`` between a new outer list tag and an inner ``span``.

    Nothing happens unless ``tag`` is a ``ul`` or ``ol`` with a non-empty
    ``style`` attribute that mentions at least one LIVECARTA_STYLE_ATTRS entry
    other than ``list-style-type``. In that case ``tag`` is renamed to ``span``
    and wrapped in a new list tag of the same kind as the original; a
    ``list-style-type`` declaration, if present, is moved to the wrapper and
    the remaining style stays on the span.

    :param tag: tag to inspect; modified in place.
    :return: None
    """
    style = tag.attrs.get('style')
    if tag.name not in ('ul', 'ol') or not style:
        return

    styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS
                               if attr not in ['list-style-type']]
    # plain substring test against the raw style string, as before
    if not any(attr in style for attr in styles_cant_be_in_ul_ol):
        return

    # Build the wrapper from the original name BEFORE renaming the tag:
    # a hard-coded 'ul' here turned every styled ordered list into an unordered one.
    list_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
    tag.name = 'span'
    span_style = style

    # [\w-]+ instead of \w+ so hyphenated values ('lower-alpha', 'upper-roman') match
    found = re.search(r'(list-style-type:([\w-]+);)', span_style)
    if found:
        list_style = found.group(1)
        span_style = span_style.replace(list_style, '')
        list_tag.attrs['style'] = list_style

    tag.attrs['style'] = span_style
    tag.wrap(list_tag)
|
||||
|
||||
@staticmethod
def wrap_span_in_h_to_save_style_attrs(tag):
    """Move the inline style of a heading onto an inner ``span``.

    Nothing happens unless ``tag`` is named ``h1`` .. ``h9`` and has a
    non-empty ``style`` attribute. In that case ``tag`` is renamed to ``span``
    and wrapped in a new, unstyled tag carrying the original heading name.
    A ``list-style-type`` declaration, if present, is removed from the span's
    style.

    :param tag: tag to inspect; modified in place.
    :return: None
    """
    if not re.search(r'(^h[1-9]$)', tag.name) or not tag.attrs.get('style'):
        return

    h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
    tag.name = 'span'
    tag.wrap(h_tag)

    style = tag.attrs['style']
    # [\w-]+ instead of \w+ so hyphenated values ('lower-alpha', 'upper-roman')
    # are stripped as well
    found = re.search(r'(list-style-type:([\w-]+);)', style)
    tag.attrs['style'] = style.replace(found.group(1), '') if found else style
|
||||
|
||||
def convert_initial_tag(self):
    """Run the full conversion pipeline on the tag and return the result.

    First replaces ``self.tag_with_inline_style`` with whatever
    ``change_attrs_with_corresponding_tags()`` returns, then passes that tag
    through the p / li / ul-ol / h wrapping steps, in that order.

    :return: the value stored in ``self.tag_with_inline_style``.
    """
    converted = self.change_attrs_with_corresponding_tags()
    self.tag_with_inline_style = converted

    wrapping_steps = (
        self.wrap_span_in_p_to_save_style_attrs,
        self.wrap_span_in_li_to_save_style_attrs,
        self.wrap_span_in_ul_ol_to_save_style_attrs,
        self.wrap_span_in_h_to_save_style_attrs,
    )
    for wrap_step in wrapping_steps:
        wrap_step(converted)

    return self.tag_with_inline_style
|
||||
|
||||
|
||||
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
||||
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||
livecarta_tmp_ids = []
|
||||
h_regex = f'(^h[1-9]$)'
|
||||
could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
|
||||
could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
|
||||
tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp)
|
||||
for i, x in enumerate(tags_with_possible_style_attr):
|
||||
x.attrs['livecarta_id'] = i
|
||||
livecarta_tmp_ids.append(i)
|
||||
|
||||
# here we add css styles to inline style
|
||||
# sometimes in html_with_css_styles
|
||||
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
||||
remove_classes=False,
|
||||
external_styles=False,
|
||||
@@ -474,6 +482,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
||||
|
||||
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
|
||||
|
||||
# go through tags with possible style attrs
|
||||
for i in livecarta_tmp_ids:
|
||||
tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i})
|
||||
tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})
|
||||
|
||||
Reference in New Issue
Block a user