Refix Index Bug

This commit is contained in:
Kiryl
2021-09-28 12:12:38 +03:00
parent 3c579210ff
commit 955a64380c
2 changed files with 71 additions and 51 deletions

View File

@@ -56,15 +56,20 @@ def convert_font_size(value):
except ValueError: except ValueError:
return '' return ''
def convert_text_indent(value): def convert_indents(value):
if value[0] != '-': if '-' not in value[0]:
positive_text_indent_regexp = re.compile(r'(\w+%)') # 30px = 3.2% = 1.25em = 23pt
has_css_style_attrs = re.search(positive_text_indent_regexp, value) positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)')
if has_css_style_attrs: has_style_attrs = re.search(positive_text_indent_regexp, value)
if has_css_style_attrs.group(1): if has_style_attrs:
value = value.replace(has_css_style_attrs.group(1), if has_style_attrs.group(1):
str(int("".join(filter(str.isdigit, str(has_css_style_attrs.group(1))))) * 6) + value = value.replace(has_style_attrs.group(1),
'px') str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) +
'%')
# elif has_style_attrs.group(2):
# value = value.replace(has_style_attrs.group(2),
# str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) +
# '%')
return value return value
else: else:
return '' return ''
@@ -99,7 +104,8 @@ LIVECARTA_STYLE_ATTRS = {
'border-bottom-width': [], 'border-bottom-width': [],
'border': [], 'border': [],
'list-style-type': [], 'list-style-type': [],
'list-style-image': [] 'list-style-image': [],
'margin-left': []
} }
""" """
@@ -123,7 +129,7 @@ def get_text_color(x):
LIVECARTA_STYLE_ATTRS_MAPPING = { LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_text_indent, #'text-indent': convert_indents,
'font-variant': lambda x: x, 'font-variant': lambda x: x,
'text-align': lambda x: x, 'text-align': lambda x: x,
'font': lambda x: '', 'font': lambda x: '',
@@ -139,6 +145,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
'border-bottom-width': lambda x: x if x != '0' else '', 'border-bottom-width': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-type': lambda x: x if x in list_types else 'disc',
'list-style-image': lambda x: 'disc', 'list-style-image': lambda x: 'disc',
'margin-left': lambda x: x
} }
""" """
@@ -235,52 +242,65 @@ class TagStyleConverter:
style_ = style_.replace('color:white;', '') style_ = style_.replace('color:white;', '')
return style_ return style_
@staticmethod
def convert_indentions_to_px(style):
margin_left_regexp = re.compile(
r'(margin-left:( *-*\w+%*);*)')
text_indent_regexp = re.compile(
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
has_margin_left = re.search(margin_left_regexp, style)
has_text_indent = re.search(text_indent_regexp, style)
# consider that 5% = 30px
if has_margin_left and has_text_indent:
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
if has_text_indent.group(1):
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(has_margin_left.group(1), '')
return style
elif has_text_indent.group(3):
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(4))))) * 6)
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(has_margin_left.group(1), '')
return style
elif has_text_indent:
if has_text_indent.group(1):
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)) + 'px; ')
return style
elif has_text_indent.group(3):
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
str("".join(
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
return style
elif has_margin_left:
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
style = style.replace(has_margin_left.group(1), 'text-indent: ' +
str(abs(num_ml)) + 'px; ')
return style
return style
def preprocess_style(self): def preprocess_style(self):
style = self.tag_with_style.attrs.get('style') + ';' style = self.tag_with_style.attrs.get('style') + ';'
style = self.remove_white_if_no_bgcolor(style, self.tag_with_style) style = self.remove_white_if_no_bgcolor(style, self.tag_with_style)
style = style.replace('background:', 'background-color:') style = style.replace('background:', 'background-color:')
style = style.replace('list-style-image', 'list-style-type') style = style.replace('list-style-image', 'list-style-type')
positive_text_indent_regexp = re.compile( # todo: make hmtl_reader + do a repetition check with inline_style
r'(text-indent:( *\w+%*);*)') style = self.convert_indentions_to_px(style)
has_css_style_attrs = re.search(positive_text_indent_regexp, style)
if has_css_style_attrs:
if has_css_style_attrs.group(1):
style = style.replace(has_css_style_attrs.group(1), 'text-indent: ' +
str(int("".join(filter(str.isdigit, str(has_css_style_attrs.group(2))))) * 6) +
'px; ')
negative_text_indent_regexp = re.compile(
r'((text-indent:( *-\w+%*);) *(margin-left:( *\w+%*);))|(text-indent:( *-\w+%*);*)')
has_css_style_attrs = re.search(negative_text_indent_regexp, style)
if has_css_style_attrs:
if has_css_style_attrs.group(1):
style = style.replace(has_css_style_attrs.group(1), '')
if has_css_style_attrs.group(6):
style = style.replace(has_css_style_attrs.group(6), '')
# if tag had already had inline style, add this to style parsed from css # if tag had already had inline style, add this to style parsed from css
if self.tag.attrs.get('style'): if self.tag.attrs.get('style'):
inline_style = self.tag.attrs['style'] inline_style = self.convert_indentions_to_px(self.tag.attrs['style'])
has_inline_style_attrs = re.search(positive_text_indent_regexp, inline_style)
if has_inline_style_attrs:
if has_inline_style_attrs.group(1):
inline_style = inline_style.replace(has_inline_style_attrs.group(1), 'text-indent: ' +
str(int("".join(filter(str.isdigit,
str(has_inline_style_attrs.group(
2))))) * 6) +
'px; ')
has_inline_style_attrs = re.search(negative_text_indent_regexp, inline_style)
if has_inline_style_attrs:
if has_inline_style_attrs.group(1):
inline_style = inline_style.replace(has_inline_style_attrs.group(1), 'text-indent: ' +
str(int("".join(filter(str.isdigit,
str(has_inline_style_attrs.group(
5))))) * 6) +
'px; ')
if has_inline_style_attrs.group(6):
inline_style = inline_style.replace(has_inline_style_attrs.group(6), '')
if self.tag.attrs['style'] not in style:
style += inline_style style += inline_style
return style return style

View File

@@ -442,7 +442,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../epub/9781614389170.epub', json_converter = EpubConverter('../epub/9781634256063.epub',
logger=logger_object) logger=logger_object)
tmp = json_converter.convert_to_dict() tmp = json_converter.convert_to_dict()