From 955a64380cc47094be7666b1b7b68550f3cacf43 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 28 Sep 2021 12:12:38 +0300 Subject: [PATCH] Refix Index Bug --- src/css_reader.py | 120 ++++++++++++++++++++++++------------------ src/epub_converter.py | 2 +- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/src/css_reader.py b/src/css_reader.py index 68871b3..8d5c2aa 100644 --- a/src/css_reader.py +++ b/src/css_reader.py @@ -56,15 +56,20 @@ def convert_font_size(value): except ValueError: return '' -def convert_text_indent(value): - if value[0] != '-': - positive_text_indent_regexp = re.compile(r'(\w+%)') - has_css_style_attrs = re.search(positive_text_indent_regexp, value) - if has_css_style_attrs: - if has_css_style_attrs.group(1): - value = value.replace(has_css_style_attrs.group(1), - str(int("".join(filter(str.isdigit, str(has_css_style_attrs.group(1))))) * 6) + - 'px') +def convert_indents(value): + if '-' not in value[0]: + # 30px = 3.2% = 1.25em = 23pt + positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)') + has_style_attrs = re.search(positive_text_indent_regexp, value) + if has_style_attrs: + if has_style_attrs.group(1): + value = value.replace(has_style_attrs.group(1), + str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) + + '%') + # elif has_style_attrs.group(2): + # value = value.replace(has_style_attrs.group(2), + # str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) + + # '%') return value else: return '' @@ -99,7 +104,8 @@ LIVECARTA_STYLE_ATTRS = { 'border-bottom-width': [], 'border': [], 'list-style-type': [], - 'list-style-image': [] + 'list-style-image': [], + 'margin-left': [] } """ @@ -123,7 +129,7 @@ def get_text_color(x): LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': convert_text_indent, + #'text-indent': convert_indents, 'font-variant': lambda x: x, 'text-align': lambda x: x, 'font': lambda x: '', @@ -139,6 +145,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { 'border-bottom-width': lambda x: x if x != '0' else '', 'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-image': lambda x: 'disc', + 'margin-left': lambda x: x } """ @@ -235,53 +242,66 @@ class TagStyleConverter: style_ = style_.replace('color:white;', '') return style_ + @staticmethod + def convert_indentions_to_px(style): + margin_left_regexp = re.compile( + r'(margin-left:( *-*\w+%*);*)') + text_indent_regexp = re.compile( + r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)') + + has_margin_left = re.search(margin_left_regexp, style) + has_text_indent = re.search(text_indent_regexp, style) + # consider that 5% = 30px + if has_margin_left and has_text_indent: + num_ml = abs(int("".join( + filter(str.isdigit, str(has_margin_left.group(2))))) * 6) + if has_text_indent.group(1): + num_ti = abs(int("".join( + filter(str.isdigit, str(has_text_indent.group(2))))) * 6) + style = style.replace(has_text_indent.group(1), 'text-indent: ' + + str(abs(num_ml - num_ti)) + 'px; ') + style = style.replace(has_margin_left.group(1), '') + return style + + elif has_text_indent.group(3): + num_ti = abs(int("".join( + filter(str.isdigit, str(has_text_indent.group(4))))) * 6) + style = style.replace(has_text_indent.group(3), 'text-indent: ' + + str(abs(num_ml - num_ti)) + 'px; ') + style = style.replace(has_margin_left.group(1), '') + return style + + elif has_text_indent: + if has_text_indent.group(1): + style = style.replace(has_text_indent.group(1), 'text-indent: ' + + str(abs(int("".join( + filter(str.isdigit, str(has_text_indent.group(2))))) * 6)) + 'px; ') + return style + elif has_text_indent.group(3): + style = style.replace(has_text_indent.group(3), 'text-indent: ' + + str("".join( + filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ') + return style + elif has_margin_left: + num_ml = abs(int("".join( + filter(str.isdigit, str(has_margin_left.group(2))))) * 6) + style = style.replace(has_margin_left.group(1), 'text-indent: ' + + str(abs(num_ml)) + 'px; ') + return style + return style + def preprocess_style(self): style = self.tag_with_style.attrs.get('style') + ';' style = self.remove_white_if_no_bgcolor(style, self.tag_with_style) style = style.replace('background:', 'background-color:') style = style.replace('list-style-image', 'list-style-type') - positive_text_indent_regexp = re.compile( - r'(text-indent:( *\w+%*);*)') - has_css_style_attrs = re.search(positive_text_indent_regexp, style) - if has_css_style_attrs: - if has_css_style_attrs.group(1): - style = style.replace(has_css_style_attrs.group(1), 'text-indent: ' + - str(int("".join(filter(str.isdigit, str(has_css_style_attrs.group(2))))) * 6) + - 'px; ') - - negative_text_indent_regexp = re.compile( - r'((text-indent:( *-\w+%*);) *(margin-left:( *\w+%*);))|(text-indent:( *-\w+%*);*)') - has_css_style_attrs = re.search(negative_text_indent_regexp, style) - if has_css_style_attrs: - if has_css_style_attrs.group(1): - style = style.replace(has_css_style_attrs.group(1), '') - if has_css_style_attrs.group(6): - style = style.replace(has_css_style_attrs.group(6), '') - + # todo: make hmtl_reader + do a repetition check with inline_style + style = self.convert_indentions_to_px(style) # if tag had already had inline style, add this to style parsed from css if self.tag.attrs.get('style'): - inline_style = self.tag.attrs['style'] - has_inline_style_attrs = re.search(positive_text_indent_regexp, inline_style) - if has_inline_style_attrs: - if has_inline_style_attrs.group(1): - inline_style = inline_style.replace(has_inline_style_attrs.group(1), 'text-indent: ' + - str(int("".join(filter(str.isdigit, - str(has_inline_style_attrs.group( - 2))))) * 6) + - 'px; ') - has_inline_style_attrs = re.search(negative_text_indent_regexp, inline_style) - if has_inline_style_attrs: - if has_inline_style_attrs.group(1): - inline_style = inline_style.replace(has_inline_style_attrs.group(1), 'text-indent: ' + - str(int("".join(filter(str.isdigit, - str(has_inline_style_attrs.group( - 5))))) * 6) + - 'px; ') - if has_inline_style_attrs.group(6): - inline_style = inline_style.replace(has_inline_style_attrs.group(6), '') - if self.tag.attrs['style'] not in style: - style += inline_style + inline_style = self.convert_indentions_to_px(self.tag.attrs['style']) + style += inline_style return style diff --git a/src/epub_converter.py b/src/epub_converter.py index 798d81a..ead91d2 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -442,7 +442,7 @@ if __name__ == "__main__": logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubConverter('../epub/9781614389170.epub', + json_converter = EpubConverter('../epub/9781634256063.epub', logger=logger_object) tmp = json_converter.convert_to_dict()