From 8c3748261666488d51d0e0fd5907e5dba7b1f3a6 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 26 Oct 2021 17:47:51 +0300 Subject: [PATCH] Add lines with style 'border-bottom' --- src/epub_converter/css_reader.py | 15 +++++---- src/epub_converter/html_epub_preprocessor.py | 32 ++++++++++---------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index ff35717..93e199f 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -75,6 +75,7 @@ def convert_indents(value): value = value.replace(has_style_attrs.group(4), str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px') return value + """ LIVECARTA_STYLE_ATTRS = { css property: value } @@ -130,7 +131,6 @@ def get_text_color(x): color = color if color not in ['#000000', '#000', 'black'] else '' return color - LIVECARTA_STYLE_ATTRS_MAPPING = { 'text-indent': convert_indents, 'font-variant': lambda x: x, @@ -293,7 +293,7 @@ class TagStyleConverter: ultimate_style = ultimate_style.replace('background:', 'background-color:') ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type') - split_ultimate_style = ultimate_style.replace(' ', '').split(';') # make for repetition check and convert to px + split_ultimate_style = ultimate_style.split(';') # make for repetition check and convert to px # check for another ; in style string in preprocess_style() while '' in split_ultimate_style: @@ -303,7 +303,7 @@ class TagStyleConverter: if self.tag_with_initial_style.attrs.get('style'): initial_style = self.tag_with_initial_style.attrs['style'] - split_initial_style = initial_style.replace(' ', '').split(';') + split_initial_style = initial_style.split(';') # check for another ; in style string in preprocess_style() while '' in split_initial_style: @@ -356,7 +356,7 @@ class TagStyleConverter: @staticmethod def wrap_span_in_p_to_save_style_attrs(tag): styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS - if attr not in ['text-align', 'text-indent']] + if attr not in ['text-align', 'text-indent', 'border-bottom']] if tag.name == 'p' and tag.attrs.get('style'): styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p] @@ -365,7 +365,7 @@ class TagStyleConverter: p_tag = BeautifulSoup(features='lxml').new_tag('p') span_style = tag.attrs['style'] p_style = '' - possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)') + possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)|(border-bottom:( *\w+);*)') for i in range(span_style.count(';') + 1): has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style) if has_p_style_attrs: @@ -375,6 +375,9 @@ class TagStyleConverter: if has_p_style_attrs.group(3): p_style += has_p_style_attrs.group(3) span_style = span_style.replace(has_p_style_attrs.group(3), '') + if has_p_style_attrs.group(5): + p_style += span_style + span_style = span_style.replace(span_style, '') p_tag.attrs['style'] = p_style @@ -388,7 +391,7 @@ class TagStyleConverter: def add_span_to_save_style_attrs_in_li(t): if t.name == 'li' and t.attrs.get('style'): styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if - attr not in ['text-align', 'list-style-type']] + attr not in ['text-align', 'list-style-type', 'border-bottom']] check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li] if any(check): diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 91bdf79..f842f63 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -144,7 +144,7 @@ def clean_headings_content(content: Tag, title: str): break -def _heading_tag2p_tag(body_tag): +def heading_tag_to_p_tag(body_tag): """ Function to convert all lower level headings to p tags """ @@ -267,7 +267,7 @@ def unwrap_structural_tags(body_tag): if not tag_.parent.attrs.get('class'): tag_.parent.attrs['class'] = tag_class - def _preserve_class_in_section_tag(tag_) -> bool: + def preserve_class_in_section_tag(tag_) -> bool: # to save css style inherited from class, copy class to child

# this is for Wiley books with boxes # returns True, if

could be unwrapped @@ -288,10 +288,10 @@ def unwrap_structural_tags(body_tag): else: return True - def _add_table_to_abc_books(tag_, border, bg_color): + def add_table_to_abc_books(tag_, border, bg_color): wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color) - def _add_span_to_save_ids_for_links(tag_to_be_removed): + def add_span_to_save_ids_for_links(tag_to_be_removed): if tag_to_be_removed.attrs.get('id'): insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, id_=tag_to_be_removed.attrs['id'], @@ -311,17 +311,17 @@ def unwrap_structural_tags(body_tag): if div.attrs.get('class'): div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0] if div_class in ['C409', 'C409a']: - _add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9') + add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9') elif div_class in ['C441', 'C816']: - _add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8') + add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8') if div.attrs.get('style'): if 'background-color' in div.attrs['style']: end_index = div.attrs['style'].find('background-color') + len('background-color') start_index_of_color = end_index + 2 bg_color = div.attrs['style'][start_index_of_color:start_index_of_color+7] - _add_table_to_abc_books(div, border='', bg_color=bg_color) + add_table_to_abc_books(div, border='', bg_color=bg_color) if div.attrs.get('style') == '': del div.attrs['style'] @@ -331,19 +331,19 @@ def unwrap_structural_tags(body_tag): div.name = 'p' continue - _add_span_to_save_ids_for_links(div) + add_span_to_save_ids_for_links(div) div.unwrap() for s in body_tag.find_all("section"): could_be_unwrapped = True if s.attrs.get('class'): - could_be_unwrapped = _preserve_class_in_section_tag(s) - _add_span_to_save_ids_for_links(s) + could_be_unwrapped = preserve_class_in_section_tag(s) + add_span_to_save_ids_for_links(s) if could_be_unwrapped: s.unwrap() for s in body_tag.find_all("article"): - _add_span_to_save_ids_for_links(s) + add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("figure"): @@ -351,22 +351,22 @@ def unwrap_structural_tags(body_tag): s.attrs['style'] = "text-align: center;" # to center image inside this tag for s in body_tag.find_all("figcaption"): - _add_span_to_save_ids_for_links(s) + add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("aside"): s.name = 'blockquote' for s in body_tag.find_all("main"): - _add_span_to_save_ids_for_links(s) + add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("body"): - _add_span_to_save_ids_for_links(s) + add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("html"): - _add_span_to_save_ids_for_links(s) + add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("header"): @@ -385,7 +385,7 @@ def unwrap_structural_tags(body_tag): parents_marks_are_body = [x.parent == body_tag for x in marks] assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' - _heading_tag2p_tag(body_tag) + heading_tag_to_p_tag(body_tag) # wrap NavigableString with

for node in body_tag: