forked from LiveCarta/BookConverter
Add lines with style 'border-bottom'
This commit is contained in:
@@ -75,6 +75,7 @@ def convert_indents(value):
|
|||||||
value = value.replace(has_style_attrs.group(4),
|
value = value.replace(has_style_attrs.group(4),
|
||||||
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px')
|
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(4))))))) + 'px')
|
||||||
return value
|
return value
|
||||||
|
|
||||||
"""
|
"""
|
||||||
LIVECARTA_STYLE_ATTRS = { css property: value }
|
LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||||
|
|
||||||
@@ -130,7 +131,6 @@ def get_text_color(x):
|
|||||||
color = color if color not in ['#000000', '#000', 'black'] else ''
|
color = color if color not in ['#000000', '#000', 'black'] else ''
|
||||||
return color
|
return color
|
||||||
|
|
||||||
|
|
||||||
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||||
'text-indent': convert_indents,
|
'text-indent': convert_indents,
|
||||||
'font-variant': lambda x: x,
|
'font-variant': lambda x: x,
|
||||||
@@ -293,7 +293,7 @@ class TagStyleConverter:
|
|||||||
ultimate_style = ultimate_style.replace('background:', 'background-color:')
|
ultimate_style = ultimate_style.replace('background:', 'background-color:')
|
||||||
ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
|
ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
|
||||||
|
|
||||||
split_ultimate_style = ultimate_style.replace(' ', '').split(';') # make for repetition check and convert to px
|
split_ultimate_style = ultimate_style.split(';') # make for repetition check and convert to px
|
||||||
|
|
||||||
# check for another ; in style string in preprocess_style()
|
# check for another ; in style string in preprocess_style()
|
||||||
while '' in split_ultimate_style:
|
while '' in split_ultimate_style:
|
||||||
@@ -303,7 +303,7 @@ class TagStyleConverter:
|
|||||||
if self.tag_with_initial_style.attrs.get('style'):
|
if self.tag_with_initial_style.attrs.get('style'):
|
||||||
|
|
||||||
initial_style = self.tag_with_initial_style.attrs['style']
|
initial_style = self.tag_with_initial_style.attrs['style']
|
||||||
split_initial_style = initial_style.replace(' ', '').split(';')
|
split_initial_style = initial_style.split(';')
|
||||||
|
|
||||||
# check for another ; in style string in preprocess_style()
|
# check for another ; in style string in preprocess_style()
|
||||||
while '' in split_initial_style:
|
while '' in split_initial_style:
|
||||||
@@ -356,7 +356,7 @@ class TagStyleConverter:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def wrap_span_in_p_to_save_style_attrs(tag):
|
def wrap_span_in_p_to_save_style_attrs(tag):
|
||||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||||
if attr not in ['text-align', 'text-indent']]
|
if attr not in ['text-align', 'text-indent', 'border-bottom']]
|
||||||
|
|
||||||
if tag.name == 'p' and tag.attrs.get('style'):
|
if tag.name == 'p' and tag.attrs.get('style'):
|
||||||
styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
|
styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
|
||||||
@@ -365,7 +365,7 @@ class TagStyleConverter:
|
|||||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
||||||
span_style = tag.attrs['style']
|
span_style = tag.attrs['style']
|
||||||
p_style = ''
|
p_style = ''
|
||||||
possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)')
|
possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)|(border-bottom:( *\w+);*)')
|
||||||
for i in range(span_style.count(';') + 1):
|
for i in range(span_style.count(';') + 1):
|
||||||
has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
|
has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
|
||||||
if has_p_style_attrs:
|
if has_p_style_attrs:
|
||||||
@@ -375,6 +375,9 @@ class TagStyleConverter:
|
|||||||
if has_p_style_attrs.group(3):
|
if has_p_style_attrs.group(3):
|
||||||
p_style += has_p_style_attrs.group(3)
|
p_style += has_p_style_attrs.group(3)
|
||||||
span_style = span_style.replace(has_p_style_attrs.group(3), '')
|
span_style = span_style.replace(has_p_style_attrs.group(3), '')
|
||||||
|
if has_p_style_attrs.group(5):
|
||||||
|
p_style += span_style
|
||||||
|
span_style = span_style.replace(span_style, '')
|
||||||
|
|
||||||
p_tag.attrs['style'] = p_style
|
p_tag.attrs['style'] = p_style
|
||||||
|
|
||||||
@@ -388,7 +391,7 @@ class TagStyleConverter:
|
|||||||
def add_span_to_save_style_attrs_in_li(t):
|
def add_span_to_save_style_attrs_in_li(t):
|
||||||
if t.name == 'li' and t.attrs.get('style'):
|
if t.name == 'li' and t.attrs.get('style'):
|
||||||
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
||||||
attr not in ['text-align', 'list-style-type']]
|
attr not in ['text-align', 'list-style-type', 'border-bottom']]
|
||||||
|
|
||||||
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li]
|
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li]
|
||||||
if any(check):
|
if any(check):
|
||||||
|
|||||||
@@ -144,7 +144,7 @@ def clean_headings_content(content: Tag, title: str):
|
|||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def _heading_tag2p_tag(body_tag):
|
def heading_tag_to_p_tag(body_tag):
|
||||||
"""
|
"""
|
||||||
Function to convert all lower level headings to p tags
|
Function to convert all lower level headings to p tags
|
||||||
"""
|
"""
|
||||||
@@ -267,7 +267,7 @@ def unwrap_structural_tags(body_tag):
|
|||||||
if not tag_.parent.attrs.get('class'):
|
if not tag_.parent.attrs.get('class'):
|
||||||
tag_.parent.attrs['class'] = tag_class
|
tag_.parent.attrs['class'] = tag_class
|
||||||
|
|
||||||
def _preserve_class_in_section_tag(tag_) -> bool:
|
def preserve_class_in_section_tag(tag_) -> bool:
|
||||||
# to save css style inherited from class, copy class to child <p>
|
# to save css style inherited from class, copy class to child <p>
|
||||||
# this is for Wiley books with boxes
|
# this is for Wiley books with boxes
|
||||||
# returns True, if <section> could be unwrapped
|
# returns True, if <section> could be unwrapped
|
||||||
@@ -288,10 +288,10 @@ def unwrap_structural_tags(body_tag):
|
|||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _add_table_to_abc_books(tag_, border, bg_color):
|
def add_table_to_abc_books(tag_, border, bg_color):
|
||||||
wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
|
wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
|
||||||
|
|
||||||
def _add_span_to_save_ids_for_links(tag_to_be_removed):
|
def add_span_to_save_ids_for_links(tag_to_be_removed):
|
||||||
if tag_to_be_removed.attrs.get('id'):
|
if tag_to_be_removed.attrs.get('id'):
|
||||||
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
|
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
|
||||||
id_=tag_to_be_removed.attrs['id'],
|
id_=tag_to_be_removed.attrs['id'],
|
||||||
@@ -311,17 +311,17 @@ def unwrap_structural_tags(body_tag):
|
|||||||
if div.attrs.get('class'):
|
if div.attrs.get('class'):
|
||||||
div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
|
div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
|
||||||
if div_class in ['C409', 'C409a']:
|
if div_class in ['C409', 'C409a']:
|
||||||
_add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9')
|
add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9')
|
||||||
|
|
||||||
elif div_class in ['C441', 'C816']:
|
elif div_class in ['C441', 'C816']:
|
||||||
_add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8')
|
add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8')
|
||||||
|
|
||||||
if div.attrs.get('style'):
|
if div.attrs.get('style'):
|
||||||
if 'background-color' in div.attrs['style']:
|
if 'background-color' in div.attrs['style']:
|
||||||
end_index = div.attrs['style'].find('background-color') + len('background-color')
|
end_index = div.attrs['style'].find('background-color') + len('background-color')
|
||||||
start_index_of_color = end_index + 2
|
start_index_of_color = end_index + 2
|
||||||
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color+7]
|
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color+7]
|
||||||
_add_table_to_abc_books(div, border='', bg_color=bg_color)
|
add_table_to_abc_books(div, border='', bg_color=bg_color)
|
||||||
|
|
||||||
if div.attrs.get('style') == '':
|
if div.attrs.get('style') == '':
|
||||||
del div.attrs['style']
|
del div.attrs['style']
|
||||||
@@ -331,19 +331,19 @@ def unwrap_structural_tags(body_tag):
|
|||||||
div.name = 'p'
|
div.name = 'p'
|
||||||
continue
|
continue
|
||||||
|
|
||||||
_add_span_to_save_ids_for_links(div)
|
add_span_to_save_ids_for_links(div)
|
||||||
div.unwrap()
|
div.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("section"):
|
for s in body_tag.find_all("section"):
|
||||||
could_be_unwrapped = True
|
could_be_unwrapped = True
|
||||||
if s.attrs.get('class'):
|
if s.attrs.get('class'):
|
||||||
could_be_unwrapped = _preserve_class_in_section_tag(s)
|
could_be_unwrapped = preserve_class_in_section_tag(s)
|
||||||
_add_span_to_save_ids_for_links(s)
|
add_span_to_save_ids_for_links(s)
|
||||||
if could_be_unwrapped:
|
if could_be_unwrapped:
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("article"):
|
for s in body_tag.find_all("article"):
|
||||||
_add_span_to_save_ids_for_links(s)
|
add_span_to_save_ids_for_links(s)
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("figure"):
|
for s in body_tag.find_all("figure"):
|
||||||
@@ -351,22 +351,22 @@ def unwrap_structural_tags(body_tag):
|
|||||||
s.attrs['style'] = "text-align: center;" # to center image inside this tag
|
s.attrs['style'] = "text-align: center;" # to center image inside this tag
|
||||||
|
|
||||||
for s in body_tag.find_all("figcaption"):
|
for s in body_tag.find_all("figcaption"):
|
||||||
_add_span_to_save_ids_for_links(s)
|
add_span_to_save_ids_for_links(s)
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("aside"):
|
for s in body_tag.find_all("aside"):
|
||||||
s.name = 'blockquote'
|
s.name = 'blockquote'
|
||||||
|
|
||||||
for s in body_tag.find_all("main"):
|
for s in body_tag.find_all("main"):
|
||||||
_add_span_to_save_ids_for_links(s)
|
add_span_to_save_ids_for_links(s)
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("body"):
|
for s in body_tag.find_all("body"):
|
||||||
_add_span_to_save_ids_for_links(s)
|
add_span_to_save_ids_for_links(s)
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("html"):
|
for s in body_tag.find_all("html"):
|
||||||
_add_span_to_save_ids_for_links(s)
|
add_span_to_save_ids_for_links(s)
|
||||||
s.unwrap()
|
s.unwrap()
|
||||||
|
|
||||||
for s in body_tag.find_all("header"):
|
for s in body_tag.find_all("header"):
|
||||||
@@ -385,7 +385,7 @@ def unwrap_structural_tags(body_tag):
|
|||||||
parents_marks_are_body = [x.parent == body_tag for x in marks]
|
parents_marks_are_body = [x.parent == body_tag for x in marks]
|
||||||
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
||||||
|
|
||||||
_heading_tag2p_tag(body_tag)
|
heading_tag_to_p_tag(body_tag)
|
||||||
|
|
||||||
# wrap NavigableString with <p>
|
# wrap NavigableString with <p>
|
||||||
for node in body_tag:
|
for node in body_tag:
|
||||||
|
|||||||
Reference in New Issue
Block a user