@@ -9,178 +9,6 @@ from src.access import Access
from src . livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """
    Function saves an image locally

    The image is written to ``../json/img_<book_id>/`` resolved against the
    grandparent directory of this module; only the base name of
    ``img_file_path`` is kept for the new file.

    Parameters
    ----------
    img_file_path: str
        path of the image, only its base name is used
    img_content: bytes
        raw bytes of the image
    book_id: str
        identifier used to build the name of the target folder

    Returns
    -------
    pathlib.Path
        path of the written image

    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f'../json/img_{book_id}/'))
    # parents=True: a missing parent folder must not abort the conversion
    new_path.mkdir(parents=True, exist_ok=True)

    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager closes the handle even when write() raises
    with open(new_img_path, 'wb+') as img_file:
        img_file.write(img_content)

    return new_img_path
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """
    Function sends one image through ``access.send_image``

    Returns
    -------
    whatever ``access.send_image`` returns for this image

    """
    return access.send_image(
        img_file_path,
        img_content=img_content,
        doc_id=book_id,
    )
def update_images_src_links(body_tag: BeautifulSoup,
                            href2img_content: dict,
                            path_to_html: str,
                            access=None,
                            path2aws_path: dict = None,
                            book_id: str = None) -> dict:
    """
    Function rewrites ``src`` of every <img> and fills the dictionary
    image_path_from_root -> uploaded link

    Parameters
    ----------
    body_tag: Tag, soup object
        tree whose <img> tags are updated in place
    href2img_content: dict
        image path from the book root -> image bytes
    path_to_html: str
        path of the html file the images are referenced from
    access: optional
        when given, images are uploaded with save_image_to_aws,
        otherwise they are stored with save_image_locally
    path2aws_path: dict, optional
        cache of already uploaded images, updated in place
    book_id: str, optional
        identifier of the book the images belong to

    Returns
    -------
    dict
        path2aws_path (a new dict when none was passed)

    Raises
    ------
    AssertionError
        if an image referenced from the html is missing in href2img_content

    """
    # a missing cache used to crash on the ``in`` test below
    if path2aws_path is None:
        path2aws_path = {}

    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace('\\', '/')

        # explicit raise: a bare ``assert`` is stripped under ``python -O``
        if path_to_img_from_root not in href2img_content:
            raise AssertionError(
                f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.')

        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                new_folder = save_image_to_aws(
                    access, path_to_img_from_root, img_content, book_id)
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            # the literal 'book_id' used to be passed here, so every book
            # shared one folder; it is kept only as the no-id fallback
            local_book_id = 'book_id' if book_id is None else book_id
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, local_book_id)

        img.attrs['src'] = str(new_folder)
        # non-empty presentational attributes are dropped
        for attr_name in ('width', 'height', 'style'):
            if img.attrs.get(attr_name):
                del img.attrs[attr_name]
    return path2aws_path
def _preprocess_table ( body_tag : BeautifulSoup ) :
""" Function to preprocess tables and tags(td|th|tr): style """
tables = body_tag . find_all ( " table " )
for table in tables :
t_tags = table . find_all ( re . compile ( " td|th|tr " ) )
for t_tag in t_tags :
style = t_tag . get ( ' style ' )
width = ' '
if style :
width_match = re . search (
r " [^-]width: ?( \ d+ \ .? \ d*)(p[tx]) " , style )
if width_match :
size = width_match . group ( 1 )
width = size + ' px '
t_tag . attrs [ ' width ' ] = t_tag . get ( ' width ' ) or width
if t_tag . attrs . get ( ' style ' ) :
t_tag . attrs [ ' style ' ] = t_tag . attrs [ ' style ' ] . replace (
' border:0; ' , ' ' )
elif t_tag . attrs . get ( ' style ' ) == ' ' :
del t_tag . attrs [ ' style ' ]
if not table . attrs . get ( ' border ' ) or table . attrs . get ( ' border ' ) in [ ' 0 ' , ' 0px ' ] :
table . attrs [ ' border ' ] = ' 1 '
def _process_lists ( body_tag : BeautifulSoup ) :
"""
Function
- process tags <li>.
- unwrap <p> tags.
Parameters
----------
body_tag: Tag, soup object
Returns
-------
None
"""
li_tags = body_tag . find_all ( " li " )
for li_tag in li_tags :
if li_tag . p :
li_tag . attrs . update ( li_tag . p . attrs )
li_tag . p . unwrap ( )
def _insert_span_with_attrs_before_tag ( main_tag , tag , id_ , class_ ) :
""" Function inserts span before tag aren ' t supported by livecarta """
new_tag = main_tag . new_tag ( " span " )
new_tag . attrs [ ' id ' ] = id_ or ' '
new_tag . attrs [ ' class ' ] = class_ or ' '
new_tag . string = " \xa0 "
tag . insert_before ( new_tag )
def _clean_headings_content ( content : BeautifulSoup , title : str ) :
"""
Function looks for the chapter title among the children of content and extracts the matching child.
A child is extracted when its normalised, lower-cased text equals the lower-cased title,
or when it is an h1/h2/h3 whose text contains the title.
Before extraction the ids (and classes) of the child and of its descendants are kept
on <span> tags inserted through _insert_span_with_attrs_before_tag.
NOTE(review): indentation is not visible in this view, so the exact nesting
(the inner `for sub_tag` loop and the final `break`) must be confirmed against the formatted file.
"""
# helper: keeps ids of the tag that is about to be removed, so links pointing at them still have a target
def add_span_to_save_ids_for_links ( tag_to_be_removed : Tag , body_tag : BeautifulSoup ) :
# NOTE(review): called with `child`, which may be a NavigableString - confirm that `.attrs` is available in that case
if tag_to_be_removed . attrs . get ( ' id ' ) :
_insert_span_with_attrs_before_tag ( body_tag ,
tag_to_be_removed ,
id_ = tag_to_be_removed . attrs . get (
' id ' ) ,
class_ = tag_to_be_removed . attrs . get ( ' class ' ) )
# ids of descendants are saved too; every span is inserted before the tag being removed
for sub_tag in tag_to_be_removed . find_all ( ) :
if sub_tag . attrs . get ( ' id ' ) :
_insert_span_with_attrs_before_tag ( body_tag ,
tag_to_be_removed ,
id_ = sub_tag . attrs [ ' id ' ] ,
class_ = sub_tag . attrs . get ( ' class ' ) )
# the comparison with the title is case-insensitive
title = title . lower ( )
for child in content . contents :
# a NavigableString is its own text, a tag contributes its .text
if isinstance ( child , NavigableString ) :
text = child
else :
text = child . text
# children that hold only newline/tab/nbsp characters are not compared with the title
if text and re . sub ( r ' ([ \ n \ t \ xa0]) ' , ' ' , text ) :
# normalise whitespace before comparing
text = re . sub ( r ' ([ \ n \ t \ xa0]) ' , ' ' , text )
text = re . sub ( r ' + ' , ' ' , text ) . strip ( )
text = text . lower ( )
if title == text :
add_span_to_save_ids_for_links ( child , content )
child . extract ( )
elif ( title in text ) and ( child . name in [ ' h1 ' , ' h2 ' , ' h3 ' ] ) :
add_span_to_save_ids_for_links ( child , content )
child . extract ( )
# NOTE(review): presumably stops after the first child with real text - nesting level to be confirmed
break
def _heading_tag_to_p_tag(body_tag):
    """
    Function to convert all lower level headings to p tags

    Every tag named h<N>, with N from LiveCartaConfig.SUPPORTED_LEVELS + 1
    up to 9, is renamed to <p> in place.
    """
    first_unsupported_level = LiveCartaConfig.SUPPORTED_LEVELS + 1
    heading_regex = re.compile(f'^h[{first_unsupported_level}-9]$')
    for heading in body_tag.find_all(heading_regex):
        heading.name = 'p'
def _clean_title_from_numbering ( title : str ) :
""" Function removes numbering from titles """
title = re . sub ( r ' ^( \ s+)+ ' , ' ' , title )
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
def _replace_with_livecarta_anchor_tag ( anchor , i ) :
""" Function replace noteref_tag(anchor) with new livecarta tag """
new_tag = BeautifulSoup ( features = ' lxml ' ) . new_tag ( ' sup ' )
@@ -381,6 +209,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
_add_span_to_save_ids_for_links ( div )
div . unwrap ( )
def _heading_tag_to_p_tag(body_tag):
    """
    Function to convert all lower level headings to p tags

    Every tag named h<N>, with N from LiveCartaConfig.SUPPORTED_LEVELS + 1
    up to 9, is renamed to <p> in place.
    """
    first_unsupported_level = LiveCartaConfig.SUPPORTED_LEVELS + 1
    heading_regex = re.compile(f'^h[{first_unsupported_level}-9]$')
    for heading in body_tag.find_all(heading_regex):
        heading.name = 'p'
# comments removal
for tag in body_tag . find_all ( ) :
for element in tag ( text = lambda text : isinstance ( text , Comment ) ) :
@@ -497,6 +332,248 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
return tags
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """
    Function sends one image through ``access.send_image``

    Returns
    -------
    whatever ``access.send_image`` returns for this image

    """
    return access.send_image(
        img_file_path,
        img_content=img_content,
        doc_id=book_id,
    )
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """
    Function saves an image locally

    The image is written to ``../json/img_<book_id>/`` resolved against the
    grandparent directory of this module; only the base name of
    ``img_file_path`` is kept for the new file.

    Parameters
    ----------
    img_file_path: str
        path of the image, only its base name is used
    img_content: bytes
        raw bytes of the image
    book_id: str
        identifier used to build the name of the target folder

    Returns
    -------
    pathlib.Path
        path of the written image

    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f'../json/img_{book_id}/'))
    # parents=True: a missing parent folder must not abort the conversion
    new_path.mkdir(parents=True, exist_ok=True)

    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager closes the handle even when write() raises
    with open(new_img_path, 'wb+') as img_file:
        img_file.write(img_content)

    return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
                            href2img_content: dict,
                            path_to_html: str,
                            access=None,
                            path2aws_path: dict = None,
                            book_id: str = None) -> dict:
    """
    Function rewrites ``src`` of every <img> and fills the dictionary
    image_path_from_root -> uploaded link

    Parameters
    ----------
    body_tag: Tag, soup object
        tree whose <img> tags are updated in place
    href2img_content: dict
        image path from the book root -> image bytes
    path_to_html: str
        path of the html file the images are referenced from
    access: optional
        when given, images are uploaded with save_image_to_aws,
        otherwise they are stored with save_image_locally
    path2aws_path: dict, optional
        cache of already uploaded images, updated in place
    book_id: str, optional
        identifier of the book the images belong to

    Returns
    -------
    dict
        path2aws_path (a new dict when none was passed)

    Raises
    ------
    AssertionError
        if an image referenced from the html is missing in href2img_content

    """
    # a missing cache used to crash on the ``in`` test below
    if path2aws_path is None:
        path2aws_path = {}

    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace('\\', '/')

        # explicit raise: a bare ``assert`` is stripped under ``python -O``
        if path_to_img_from_root not in href2img_content:
            raise AssertionError(
                f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.')

        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                new_folder = save_image_to_aws(
                    access, path_to_img_from_root, img_content, book_id)
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            # the literal 'book_id' used to be passed here, so every book
            # shared one folder; it is kept only as the no-id fallback
            local_book_id = 'book_id' if book_id is None else book_id
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, local_book_id)

        img.attrs['src'] = str(new_folder)
        # non-empty presentational attributes are dropped
        for attr_name in ('width', 'height', 'style'):
            if img.attrs.get(attr_name):
                del img.attrs[attr_name]
    return path2aws_path
def _clean_title_from_numbering ( title : str ) :
""" Function removes numbering from titles """
title = re . sub ( r ' ^( \ s+)+ ' , ' ' , title )
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title

    Steps
    ----------
    1. Parse the title markup and take its text
    2. Replace newline, tab and non-breaking space characters with a space
    3. Collapse runs of spaces and drop trailing whitespace
    4. Pass the result through _clean_title_from_numbering

    Parameters
    ----------
    title_of_chapter: str
        raw title, may contain markup

    Returns
    -------
    str
        cleaned title

    """
    title_soup = BeautifulSoup(title_of_chapter, features='lxml')
    title_str = title_soup.string
    if title_str is None:
        # .string is None for an empty title or one made of several nodes
        # (e.g. 'Part <i>One</i>'); that used to crash re.sub below
        title_str = title_soup.get_text()
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    title_str = _clean_title_from_numbering(title_str)
    return title_str
def _insert_span_with_attrs_before_tag ( main_tag , tag , id_ , class_ ) :
""" Function inserts span before tag aren ' t supported by livecarta """
new_tag = main_tag . new_tag ( " span " )
new_tag . attrs [ ' id ' ] = id_ or ' '
new_tag . attrs [ ' class ' ] = class_ or ' '
new_tag . string = " \xa0 "
tag . insert_before ( new_tag )
def _clean_headings_content ( content : BeautifulSoup , title : str ) :
"""
Function looks for the chapter title among the children of content and extracts the matching child.
A child is extracted when its normalised, lower-cased text equals the lower-cased title,
or when it is an h1/h2/h3 whose text contains the title.
Before extraction the ids (and classes) of the child and of its descendants are kept
on <span> tags inserted through _insert_span_with_attrs_before_tag.
NOTE(review): indentation is not visible in this view, so the exact nesting
(the inner `for sub_tag` loop and the final `break`) must be confirmed against the formatted file.
"""
# helper: keeps ids of the tag that is about to be removed, so links pointing at them still have a target
def add_span_to_save_ids_for_links ( tag_to_be_removed : Tag , body_tag : BeautifulSoup ) :
# NOTE(review): called with `child`, which may be a NavigableString - confirm that `.attrs` is available in that case
if tag_to_be_removed . attrs . get ( ' id ' ) :
_insert_span_with_attrs_before_tag ( body_tag ,
tag_to_be_removed ,
id_ = tag_to_be_removed . attrs . get (
' id ' ) ,
class_ = tag_to_be_removed . attrs . get ( ' class ' ) )
# ids of descendants are saved too; every span is inserted before the tag being removed
for sub_tag in tag_to_be_removed . find_all ( ) :
if sub_tag . attrs . get ( ' id ' ) :
_insert_span_with_attrs_before_tag ( body_tag ,
tag_to_be_removed ,
id_ = sub_tag . attrs [ ' id ' ] ,
class_ = sub_tag . attrs . get ( ' class ' ) )
# the comparison with the title is case-insensitive
title = title . lower ( )
for child in content . contents :
# a NavigableString is its own text, a tag contributes its .text
if isinstance ( child , NavigableString ) :
text = child
else :
text = child . text
# children that hold only newline/tab/nbsp characters are not compared with the title
if text and re . sub ( r ' ([ \ n \ t \ xa0]) ' , ' ' , text ) :
# normalise whitespace before comparing
text = re . sub ( r ' ([ \ n \ t \ xa0]) ' , ' ' , text )
text = re . sub ( r ' + ' , ' ' , text ) . strip ( )
text = text . lower ( )
if title == text :
add_span_to_save_ids_for_links ( child , content )
child . extract ( )
elif ( title in text ) and ( child . name in [ ' h1 ' , ' h2 ' , ' h3 ' ] ) :
add_span_to_save_ids_for_links ( child , content )
child . extract ( )
# NOTE(review): presumably stops after the first child with real text - nesting level to be confirmed
break
def _process_lists ( body_tag : BeautifulSoup ) :
"""
Function
- process tags <li>.
- unwrap <p> tags.
Parameters
----------
body_tag: Tag, soup object
Returns
-------
None
"""
li_tags = body_tag . find_all ( " li " )
for li_tag in li_tags :
if li_tag . p :
li_tag . attrs . update ( li_tag . p . attrs )
li_tag . p . unwrap ( )
def _preprocess_table ( body_tag : BeautifulSoup ) :
""" Function to preprocess tables and tags(td|th|tr): style """
tables = body_tag . find_all ( " table " )
for table in tables :
t_tags = table . find_all ( re . compile ( " td|th|tr " ) )
for t_tag in t_tags :
style = t_tag . get ( ' style ' )
width = ' '
if style :
width_match = re . search (
r " [^-]width: ?( \ d+ \ .? \ d*)(p[tx]) " , style )
if width_match :
size = width_match . group ( 1 )
width = size + ' px '
t_tag . attrs [ ' width ' ] = t_tag . get ( ' width ' ) or width
if t_tag . attrs . get ( ' style ' ) :
t_tag . attrs [ ' style ' ] = t_tag . attrs [ ' style ' ] . replace (
' border:0; ' , ' ' )
elif t_tag . attrs . get ( ' style ' ) == ' ' :
del t_tag . attrs [ ' style ' ]
if not table . attrs . get ( ' border ' ) or table . attrs . get ( ' border ' ) in [ ' 0 ' , ' 0px ' ] :
table . attrs [ ' border ' ] = ' 1 '
def _preprocess_code_tags ( chapter_tag : BeautifulSoup ) :
"""
Function
- transform <code>, <kdb>, <var> tags into span
- add code style to this tags
Parameters
----------
chapter_tag: Tag, soup object
Returns
-------
None
"""
for code in chapter_tag . find_all ( re . compile ( " code|kbd|var " ) ) :
if not code . parent . name == " pre " :
code . name = " span "
continue
# if tag isn't in pre and doesn't have style
if not code . attrs . get ( ' style ' ) :
code . attrs [ ' style ' ] = ' font-size: 14px; font-family: courier new,courier,monospace; '
def _prepare_formatted ( text : str ) - > str :
""" Function replaces special symbols with their Unicode representation """
# The replacements run in the order written, so spaces introduced by an earlier step are seen by the later ones.
# NOTE(review): \x3C and \x3E are the escapes of "<" and ">" themselves, so the first two
# replacements look like no-ops - confirm whether HTML entities were intended.
text = text . replace ( " < " , " \x3C " )
text = text . replace ( " > " , " \x3E " )
# a tab is replaced with non-breaking spaces (exact literal to be confirmed, spacing is not reliable in this view)
text = text . replace ( ' \t ' , " \xa0 \xa0 " ) #
# NOTE(review): presumably turns ordinary spaces into non-breaking spaces - verify the search literal
text = text . replace ( ' ' , " \xa0 " )
# NOTE(review): F0 9D 91 93 is the UTF-8 byte sequence of the same character written as four
# separate code points - confirm this is the intended output
text = text . replace ( ' 𝑓 ' , " \xf0 \x9d \x91 \x93 " )
return text
def _preprocess_pre_tags ( chapter_tag : BeautifulSoup ) :
"""
Function preprocessing <pre> tags
Wrap string of the tag with <code> if it ' s necessary
Parameters
----------
chapter_tag: Tag, soup object
Returns
----------
None
Modified chapter tag
"""
for pre in chapter_tag . find_all ( " pre " ) :
if pre . find_all ( " code|kbd|var " ) :
continue
else :
code = chapter_tag . new_tag ( " code " )
# insert all items that was in pre to code and remove from pre
for content in reversed ( pre . contents ) :
code . insert ( 0 , content . extract ( ) )
# wrap code with items
pre . append ( code )
def _clean_wiley_block(block):
    """
    Function cleans a block in place
    - extracts every <p> whose class matches the pattern ".+ hr"
    - renames the first h1-h9 heading found to <p> and inserts a <br>
      before it
    """
    hr_class = re.compile(".+ hr")
    for rule in block.find_all("p", attrs={"class": hr_class}):
        rule.extract()

    heading = block.find(re.compile("h[1-9]"))
    if heading:
        heading.name = "p"
        line_break = BeautifulSoup(features='lxml').new_tag("br")
        heading.insert_before(line_break)
def _wrap_block_tag_with_table ( main_tag , old_tag , width = ' 95 ' , border = ' 1px ' , bg_color = None ) :
""" Function wraps <block> with <table> """
table = main_tag . new_tag ( " table " )
@@ -517,16 +594,6 @@ def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_c
return table
def _clean_wiley_block(block):
    """
    Function cleans a block in place
    - extracts every <p> whose class matches the pattern ".+ hr"
    - renames the first h1-h9 heading found to <p> and inserts a <br>
      before it
    """
    hr_class = re.compile(".+ hr")
    for rule in block.find_all("p", attrs={"class": hr_class}):
        rule.extract()

    heading = block.find(re.compile("h[1-9]"))
    if heading:
        heading.name = "p"
        line_break = BeautifulSoup(features='lxml').new_tag("br")
        heading.insert_before(line_break)
def _preprocess_block_tags ( chapter_tag : Tag ) :
""" Function preprocessing <block> tags """
for block in chapter_tag . find_all ( " blockquote " , attrs = { " class " : re . compile ( " feature[1234] " ) } ) :
@@ -548,114 +615,6 @@ def _preprocess_block_tags(chapter_tag: Tag):
_wrap_block_tag_with_table ( chapter_tag , future_block , bg_color = color )
def _prepare_formatted ( text : str ) - > str :
""" Function replaces special symbols with their Unicode representation """
# The replacements run in the order written, so spaces introduced by an earlier step are seen by the later ones.
# NOTE(review): \x3C and \x3E are the escapes of "<" and ">" themselves, so the first two
# replacements look like no-ops - confirm whether HTML entities were intended.
text = text . replace ( " < " , " \x3C " )
text = text . replace ( " > " , " \x3E " )
# a tab is replaced with non-breaking spaces (exact literal to be confirmed, spacing is not reliable in this view)
text = text . replace ( ' \t ' , " \xa0 \xa0 " ) #
# NOTE(review): presumably turns ordinary spaces into non-breaking spaces - verify the search literal
text = text . replace ( ' ' , " \xa0 " )
# NOTE(review): F0 9D 91 93 is the UTF-8 byte sequence of the same character written as four
# separate code points - confirm this is the intended output
text = text . replace ( ' 𝑓 ' , " \xf0 \x9d \x91 \x93 " )
return text
def _wrap_preformatted_span_with_table ( chapter_tag : Tag , span_tag : Tag ) - > Tag :
""" Function wraps <span> with <table> """
table , tbody , tr , td = chapter_tag . new_tag ( " table " ) , chapter_tag . new_tag (
" tbody " ) , chapter_tag . new_tag ( " tr " ) , chapter_tag . new_tag ( " td " )
table . attrs [ ' border ' ] , table . attrs [ ' style ' ] = ' 1px #ccc; ' , ' width:100 % ; '
td . attrs [ ' bgcolor ' ] = ' #f5f5f5 '
# td.attrs['border-radius'] = '4px'
span_tag . wrap ( td )
td . wrap ( tr )
tr . wrap ( tbody )
tbody . wrap ( table )
return table
def _preprocess_pre_tags ( chapter_tag : BeautifulSoup ) :
"""
Function preprocessing <pre> tags
Parameters
----------
chapter_tag: Tag, soup object
Steps
----------
1. Process NavigableString
2. Process Tags and their children
"""
# NOTE(review): indentation is not visible in this view; the nesting described in the comments
# below is the most plausible reading and should be confirmed against the formatted file.
for pre in chapter_tag . find_all ( " pre " ) :
# every <pre> is replaced by a <span> that copies its attributes and gets a fixed monospace style
new_tag = BeautifulSoup ( features = ' lxml ' ) . new_tag ( " span " )
new_tag . attrs = pre . attrs . copy ( )
new_tag . attrs [ ' style ' ] = " font-family: courier new,courier,monospace; " \
" font-size: 14px; white-space: nowrap; "
# if in <pre> there are multiple <span>, we need to add <br> after each content
to_add_br = len ( pre . find_all ( " span " ) ) > 1
# a snapshot of the children is iterated because tags are extracted from <pre> inside the loop
copy_contents = pre . contents [ : ]
for child in copy_contents :
# Navigable String
if isinstance ( child , NavigableString ) :
cleaned_text = _prepare_formatted ( str ( child ) )
# split on line breaks: every piece but the last is followed by a <br>
sub_strings = re . split ( ' \r \n | \n | \r ' , cleaned_text )
for string in sub_strings [ : - 1 ] :
new_tag . append ( NavigableString ( string ) )
new_tag . append ( BeautifulSoup (
features = ' lxml ' ) . new_tag ( ' br ' ) )
new_tag . append ( NavigableString ( sub_strings [ - 1 ] ) )
# Tag
else :
# direct children of the tag are cleaned with _prepare_formatted:
# strings are replaced, nested tags get their text re-assigned through .string
for sub_child in child . children :
if isinstance ( sub_child , NavigableString ) :
cleaned_text = _prepare_formatted ( str ( sub_child ) )
sub_child . replace_with ( NavigableString ( cleaned_text ) )
else :
sub_child . string = _prepare_formatted ( sub_child . text )
# the cleaned tag is moved from <pre> into the new <span>
cleaned_tag = child . extract ( )
new_tag . append ( cleaned_tag )
if to_add_br :
new_tag . append ( BeautifulSoup (
features = ' lxml ' ) . new_tag ( ' br ' ) )
# the <span> takes the place of <pre> and is then wrapped with a table
pre . replace_with ( new_tag )
table = _wrap_preformatted_span_with_table ( chapter_tag , new_tag )
# add <p> to save brs
p_for_br = chapter_tag . new_tag ( " p " )
p_for_br . string = " \xa0 "
table . insert_after ( p_for_br )
def _preprocess_code_tags ( chapter_tag : BeautifulSoup ) :
"""
Function
- transform <code>, <kdb>, <var> tags into span
- add code style to this tags
Parameters
----------
chapter_tag: Tag, soup object
Returns
-------
None
"""
for code in chapter_tag . find_all ( re . compile ( " code|kbd|var " ) ) :
code . name = " span "
if code . parent . name == " pre " :
continue
# if tags aren't in pre and don't have style
if not code . attrs . get ( ' style ' ) :
code . attrs [ ' style ' ] = ' font-size: 14px; font-family: courier new,courier,monospace; '
def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title

    Steps
    ----------
    1. Parse the title markup and take its text
    2. Replace newline, tab and non-breaking space characters with a space
    3. Collapse runs of spaces and drop trailing whitespace
    4. Pass the result through _clean_title_from_numbering

    Parameters
    ----------
    title_of_chapter: str
        raw title, may contain markup

    Returns
    -------
    str
        cleaned title

    """
    title_soup = BeautifulSoup(title_of_chapter, features='lxml')
    title_str = title_soup.string
    if title_str is None:
        # .string is None for an empty title or one made of several nodes
        # (e.g. 'Part <i>One</i>'); that used to crash re.sub below
        title_str = title_soup.get_text()
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    title_str = _clean_title_from_numbering(title_str)
    return title_str
def prepare_content ( title_str : str , content_tag : BeautifulSoup , remove_title_from_chapter : bool ) - > str :
"""
Function finalise processing/cleaning content