forked from LiveCarta/BookConverter
Wrote documentation for every func/class in .py
This commit is contained in:
@@ -10,6 +10,7 @@ from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
def save_image_locally(img_file_path, img_content, book_id):
|
||||
""" Function saves all images locally """
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(
|
||||
folder_path, f'../json/img_{book_id}/'))
|
||||
@@ -24,17 +25,19 @@ def save_image_locally(img_file_path, img_content, book_id):
|
||||
|
||||
|
||||
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Send one image through ``access`` and return the resulting link.

    Args:
        access: object exposing ``send_image``; it is called as
            ``send_image(img_file_path, doc_id=book_id, img_content=img_content)``.
        img_file_path: path of the image, forwarded as the first positional argument.
        img_content: raw bytes of the image.
        book_id: identifier of the book, forwarded as ``doc_id``.

    Returns:
        Whatever ``access.send_image`` returns (presumably the link/path of the
        uploaded image on Amazon Web Services -- confirm against ``Access``).
    """
    link_path = access.send_image(
        img_file_path, doc_id=book_id, img_content=img_content)
    return link_path
|
||||
|
||||
|
||||
def update_src_links_in_images(body_tag: Tag,
|
||||
href2img_content: dict,
|
||||
path_to_html,
|
||||
access=None,
|
||||
path2aws_path=None,
|
||||
book_id=None):
|
||||
def update_images_src_links(body_tag: Tag,
|
||||
href2img_content: dict,
|
||||
path_to_html,
|
||||
access=None,
|
||||
path2aws_path=None,
|
||||
book_id=None):
|
||||
""" Function makes dictionary image_src_path -> Amazon web service_path """
|
||||
img_tags = body_tag.find_all('img')
|
||||
|
||||
for img in img_tags:
|
||||
@@ -65,16 +68,16 @@ def update_src_links_in_images(body_tag: Tag,
|
||||
del img.attrs['height']
|
||||
if img.attrs.get('style'):
|
||||
del img.attrs['style']
|
||||
|
||||
return path2aws_path
|
||||
|
||||
|
||||
def preprocess_table(body_tag: BeautifulSoup):
|
||||
""" Function to preprocess tables and tags(td|th|tr): style """
|
||||
tables = body_tag.find_all("table")
|
||||
for table in tables:
|
||||
tds = table.find_all(re.compile("td|th|tr"))
|
||||
for td in tds:
|
||||
style = td.get('style')
|
||||
ts = table.find_all(re.compile("td|th|tr"))
|
||||
for t_tag in ts:
|
||||
style = t_tag.get('style')
|
||||
width = ''
|
||||
if style:
|
||||
width_match = re.search(
|
||||
@@ -84,13 +87,13 @@ def preprocess_table(body_tag: BeautifulSoup):
|
||||
units = width_match.group(2)
|
||||
width = size+'px'
|
||||
|
||||
td.attrs['width'] = td.get('width') or width
|
||||
t_tag.attrs['width'] = t_tag.get('width') or width
|
||||
|
||||
if td.attrs.get('style'):
|
||||
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
|
||||
if t_tag.attrs.get('style'):
|
||||
t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '')
|
||||
|
||||
if td.attrs.get('style') == '':
|
||||
del td.attrs['style']
|
||||
elif t_tag.attrs.get('style') == '':
|
||||
del t_tag.attrs['style']
|
||||
|
||||
if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
|
||||
table.attrs['border'] = '1'
|
||||
@@ -110,6 +113,7 @@ def process_lists(body_tag):
|
||||
|
||||
|
||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
""" Function inserts span before tag to be removed(aren't supported by livecarta) """
|
||||
new_tag = main_tag.new_tag("span")
|
||||
new_tag.attrs['id'] = id_ or ''
|
||||
new_tag.attrs['class'] = class_ or ''
|
||||
@@ -153,9 +157,7 @@ def clean_headings_content(content: Tag, title: str):
|
||||
|
||||
|
||||
def heading_tag_to_p_tag(body_tag):
|
||||
"""
|
||||
Function to convert all lower level headings to p tags
|
||||
"""
|
||||
""" Function to convert all lower level headings to p tags """
|
||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
header_tags = body_tag.find_all(re.compile(pattern))
|
||||
for tag in header_tags:
|
||||
@@ -163,17 +165,16 @@ def heading_tag_to_p_tag(body_tag):
|
||||
|
||||
|
||||
def clean_title_from_numbering(title: str) -> str:
    """Remove leading numbering from a title.

    Three things are stripped from the start of ``title``, in this order:
    leading whitespace, a run of numeric markers such as ``1.``, ``.2`` or
    ``1.2.3 ``, and a run of single-letter markers such as ``A. `` or ``b.``.

    Args:
        title: raw title text.

    Returns:
        The title without the leading whitespace and numbering; the rest of
        the string is left untouched.
    """
    # Leading whitespace first, so the numbering patterns can anchor at '^'.
    title = re.sub(r'^\s+', '', title)
    # Numeric numbering: "1. ", "1.2.3 ", ".5 " ...
    title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
    # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title)  # delete chapter numbering from the title
    # Single-letter numbering: "A. ", "b." ...
    title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
    return title
|
||||
|
||||
|
||||
def replace_with_livecarta_anchor_tag(anchor, i):
|
||||
""" Function replace noteref_tag(anchor) with new livecarta tag """
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||
new_tag['class'] = 'footnote-element'
|
||||
new_tag['data-id'] = i + 1
|
||||
@@ -188,11 +189,11 @@ def replace_with_livecarta_anchor_tag(anchor, i):
|
||||
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
|
||||
-> Tuple[list, list, list]:
|
||||
"""
|
||||
This function preprocessing footnotes
|
||||
This function should be earlier that adding fonts in pipeline.
|
||||
|
||||
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
||||
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
||||
|
||||
"""
|
||||
footnotes = []
|
||||
noterefs_tags = source_html_tag.find_all(
|
||||
@@ -205,12 +206,14 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
new_footnotes_tags = []
|
||||
[tag.decompose() for tag in bad_noterefs_tags]
|
||||
|
||||
def parse_a_tag_href(s: str) -> Tuple[str, str]:
    """Split the href of an ``<a>`` tag into a file name and an anchor id.

    Args:
        s: href value of the form ``<file>#<id>``; the file part may be empty.

    Returns:
        ``(file_name, anchor_id)`` -- the text before and after the first ``#``.

    Raises:
        AssertionError: if ``s`` contains no ``#``.
    """
    assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
    # Split on the first '#' only, so an id that itself contains '#'
    # does not break the two-value unpacking below.
    f, id_ = s.split('#', 1)
    return f, id_
|
||||
|
||||
def verify_footnote_tag(tags: list):
|
||||
""" Function verifies is tag - footnote """
|
||||
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
|
||||
if len(tags) == 0:
|
||||
anchored_tags = list(target_html_tag.find_all(id=element_id))
|
||||
@@ -275,7 +278,7 @@ def unwrap_structural_tags(body_tag):
|
||||
"""
|
||||
|
||||
def _preserve_class_in_aside_tag(tag_):
|
||||
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
|
||||
""" to save css style inherited from class, copy class to aside tag (which is parent to tag_) """
|
||||
# this is for Wiley books with boxes
|
||||
tag_class = tag_.attrs['class'] if not isinstance(
|
||||
tag_.attrs['class'], list) else tag_.attrs['class'][0]
|
||||
@@ -284,10 +287,11 @@ def unwrap_structural_tags(body_tag):
|
||||
tag_.parent.attrs['class'] = tag_class
|
||||
|
||||
def preserve_class_in_section_tag(tag_) -> bool:
|
||||
# to save css style inherited from class, copy class to child <p>
|
||||
"""
|
||||
to save css style inherited from class, copy class to child <p>
|
||||
returns True, if <section> could be unwrapped
|
||||
"""
|
||||
# this is for Wiley books with boxes
|
||||
# returns True, if <section> could be unwrapped
|
||||
|
||||
tag_class = tag_.attrs['class'] if not isinstance(
|
||||
tag_.attrs['class'], list) else tag_.attrs['class'][0]
|
||||
if 'feature' not in tag_class:
|
||||
@@ -312,6 +316,10 @@ def unwrap_structural_tags(body_tag):
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
|
||||
def replace_div_tag_with_table():
|
||||
"""Function replace <div> with <table>:
|
||||
1. Convert div with certain classes to tables
|
||||
2. Add background color to div with background-color
|
||||
"""
|
||||
for div in body_tag.find_all("div"):
|
||||
if div.attrs.get('class'):
|
||||
div_class = div.attrs['class'] if not isinstance(
|
||||
@@ -348,12 +356,12 @@ def unwrap_structural_tags(body_tag):
|
||||
continue
|
||||
add_span_to_save_ids_for_links(div)
|
||||
div.unwrap()
|
||||
|
||||
# comments removal
|
||||
for tag in body_tag.find_all():
|
||||
for element in tag(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
|
||||
|
||||
replace_div_tag_with_table()
|
||||
|
||||
for s in body_tag.find_all("section"):
|
||||
@@ -458,23 +466,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
|
||||
return tags
|
||||
|
||||
|
||||
def wrap_preformatted_span_with_table(main_tag, old_tag):
    """Wrap ``old_tag`` in a one-cell table and return the new ``<table>`` tag.

    The resulting nesting is ``table > tbody > tr > td > old_tag``.

    Args:
        main_tag: object providing ``new_tag(name)``, used to create the
            wrapper tags (presumably a BeautifulSoup document -- confirm
            against the callers).
        old_tag: tag that ends up inside the single ``<td>``; it must provide
            ``wrap``.

    Returns:
        The outermost ``<table>`` tag.
    """
    table = main_tag.new_tag("table")
    table.attrs['border'] = '1px #ccc;'
    table.attrs['style'] = 'width:100%;'
    tbody = main_tag.new_tag("tbody")
    tr = main_tag.new_tag("tr")
    td = main_tag.new_tag("td")
    td.attrs['bgcolor'] = '#f5f5f5'
    # td.attrs['border-radius'] = '4px'
    # Wrap from the inside out: each call puts the previous wrapper
    # inside the next one.
    old_tag.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    return table
|
||||
|
||||
|
||||
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
|
||||
""" Function wraps <block> with <table> """
|
||||
table = main_tag.new_tag("table")
|
||||
table.attrs['border'] = border
|
||||
table.attrs['align'] = 'center'
|
||||
@@ -497,7 +490,6 @@ def clean_wiley_block(block):
|
||||
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
|
||||
for hr in hrs:
|
||||
hr.extract()
|
||||
print(hr)
|
||||
h = block.find(re.compile("h[1-9]"))
|
||||
if h:
|
||||
h.name = "p"
|
||||
@@ -505,6 +497,7 @@ def clean_wiley_block(block):
|
||||
|
||||
|
||||
def preprocess_block_tags(chapter_tag):
|
||||
""" Function preprocessing <block> tags """
|
||||
for block in chapter_tag.find_all("blockquote"):
|
||||
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
||||
clean_wiley_block(block)
|
||||
@@ -527,7 +520,7 @@ def preprocess_block_tags(chapter_tag):
|
||||
|
||||
|
||||
def prepare_formatted(text):
|
||||
# replace <,> to save them as is in html code
|
||||
""" Function replaces special symbols with their Unicode representation """
|
||||
text = text.replace("<", "\x3C")
|
||||
text = text.replace(">", "\x3E")
|
||||
text = text.replace('\t', "\xa0 \xa0 ") #
|
||||
@@ -536,7 +529,25 @@ def prepare_formatted(text):
|
||||
return text
|
||||
|
||||
|
||||
def wrap_preformatted_span_with_table(main_tag, old_tag):
    """Wrap ``old_tag`` in a single-cell ``<table>`` and return that table.

    The wrappers are created with ``main_tag.new_tag`` and applied from the
    inside out, giving ``table > tbody > tr > td > old_tag``.

    Args:
        main_tag: object providing ``new_tag(name)`` (presumably a
            BeautifulSoup document -- confirm against the callers).
        old_tag: tag to be placed inside the table cell; it must provide
            ``wrap``.

    Returns:
        The newly created outermost ``<table>`` tag.
    """
    table = main_tag.new_tag("table")
    table.attrs['border'] = '1px #ccc;'
    table.attrs['style'] = 'width:100%;'
    tbody = main_tag.new_tag("tbody")
    tr = main_tag.new_tag("tr")
    td = main_tag.new_tag("td")
    td.attrs['bgcolor'] = '#f5f5f5'
    # td.attrs['border-radius'] = '4px'

    # Innermost wrapper first: every step wraps the current element in the
    # next outer tag and then continues from that outer tag.
    inner = old_tag
    for outer in (td, tr, tbody, table):
        inner.wrap(outer)
        inner = outer
    return table
|
||||
|
||||
|
||||
def preprocess_pre_tags(chapter_tag):
|
||||
""" Function preprocessing <pre> tags """
|
||||
for pre in chapter_tag.find_all("pre"):
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
||||
new_tag.attrs = pre.attrs.copy()
|
||||
@@ -575,7 +586,7 @@ def preprocess_pre_tags(chapter_tag):
|
||||
|
||||
|
||||
def preprocess_code_tags(chapter_tag):
|
||||
# function that emulates style of <code>, <kdb>, <var>
|
||||
""" Function that emulates style of <code>, <kdb>, <var> """
|
||||
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
|
||||
code.name = 'span'
|
||||
if code.parent.name == "pre":
|
||||
@@ -584,9 +595,7 @@ def preprocess_code_tags(chapter_tag):
|
||||
|
||||
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
"""
|
||||
Final processing/cleaning function.
|
||||
"""
|
||||
""" Function finalise processing/cleaning title """
|
||||
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||
@@ -596,7 +605,11 @@ def prepare_title(title_of_chapter: str) -> str:
|
||||
|
||||
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
||||
"""
|
||||
Final processing/cleaning function.
|
||||
Function finalise processing/cleaning content
|
||||
1. cleaning \n
|
||||
2. heading removal
|
||||
3. processing tags
|
||||
4. class removal
|
||||
"""
|
||||
# 0. cleaning \n
|
||||
to_remove = []
|
||||
@@ -609,13 +622,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
# 1. heading removal
|
||||
if remove_title_from_chapter:
|
||||
clean_headings_content(content_tag, title_str)
|
||||
|
||||
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||
process_lists(content_tag)
|
||||
preprocess_table(content_tag)
|
||||
preprocess_code_tags(content_tag)
|
||||
preprocess_pre_tags(content_tag)
|
||||
preprocess_block_tags(content_tag)
|
||||
|
||||
# 2. class removal
|
||||
# 3. class removal
|
||||
for tag in content_tag.find_all(recursive=True):
|
||||
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
|
||||
'footnote-element']):
|
||||
|
||||
Reference in New Issue
Block a user