forked from LiveCarta/BookConverter
Function annotations
This commit is contained in:
@@ -9,7 +9,7 @@ from src.access import Access
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
def save_image_locally(img_file_path, img_content, book_id):
|
||||
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
|
||||
"""Function saves all images locally"""
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(
|
||||
@@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id):
|
||||
return new_img_path
|
||||
|
||||
|
||||
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
|
||||
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
|
||||
"""Function saves all images to Amazon web service"""
|
||||
link_path = access.send_image(
|
||||
img_file_path, doc_id=book_id, img_content=img_content)
|
||||
return link_path
|
||||
|
||||
|
||||
def update_images_src_links(body_tag: Tag,
|
||||
def update_images_src_links(body_tag: BeautifulSoup,
|
||||
href2img_content: dict,
|
||||
path_to_html,
|
||||
path_to_html: str,
|
||||
access=None,
|
||||
path2aws_path=None,
|
||||
book_id=None):
|
||||
path2aws_path: dict = None,
|
||||
book_id: str = None) -> dict:
|
||||
"""Function makes dictionary image_src_path -> Amazon web service_path"""
|
||||
img_tags = body_tag.find_all('img')
|
||||
|
||||
@@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup):
|
||||
table.attrs['border'] = '1'
|
||||
|
||||
|
||||
def process_lists(body_tag):
|
||||
def process_lists(body_tag: BeautifulSoup):
|
||||
"""
|
||||
Function to process tags <li>.
|
||||
Unwrap <p> tags.
|
||||
"""
|
||||
li_tags = body_tag.find_all("li")
|
||||
Function
|
||||
- process tags <li>.
|
||||
- unwrap <p> tags.
|
||||
Parameters
|
||||
----------
|
||||
body_tag: Tag, soup object
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
|
||||
li_tags = body_tag.find_all("li")
|
||||
for li_tag in li_tags:
|
||||
if li_tag.p:
|
||||
li_tag.attrs.update(li_tag.p.attrs)
|
||||
@@ -113,7 +122,7 @@ def process_lists(body_tag):
|
||||
|
||||
|
||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
"""Function inserts span before tag to be removed(aren't supported by livecarta)"""
|
||||
"""Function inserts span before tag aren't supported by livecarta"""
|
||||
new_tag = main_tag.new_tag("span")
|
||||
new_tag.attrs['id'] = id_ or ''
|
||||
new_tag.attrs['class'] = class_ or ''
|
||||
@@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
tag.insert_before(new_tag)
|
||||
|
||||
|
||||
def clean_headings_content(content: Tag, title: str):
|
||||
def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
|
||||
def clean_headings_content(content: BeautifulSoup, title: str):
|
||||
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
|
||||
if tag_to_be_removed.attrs.get('id'):
|
||||
insert_span_with_attrs_before_tag(body_tag,
|
||||
tag_to_be_removed,
|
||||
@@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
|
||||
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
||||
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
||||
|
||||
"""
|
||||
footnotes = []
|
||||
noterefs_tags = source_html_tag.find_all(
|
||||
@@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
return footnotes, new_noterefs_tags, new_footnotes_tags
|
||||
|
||||
|
||||
def unwrap_structural_tags(body_tag):
|
||||
"""Main function that works with structure of html. Make changes inplace.
|
||||
def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
"""
|
||||
Main function that works with the structure of the html. Makes changes in place.
|
||||
Parameters
|
||||
----------
|
||||
body_tag: Tag, soup object
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. Extracts tags that are not needed
|
||||
|
||||
2. Checks that the marks pointing to the start of a chapter are placed on one level in the html tree.
|
||||
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
|
||||
This tag must have a body_tag as a parent.
|
||||
Otherwise, it is wrapped with some tags. Like:
|
||||
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
|
||||
|
||||
3. Headings that are not supported by livecarta are converted to <p>
|
||||
4. Wrapping NavigableString
|
||||
:param body_tag: Tag, soup object
|
||||
:return: None
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
|
||||
def preserve_class_in_aside_tag(tag_):
|
||||
@@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag):
|
||||
if not tag_.parent.attrs.get('class'):
|
||||
tag_.parent.attrs['class'] = tag_class
|
||||
|
||||
def preserve_class_in_section_tag(tag_) -> bool:
|
||||
def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
|
||||
"""
|
||||
to save css style inherited from class, copy class to child <p>
|
||||
Function saves css style inherited from class, copies class to child <p>
|
||||
returns True, if <section> could be unwrapped
|
||||
Parameters
|
||||
----------
|
||||
tag_: Tag, soup object
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool, True if <section> could be unwrapped
|
||||
|
||||
"""
|
||||
# this is for Wiley books with boxes
|
||||
tag_class = tag_.attrs['class'] if not isinstance(
|
||||
@@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag):
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
|
||||
def replace_div_tag_with_table():
|
||||
"""Function replace <div> with <table>:
|
||||
"""
|
||||
Function replaces <div> with <table>:
|
||||
1. Convert div with certain classes to tables
|
||||
2. Add background color to div with background-color
|
||||
|
||||
"""
|
||||
for div in body_tag.find_all("div"):
|
||||
if div.attrs.get('class'):
|
||||
@@ -431,22 +458,22 @@ def unwrap_structural_tags(body_tag):
|
||||
return body_tag
|
||||
|
||||
|
||||
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
"""After processing on a first_id that corresponds to current chapter,
|
||||
from initial html_soup all tags from current chapter are extracted
|
||||
|
||||
Parameters
|
||||
----------
|
||||
first_id :
|
||||
first_id:
|
||||
Id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
||||
href :
|
||||
href:
|
||||
Name of current chapter's file
|
||||
html_soup :
|
||||
html_soup: Tag, soup object
|
||||
Soup object of current file
|
||||
|
||||
Returns
|
||||
-------
|
||||
tags : list [Tag, NavigableString]
|
||||
tags: list [Tag, NavigableString]
|
||||
Chapter's tags
|
||||
|
||||
"""
|
||||
@@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def wrap_preformatted_span_with_table(main_tag, old_tag):
|
||||
def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
|
||||
"""Function wraps <span> with <table>"""
|
||||
table = main_tag.new_tag("table")
|
||||
table.attrs['border'] = '1px #ccc;'
|
||||
table.attrs['style'] = 'width:100%;'
|
||||
tbody = main_tag.new_tag("tbody")
|
||||
tr = main_tag.new_tag("tr")
|
||||
td = main_tag.new_tag("td")
|
||||
table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
|
||||
"tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;'
|
||||
td.attrs['bgcolor'] = '#f5f5f5'
|
||||
# td.attrs['border-radius'] = '4px'
|
||||
old_tag.wrap(td)
|
||||
span_tag.wrap(td)
|
||||
td.wrap(tr)
|
||||
tr.wrap(tbody)
|
||||
tbody.wrap(table)
|
||||
return table
|
||||
|
||||
|
||||
def preprocess_pre_tags(chapter_tag):
|
||||
"""Function preprocessing <pre> tags
|
||||
def preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function preprocesses <pre> tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
chapter_tag: Tag, soup object
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. cleaning \n
|
||||
2. heading removal
|
||||
3. processing tags
|
||||
4. class removal
|
||||
"""
|
||||
1. Process NavigableString
|
||||
2. Process Tags and their children
|
||||
|
||||
"""
|
||||
for pre in chapter_tag.find_all("pre"):
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
||||
new_tag.attrs = pre.attrs.copy()
|
||||
@@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
|
||||
"font-size: 14px; white-space: nowrap;"
|
||||
pre.replace_with(new_tag)
|
||||
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
|
||||
# add <p> to save brs
|
||||
p_for_br = chapter_tag.new_tag("p")
|
||||
p_for_br.string = "\xa0"
|
||||
table.insert_after(p_for_br)
|
||||
|
||||
|
||||
def preprocess_code_tags(chapter_tag: Tag):
|
||||
"""Function that
|
||||
- transform <code>, <kdb>, <var> tags into span
|
||||
- add code style to this tags
|
||||
def preprocess_code_tags(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function
|
||||
- transform <code>, <kbd>, <var> tags into span
|
||||
- add code style to this tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: Tag, soup object
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
"""
|
||||
for code in chapter_tag.find_all(re.compile("code|kbd|var")):
|
||||
code.name = "span"
|
||||
if code.parent.name == "pre":
|
||||
@@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag):
|
||||
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
||||
|
||||
|
||||
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
"""Function finalise processing/cleaning title"""
|
||||
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
||||
@@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str:
|
||||
|
||||
|
||||
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
||||
"""Function finalise processing/cleaning content
|
||||
"""
|
||||
Function finalises processing/cleaning of content
|
||||
Parameters
|
||||
----------
|
||||
title_str: str
|
||||
|
||||
content_tag: BeautifulSoup
|
||||
content_tag: Tag, soup object
|
||||
|
||||
remove_title_from_chapter: bool
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. cleaning \n
|
||||
1. find \n
|
||||
2. heading removal
|
||||
3. processing tags
|
||||
4. class removal
|
||||
@@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
-------
|
||||
content_tag: str
|
||||
prepared content
|
||||
"""
|
||||
|
||||
# 0. cleaning \n
|
||||
"""
|
||||
# 1. find \n
|
||||
to_remove = []
|
||||
for child in content_tag.contents:
|
||||
if isinstance(child, NavigableString):
|
||||
@@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
if s == '':
|
||||
to_remove.append(child)
|
||||
|
||||
# 1. heading removal
|
||||
# 2. heading removal
|
||||
if remove_title_from_chapter:
|
||||
clean_headings_content(content_tag, title_str)
|
||||
|
||||
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||
process_lists(content_tag)
|
||||
preprocess_table(content_tag)
|
||||
preprocess_code_tags(content_tag)
|
||||
preprocess_pre_tags(content_tag)
|
||||
preprocess_block_tags(content_tag)
|
||||
|
||||
# 3. class removal
|
||||
# 4. class removal
|
||||
for tag in content_tag.find_all(recursive=True):
|
||||
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
|
||||
'footnote-element']):
|
||||
|
||||
Reference in New Issue
Block a user