forked from LiveCarta/BookConverter
refactor with PEP8
This commit is contained in:
@@ -497,7 +497,7 @@ class EpubConverter:
|
||||
id wraps chapter's content + subchapters' content
|
||||
id points to the start of title of a chapter
|
||||
|
||||
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id
|
||||
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
|
||||
and id of the next chapter/subchapter
|
||||
Parameters
|
||||
----------
|
||||
@@ -539,6 +539,7 @@ class EpubConverter:
|
||||
|
||||
lvl: int
|
||||
level of chapter
|
||||
|
||||
Returns
|
||||
-------
|
||||
ChapterItem
|
||||
@@ -597,7 +598,7 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../epub/9781614382264.epub"
|
||||
epub_file_path = "../../epub/9781641050234.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
|
||||
|
||||
"""
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
|
||||
"""Function inserts span before tag aren't supported by livecarta"""
|
||||
"""Function inserts span before tags that aren't supported by LiveCarta"""
|
||||
new_tag = chapter_tag.new_tag("span")
|
||||
new_tag.attrs["id"] = id_ or ""
|
||||
new_tag.attrs["class"] = class_ or ""
|
||||
@@ -77,22 +77,57 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
|
||||
|
||||
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
"""Function finalise processing/cleaning title"""
|
||||
title_str = BeautifulSoup(title_of_chapter, features="lxml").string
|
||||
"""
|
||||
Function finalises processing/cleaning of the title
|
||||
Parameters
|
||||
----------
|
||||
title_of_chapter: str
|
||||
|
||||
Returns
|
||||
-------
|
||||
title: str
|
||||
cleaned title
|
||||
|
||||
"""
|
||||
title = BeautifulSoup(title_of_chapter, features="lxml").string
|
||||
# replace every whitespace character ([\r\n\t\f\v ] and non-breaking space \xa0) with a plain space, then strip both ends
|
||||
title_str = re.sub(r"[\s\xa0]", " ", title_str).strip()
|
||||
return title_str
|
||||
title = re.sub(r"[\s\xa0]", " ", title).strip()
|
||||
return title
|
||||
|
||||
|
||||
def _remove_comments(chapter_tag):
|
||||
"""
|
||||
Function removes comments
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag without comments
|
||||
|
||||
"""
|
||||
for tag in chapter_tag.find_all():
|
||||
for element in tag(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
|
||||
|
||||
def _wrap_strings_with_p(chapter_tag):
|
||||
# Headings that are not supported by livecarta converts to <p>
|
||||
# wrap NavigableString with <p>
|
||||
"""
|
||||
Function converts headings that aren't supported by LiveCarta with <p>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with wrapped NavigableStrings
|
||||
|
||||
"""
|
||||
for node in chapter_tag:
|
||||
if isinstance(node, NavigableString):
|
||||
content = str(node)
|
||||
@@ -104,7 +139,19 @@ def _wrap_strings_with_p(chapter_tag):
|
||||
|
||||
|
||||
def _wrap_tags_with_table(chapter_tag):
|
||||
"""Function wraps <tag> with <table>"""
|
||||
"""
|
||||
Function wraps <tag> with <table>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with wrapped certain tags with <table>
|
||||
|
||||
"""
|
||||
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
|
||||
table = chapter_tag.new_tag("table")
|
||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||
@@ -141,7 +188,19 @@ def _wrap_tags_with_table(chapter_tag):
|
||||
|
||||
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag):
|
||||
"""Function to replace all tags to correspond livecarta tags"""
|
||||
"""
|
||||
Function replaces all tags with the corresponding LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
"""
|
||||
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
|
||||
for key in reg_keys:
|
||||
if isinstance(key, tuple):
|
||||
@@ -164,12 +223,23 @@ def _tags_to_correspond_livecarta_tag(chapter_tag):
|
||||
|
||||
|
||||
def _unwrap_tags(chapter_tag):
|
||||
"""Function unwrap tags and move id to span"""
|
||||
"""
|
||||
Function unwraps tags and moves id to span
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with unwrapped certain tags
|
||||
|
||||
"""
|
||||
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
|
||||
for tag in chapter_tag.select(tag_name):
|
||||
# if tag is a subtag
|
||||
if ">" in tag_name:
|
||||
parent = tag.parent
|
||||
tag.parent.attrs.update(tag.attrs)
|
||||
_add_span_to_save_ids_for_links(tag, chapter_tag)
|
||||
tag.unwrap()
|
||||
@@ -178,8 +248,8 @@ def _unwrap_tags(chapter_tag):
|
||||
def _remove_headings_content(content_tag, title_of_chapter: str):
|
||||
"""
|
||||
Function
|
||||
clean/remove headings from chapter in order to avoid duplication of chapter titles in the content
|
||||
add span with id in order to
|
||||
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
||||
- adds span with id in order to
|
||||
Parameters
|
||||
----------
|
||||
content_tag: soup object
|
||||
@@ -210,8 +280,20 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
|
||||
break
|
||||
|
||||
|
||||
def _preprocess_table(chapter_tag: BeautifulSoup):
|
||||
"""Function to preprocess tables and tags(td|th|tr): style"""
|
||||
def _process_table(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function preprocesses tables and tags(td|th|tr)
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with processed tables
|
||||
|
||||
"""
|
||||
tables = chapter_tag.find_all("table")
|
||||
for table in tables:
|
||||
for t_tag in table.find_all(re.compile("td|th|tr")):
|
||||
@@ -236,6 +318,19 @@ def _preprocess_table(chapter_tag: BeautifulSoup):
|
||||
|
||||
|
||||
def _insert_tags_in_parents(chapter_tag):
|
||||
"""
|
||||
Function inserts tags into the corresponding parent tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with inserted tags
|
||||
|
||||
"""
|
||||
parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
|
||||
for parent_tag_name, condition in parent_tag2condition.items():
|
||||
for parent_tag in chapter_tag.select(parent_tag_name):
|
||||
@@ -252,6 +347,19 @@ def _insert_tags_in_parents(chapter_tag):
|
||||
|
||||
|
||||
def _class_removing(chapter_tag):
|
||||
"""
|
||||
Function removes classes that aren't created by converter
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag without original classes of the book
|
||||
|
||||
"""
|
||||
for tag in chapter_tag.find_all(recursive=True):
|
||||
if tag.attrs.get("class") \
|
||||
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
||||
@@ -271,9 +379,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
|
||||
Steps
|
||||
----------
|
||||
1. heading removal
|
||||
2. processing tags
|
||||
3. class removal
|
||||
1. comments removal
|
||||
2. wrap NavigableString with tag <p>
|
||||
3. wrap tags with <table>
|
||||
4. replace tags with corresponding LiveCarta tags
|
||||
5. unwrap tags
|
||||
6. heading removal
|
||||
7. process_table
|
||||
8. insert tags into corresponding tags
|
||||
9. class removal
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -284,23 +398,22 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
# 1. remove comments
|
||||
_remove_comments(content_tag)
|
||||
|
||||
# 2. wrap NavigableString with tag <p>
|
||||
# 2. wrap NavigableString with tag <p>
|
||||
_wrap_strings_with_p(content_tag)
|
||||
|
||||
# 3. wrap tags with <table>
|
||||
_wrap_tags_with_table(content_tag)
|
||||
|
||||
# 4. replace tags with corresponding LiveCarta tags
|
||||
_tags_to_correspond_livecarta_tag(content_tag)
|
||||
|
||||
# 5. unwrap tags
|
||||
_unwrap_tags(content_tag)
|
||||
|
||||
# 3. heading removal
|
||||
# 6. heading removal (only when remove_title_from_chapter is set)
|
||||
if remove_title_from_chapter:
|
||||
_remove_headings_content(content_tag, title_str)
|
||||
|
||||
# 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
|
||||
_preprocess_table(content_tag)
|
||||
# 7. process tables
|
||||
_process_table(content_tag)
|
||||
# 8. insert tags into corresponding tags
|
||||
_insert_tags_in_parents(content_tag)
|
||||
|
||||
# 5. remove classes that weren't created by converter
|
||||
# 9. remove classes that weren't created by converter
|
||||
_class_removing(content_tag)
|
||||
return str(content_tag)
|
||||
|
||||
Reference in New Issue
Block a user