refactor with PEP8

This commit is contained in:
Kiryl
2022-06-28 16:39:50 +03:00
parent f01f6ad778
commit 114ac78eb0
2 changed files with 144 additions and 30 deletions

View File

@@ -497,7 +497,7 @@ class EpubConverter:
id wraps chapter's content + subchapters' content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
and id of the next chapter/subchapter
Parameters
----------
@@ -539,6 +539,7 @@ class EpubConverter:
lvl: int
level of chapter
Returns
-------
ChapterItem
@@ -597,7 +598,7 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../epub/9781614382264.epub"
epub_file_path = "../../epub/9781641050234.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])

View File

@@ -21,7 +21,7 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
"""Function inserts span before tag aren't supported by livecarta"""
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
@@ -77,22 +77,57 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
def prepare_title(title_of_chapter: str) -> str:
"""Function finalise processing/cleaning title"""
title_str = BeautifulSoup(title_of_chapter, features="lxml").string
"""
Function finalises processing/cleaning of the title
Parameters
----------
title_of_chapter: str
Returns
-------
title: str
cleaned title
"""
title = BeautifulSoup(title_of_chapter, features="lxml").string
# clean extra whitespace characters ([\r\n\t\f\v ])
title_str = re.sub(r"[\s\xa0]", " ", title_str).strip()
return title_str
title = re.sub(r"[\s\xa0]", " ", title).strip()
return title
def _remove_comments(chapter_tag):
    """
    Function removes comments
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag without comments

    """
    # One recursive search from the root visits every comment exactly once,
    # including comments that are direct children of chapter_tag itself
    # (searching only inside each descendant tag never looks at those).
    # find_all() returns a materialised result list, so extracting nodes
    # while looping over it does not disturb the iteration.
    for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
def _wrap_strings_with_p(chapter_tag):
# Headings that are not supported by livecarta converts to <p>
# wrap NavigableString with <p>
"""
Function converts headings that aren't supported by LiveCarta to <p> and wraps NavigableStrings with <p>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped NavigableStrings
"""
for node in chapter_tag:
if isinstance(node, NavigableString):
content = str(node)
@@ -104,7 +139,19 @@ def _wrap_strings_with_p(chapter_tag):
def _wrap_tags_with_table(chapter_tag):
"""Function wraps <tag> with <table>"""
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped certain tags with <table>
"""
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
@@ -141,7 +188,19 @@ def _wrap_tags_with_table(chapter_tag):
def _tags_to_correspond_livecarta_tag(chapter_tag):
"""Function to replace all tags to correspond livecarta tags"""
"""
Function replaces all tags with the corresponding LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with all tags replaced with LiveCarta tags
"""
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
for key in reg_keys:
if isinstance(key, tuple):
@@ -164,12 +223,23 @@ def _tags_to_correspond_livecarta_tag(chapter_tag):
def _unwrap_tags(chapter_tag):
"""Function unwrap tags and move id to span"""
"""
Function unwraps tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with unwrapped certain tags
"""
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
for tag in chapter_tag.select(tag_name):
# if tag is a subtag
if ">" in tag_name:
parent = tag.parent
tag.parent.attrs.update(tag.attrs)
_add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap()
@@ -178,8 +248,8 @@ def _unwrap_tags(chapter_tag):
def _remove_headings_content(content_tag, title_of_chapter: str):
"""
Function
clean/remove headings from chapter in order to avoid duplication of chapter titles in the content
add span with id in order to
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
content_tag: soup object
@@ -210,8 +280,20 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
break
def _preprocess_table(chapter_tag: BeautifulSoup):
"""Function to preprocess tables and tags(td|th|tr): style"""
def _process_table(chapter_tag: BeautifulSoup):
"""
Function preprocesses tables and tags(td|th|tr)
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with processed tables
"""
tables = chapter_tag.find_all("table")
for table in tables:
for t_tag in table.find_all(re.compile("td|th|tr")):
@@ -236,6 +318,19 @@ def _preprocess_table(chapter_tag: BeautifulSoup):
def _insert_tags_in_parents(chapter_tag):
"""
Function inserts tags into the corresponding parent tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with inserted tags
"""
parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
for parent_tag_name, condition in parent_tag2condition.items():
for parent_tag in chapter_tag.select(parent_tag_name):
@@ -252,6 +347,19 @@ def _insert_tags_in_parents(chapter_tag):
def _class_removing(chapter_tag):
"""
Function removes classes that aren't created by converter
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag without original classes of the book
"""
for tag in chapter_tag.find_all(recursive=True):
if tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
@@ -271,9 +379,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
Steps
----------
1. heading removal
2. processing tags
3. class removal
1. comments removal
2. wrap NavigableString with tag <p>
3. wrap tags with <table>
4. replace tags with corresponding LiveCarta tags
5. unwrap tags
6. heading removal
7. process_table
8. insert tags into corresponding parent tags
9. class removal
Returns
-------
@@ -284,23 +398,22 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
# 1. remove comments
_remove_comments(content_tag)
# 2. wrap NavigableString with tag <p>
# 2.
_wrap_strings_with_p(content_tag)
# 3.
_wrap_tags_with_table(content_tag)
# 4.
_tags_to_correspond_livecarta_tag(content_tag)
# 5.
_unwrap_tags(content_tag)
# 3. heading removal
# 6.
if remove_title_from_chapter:
_remove_headings_content(content_tag, title_str)
# 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
_preprocess_table(content_tag)
# 7.
_process_table(content_tag)
# 8.
_insert_tags_in_parents(content_tag)
# 5. remove classes that weren't created by converter
# 9. remove classes that weren't created by converter
_class_removing(content_tag)
return str(content_tag)