From 114ac78eb0d6f74163be8b01ea12c9cd138a3a2a Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 28 Jun 2022 16:39:50 +0300 Subject: [PATCH] refactor with PEP8 --- src/epub_converter/epub_converter.py | 5 +- src/epub_converter/html_epub_preprocessor.py | 169 ++++++++++++++++--- 2 files changed, 144 insertions(+), 30 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index ca7b69f..1ecc7a1 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -497,7 +497,7 @@ class EpubConverter: id wraps chapter"s content + subchapters" content id points to the start of title of a chapter - In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id + In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id and id of the next chapter/subchapter Parameters ---------- @@ -539,6 +539,7 @@ class EpubConverter: lvl: int level of chapter + Returns ------- ChapterItem @@ -597,7 +598,7 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9781614382264.epub" + epub_file_path = "../../epub/9781641050234.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index f9c2c06..3f762b4 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -21,7 +21,7 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou """ def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list): - """Function inserts span before tag aren't supported by livecarta""" + """Function inserts span before tag aren't supported by LiveCarta""" new_tag = chapter_tag.new_tag("span") new_tag.attrs["id"] = id_ or "" new_tag.attrs["class"] = class_ or "" @@ -77,22 +77,57 @@ def 
get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu def prepare_title(title_of_chapter: str) -> str: - """Function finalise processing/cleaning title""" - title_str = BeautifulSoup(title_of_chapter, features="lxml").string + """ + Function finalise processing/cleaning title + Parameters + ---------- + title_of_chapter: str + + Returns + ------- + title: str + cleaned title + + """ + title = BeautifulSoup(title_of_chapter, features="lxml").string # clean extra whitespace characters ([\r\n\t\f\v ]) - title_str = re.sub(r"[\s\xa0]", " ", title_str).strip() - return title_str + title = re.sub(r"[\s\xa0]", " ", title).strip() + return title def _remove_comments(chapter_tag): + """ + Function remove comments + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without comments + + """ for tag in chapter_tag.find_all(): for element in tag(text=lambda text: isinstance(text, Comment)): element.extract() def _wrap_strings_with_p(chapter_tag): - # Headings that are not supported by livecarta converts to

- # wrap NavigableString with <p>

+ """ + Function converts headings that aren't supported by LiveCarta with <p>

+ Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with wrapped NavigableStrings + + """ for node in chapter_tag: if isinstance(node, NavigableString): content = str(node) @@ -104,7 +139,19 @@ def _wrap_strings_with_p(chapter_tag): def _wrap_tags_with_table(chapter_tag): - """Function wraps with """ + """ + Function wraps with
+ Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with wrapped certain tags with
+ + """ def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): table = chapter_tag.new_tag("table") table.attrs["border"], table.attrs["align"], table.attrs["style"] \ @@ -141,7 +188,19 @@ def _wrap_tags_with_table(chapter_tag): def _tags_to_correspond_livecarta_tag(chapter_tag): - """Function to replace all tags to correspond livecarta tags""" + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags + + """ for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items(): for key in reg_keys: if isinstance(key, tuple): @@ -164,12 +223,23 @@ def _tags_to_correspond_livecarta_tag(chapter_tag): def _unwrap_tags(chapter_tag): - """Function unwrap tags and move id to span""" + """ + Function unwrap tags and moves id to span + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with unwrapped certain tags + + """ for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP: for tag in chapter_tag.select(tag_name): # if tag is a subtag if ">" in tag_name: - parent = tag.parent tag.parent.attrs.update(tag.attrs) _add_span_to_save_ids_for_links(tag, chapter_tag) tag.unwrap() @@ -178,8 +248,8 @@ def _unwrap_tags(chapter_tag): def _remove_headings_content(content_tag, title_of_chapter: str): """ Function - clean/remove headings from chapter in order to avoid duplication of chapter titles in the content - add span with id in order to + - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content + - adds span with id in order to Parameters ---------- content_tag: soup object @@ -210,8 +280,20 @@ def _remove_headings_content(content_tag, title_of_chapter: str): break -def _preprocess_table(chapter_tag: 
BeautifulSoup): - """Function to preprocess tables and tags(td|th|tr): style""" +def _process_table(chapter_tag: BeautifulSoup): + """ + Function preprocesses tables and tags(td|th|tr) + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with processed tables + + """ tables = chapter_tag.find_all("table") for table in tables: for t_tag in table.find_all(re.compile("td|th|tr")): @@ -236,6 +318,19 @@ def _preprocess_table(chapter_tag: BeautifulSoup): def _insert_tags_in_parents(chapter_tag): + """ + Function inserts tags into correspond tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with inserted tags + + """ parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()} for parent_tag_name, condition in parent_tag2condition.items(): for parent_tag in chapter_tag.select(parent_tag_name): @@ -252,6 +347,19 @@ def _insert_tags_in_parents(chapter_tag): def _class_removing(chapter_tag): + """ + Function removes classes that aren't created by converter + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without original classes of the book + + """ for tag in chapter_tag.find_all(recursive=True): if tag.attrs.get("class") \ and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): @@ -271,9 +379,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro Steps ---------- - 1. heading removal - 2. processing tags - 3. class removal + 1. comments removal + 2. wrap NavigableString with tag

+ 3. wrap tags with <table>

+ 4. replace tags with correspond LiveCarta tags + 5. unwrap tags + 6. heading removal + 7. process_table + 8. insert tags into correspond tags + 9. class removal Returns ------- @@ -284,23 +398,22 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro # 1. remove comments _remove_comments(content_tag) - # 2. wrap NavigableString with tag

+ # 2. _wrap_strings_with_p(content_tag) - + # 3. _wrap_tags_with_table(content_tag) - + # 4. _tags_to_correspond_livecarta_tag(content_tag) - + # 5. _unwrap_tags(content_tag) - - # 3. heading removal + # 6. if remove_title_from_chapter: _remove_headings_content(content_tag, title_str) - - # 4. processing tags (

  • ,
  • , ,
    , 
    , ) - _preprocess_table(content_tag) + # 7. + _process_table(content_tag) + # 8. _insert_tags_in_parents(content_tag) - # 5. remove classes that weren't created by converter + # 9. remove classes that weren't created by converter _class_removing(content_tag) return str(content_tag)