diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 6561662..c40c1ff 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -10,7 +10,7 @@ from src.livecarta_config import LiveCartaConfig def save_image_locally(img_file_path, img_content, book_id): - """ Function saves all images locally """ + """Function saves all images locally""" folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( folder_path, f'../json/img_{book_id}/')) @@ -25,7 +25,7 @@ def save_image_locally(img_file_path, img_content, book_id): def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): - """ Function saves all images to Amazon web service """ + """Function saves all images to Amazon web service""" link_path = access.send_image( img_file_path, doc_id=book_id, img_content=img_content) return link_path @@ -37,7 +37,7 @@ def update_images_src_links(body_tag: Tag, access=None, path2aws_path=None, book_id=None): - """ Function makes dictionary image_src_path -> Amazon web service_path """ + """Function makes dictionary image_src_path -> Amazon web service_path""" img_tags = body_tag.find_all('img') for img in img_tags: @@ -72,7 +72,7 @@ def update_images_src_links(body_tag: Tag, def preprocess_table(body_tag: BeautifulSoup): - """ Function to preprocess tables and tags(td|th|tr): style """ + """Function to preprocess tables and tags(td|th|tr): style""" tables = body_tag.find_all("table") for table in tables: ts = table.find_all(re.compile("td|th|tr")) @@ -84,13 +84,13 @@ def preprocess_table(body_tag: BeautifulSoup): r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) if width_match: size = width_match.group(1) - units = width_match.group(2) width = size+'px' t_tag.attrs['width'] = t_tag.get('width') or width if t_tag.attrs.get('style'): - t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '') + t_tag.attrs['style'] = t_tag.attrs['style'].replace( + 'border:0;', '') elif t_tag.attrs.get('style') == '': del t_tag.attrs['style'] @@ -113,7 +113,7 @@ def process_lists(body_tag): def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): - """ Function inserts span before tag to be removed(aren't supported by livecarta) """ + """Function inserts span before tag to be removed(aren't supported by livecarta)""" new_tag = main_tag.new_tag("span") new_tag.attrs['id'] = id_ or '' new_tag.attrs['class'] = class_ or '' @@ -157,7 +157,7 @@ def clean_headings_content(content: Tag, title: str): def heading_tag_to_p_tag(body_tag): - """ Function to convert all lower level headings to p tags """ + """Function to convert all lower level headings to p tags""" pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = body_tag.find_all(re.compile(pattern)) for tag in header_tags: @@ -165,7 +165,7 @@ def heading_tag_to_p_tag(body_tag): def clean_title_from_numbering(title: str): - """ Function removes numbering from titles """ + """Function removes numbering from titles""" title = re.sub(r'^(\s+)+', '', title) # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title @@ -174,7 +174,7 @@ def clean_title_from_numbering(title: str): def replace_with_livecarta_anchor_tag(anchor, i): - """ Function replace noteref_tag(anchor) with new livecarta tag """ + """Function replace noteref_tag(anchor) with new livecarta tag""" new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' new_tag['data-id'] = i + 1 @@ -194,7 +194,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
Here is an example footnote1
- """ + """ footnotes = [] noterefs_tags = source_html_tag.find_all( attrs={noteref_attr_name: 'noteref'}) @@ -207,13 +207,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note [tag.decompose() for tag in bad_noterefs_tags] def parse_a_tag_href(s: str) -> Tuple[str, str]: - """ Returns name of file & id of an anchor """ + """Returns name of file & id of an anchor""" assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' f, id_ = s.split('#') return f, id_ def verify_footnote_tag(tags: list): - """ Function verifies is tag - footnote """ + """Function verifies is tag - footnote""" assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' if len(tags) == 0: anchored_tags = list(target_html_tag.find_all(id=element_id)) @@ -259,9 +259,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note def unwrap_structural_tags(body_tag): - """ - Main function that works with structure of html. - Make changes inplace. + """Main function that works with structure of html. Make changes inplace. 1. Extracts tags that are not needed @@ -278,7 +276,7 @@ def unwrap_structural_tags(body_tag): """ def _preserve_class_in_aside_tag(tag_): - """ to save css style inherited from class, copy class to aside tag (which is parent to tag_) """ + """to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" # this is for Wiley books with boxes tag_class = tag_.attrs['class'] if not isinstance( tag_.attrs['class'], list) else tag_.attrs['class'][0] @@ -434,14 +432,23 @@ def unwrap_structural_tags(body_tag): def get_tags_between_chapter_marks(first_id, href, html_soup): - """ - After processing on a first_id that corresponds to current chapter, + """After processing on a first_id that corresponds to current chapter, from initial html_soup all tags from current chapter are extracted - :param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' - :param href: name of current chapter's file - :param html_soup: soup object of current file - :return: list [Tag, NavigableString]; chapter's tags + Parameters + ---------- + first_id : + Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' + href : + Name of current chapter's file + html_soup : + Soup object of current file + + Returns + ------- + tags : list [Tag, NavigableString] + Chapter's tags + """ marked_tags = html_soup.find( attrs={'id': first_id, 'class': 'converter-chapter-mark'}) @@ -467,7 +474,7 @@ def get_tags_between_chapter_marks(first_id, href, html_soup): def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): - """ Function wraps