diff --git a/src/data_objects.py b/src/data_objects.py index 8c51bc3..ac04284 100644 --- a/src/data_objects.py +++ b/src/data_objects.py @@ -38,7 +38,7 @@ class NavPoint: def flatten(x): - """magic function from stackoverflow for list flattening""" + """Magic function from stackoverflow for list flattening""" atom = lambda i: not isinstance(i, list) nil = lambda i: not i car = lambda i: i[0] diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index 59b11af..2f6996f 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -28,24 +28,27 @@ list_types = ['circle', 'disc', 'armenian', 'decimal', 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] -def convert_tag_values(value): - """Function 1. converts values of tags from em/%/pt to px - 2. find closest font-size px +def convert_tag_values(value: str) -> str: + """ + Function + - converts values of tags from em/%/pt to px + - find closest font-size px Parameters ---------- value: str Returns ------- - converted value: str - """ + value: str + """ def find_closest_size(value): possible_sizes = list(takewhile(lambda x: value > x, sizes_pr)) last_possible_size_index = sizes_pr.index(possible_sizes[-1]) return sizes_px[last_possible_size_index] - font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') + font_size_regexp = re.compile( + r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') has_style_attrs = re.search(font_size_regexp, value) if has_style_attrs: if has_style_attrs.group(1): @@ -61,8 +64,7 @@ def convert_tag_values(value): return value - -""" +""" Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } Style properties that can be used to fit livecarta css style convention. If property has empty list, it means that any value can be converted. 
@@ -164,17 +166,20 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { def check_style_to_be_tag(style) -> List[tuple]: - """Function search style properties that can be converted to tags. + """ + Function searches style properties that can be converted to tags. It searches for them and prepare list of properties to be removed from style string Parameters ---------- style: str + Returns ------- - properties to remove: list - """ + to_remove: list + properties to remove + """ to_remove = [] for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: if f'{k[0]}:{k[1]}' in style: @@ -203,7 +208,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type): def build_css_content(css_content): - """ Build css content with livecarta convention """ + """Build css content with livecarta convention""" sheet = cssutils.parseString(css_content, validate=False) for css_rule in sheet: @@ -227,7 +232,7 @@ class TagStyleConverter: @staticmethod def remove_white_if_no_bgcolor(style_, tag): - """ Function remove white color if there is no text bg color """ + """Function remove text white color if there is no bg color""" if 'background' in style_: return style_ @@ -264,9 +269,11 @@ class TagStyleConverter: item = item.split(':') if item[0] in ['text-indent', 'margin-left', 'margin']: if len(item[1].split(' ')) == 3: - item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value + item[1] = convert_tag_values(item[1].split( + ' ')[-2]) # split returns middle value else: - item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value + item[1] = convert_tag_values(item[1].split( + ' ')[-1]) # split returns last value clean_style += item[0] + ': ' + item[1] + '; ' margin_left_regexp = re.compile( @@ -360,7 +367,7 @@ class TagStyleConverter: s = f'{attr}:{value};' self.style = self.style.replace(s, '') self.style = self.style.strip() - if i == 0: + if not i: self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( attr, value)] 
new_tags.append(self.tag_with_inline_style) @@ -388,7 +395,7 @@ class TagStyleConverter: @staticmethod def wrap_span_in_p_to_save_style_attrs(tag): - """ Function designed to save style attrs that cannot be in p -> span """ + """Function designed to save style attrs that cannot be in p -> span""" if tag.name == 'p' and tag.attrs.get('style'): styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] @@ -402,7 +409,6 @@ class TagStyleConverter: if has_p_style_attrs: p_style += item + ';' initial_style = initial_style.replace(item + ';', '') - # here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top' styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( '-' + attr not in initial_style) for attr in styles_cant_be_in_p] @@ -410,30 +416,30 @@ class TagStyleConverter: # if find styles that cannot be in

-> wrap them in span tag.name = 'span' p_tag = BeautifulSoup(features='lxml').new_tag('p') - li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_li_style_attr = re.search(li_attrs_regexp, initial_style) - span_style = initial_style if not has_li_style_attr else initial_style.replace( - has_li_style_attr.group(1), '') + p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') + has_p_style_attr = re.search(p_attrs_regexp, initial_style) + span_style = initial_style if not has_p_style_attr else initial_style.replace( + has_p_style_attr.group(1), '') p_tag.attrs['style'] = p_style tag.attrs['style'] = span_style tag.wrap(p_tag) - else: tag.attrs['style'] = p_style + else: + tag.attrs['style'] = p_style @staticmethod def wrap_span_in_li_to_save_style_attrs(tag): - """ Function designed to save style attrs that cannot be in li -> span """ + """Function designed to save style attrs that cannot be in li -> span""" if tag.name == 'li' and tag.attrs.get('style'): styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['text-align', 'list-style-type']] - styles_to_be_saved = [attr in tag.attrs.get( + styles_to_be_saved_in_span = [attr in tag.attrs.get( 'style') for attr in styles_cant_be_in_li] - if any(styles_to_be_saved): + if any(styles_to_be_saved_in_span): tag.name = 'span' li_tag = BeautifulSoup(features='lxml').new_tag('li') span_style = tag.attrs['style'] li_style = '' - for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), re.compile(r'(list-style-type:(\w+);)')]: has_li_style_attrs = re.search( @@ -442,39 +448,38 @@ class TagStyleConverter: li_style += has_li_style_attrs.group(1) span_style = span_style.replace( has_li_style_attrs.group(1), '') - li_tag.attrs['style'] = li_style tag.attrs['style'] = span_style tag.wrap(li_tag) @staticmethod def wrap_span_in_ul_ol_to_save_style_attrs(tag): - """ Function designed to save style attrs that cannot be in ul/ol -> span """ + """Function designed to save style attrs that 
cannot be in ul/ol -> span""" if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): styles_cant_be_in_ul_ol = [ attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] - check = [attr in tag.attrs.get('style') - for attr in styles_cant_be_in_ul_ol] - if any(check): + styles_to_be_saved_in_span = [attr in tag.attrs.get('style') + for attr in styles_cant_be_in_ul_ol] + if any(styles_to_be_saved_in_span): tag.name = 'span' - li_tag = BeautifulSoup(features='lxml').new_tag('ul') + oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name) span_style = tag.attrs['style'] - possible_li_attrs_regexp = re.compile( + possible_uol_attrs_regexp = re.compile( r'(list-style-type:(\w+);)') - has_li_style_attrs = re.search( - possible_li_attrs_regexp, span_style) - if has_li_style_attrs and has_li_style_attrs.group(1): - oul_style = has_li_style_attrs.group(1) + has_uol_style_attrs = re.search( + possible_uol_attrs_regexp, span_style) + if has_uol_style_attrs and has_uol_style_attrs.group(1): + oul_style = has_uol_style_attrs.group(1) span_style = span_style.replace(oul_style, '') - li_tag.attrs['style'] = oul_style + oul_tag.attrs['style'] = oul_style tag.attrs['style'] = span_style - tag.wrap(li_tag) + tag.wrap(oul_tag) @staticmethod def wrap_span_in_h_to_save_style_attrs(tag): - """ Function designed to save style attrs that cannot be in h -> span """ + """Function designed to save style attrs that cannot be in h -> span""" h_regexp = re.compile('(^h[1-9]$)') if re.search(h_regexp, tag.name) and tag.attrs.get('style'): @@ -482,10 +487,10 @@ class TagStyleConverter: tag.name = 'span' tag.wrap(h_tag) style = tag.attrs['style'] - li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_li_style_attr = re.search(li_attrs_regexp, style) - tag.attrs['style'] = style if not has_li_style_attr else style.replace( - has_li_style_attr.group(1), '') + h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') + has_h_style_attr = re.search(h_attrs_regexp, style) 
+ tag.attrs['style'] = style if not has_h_style_attr else style.replace( + has_h_style_attr.group(1), '') def convert_initial_tag(self): self.tag_with_inline_style = self.change_attrs_with_corresponding_tags() @@ -496,8 +501,8 @@ class TagStyleConverter: return self.tag_with_inline_style -def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): - """ Function adds styles from .css to inline style """ +def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: + """Function adds styles from .css to inline style""" css_text = css_text.replace( '@namespace epub "http://www.idpf.org/2007/ops";', '') livecarta_tmp_ids = [] diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index cf11cc8..2a538fd 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -1,7 +1,6 @@ import re import json import codecs -import logging import os from os.path import dirname, normpath, join from itertools import chain @@ -51,7 +50,8 @@ class EpubConverter: # flag to be updated while ebooklib.toc is parsed self.id_anchor_exist_in_nav_points = False self.img_href2img_bytes = {} # file path to bytes - self.book_image_src_path2aws_path = {} # file path from to generated aws path + # file path from to generated aws path + self.book_image_src_path2aws_path = {} self.footnotes_contents: List[str] = [] # to be sent on server as is self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote @@ -116,7 +116,6 @@ class EpubConverter: return nodes def get_css_content(self, css_href, html_href): - path_to_css_from_html = css_href html_folder = dirname(html_href) path_to_css_from_root = normpath( @@ -132,8 +131,8 @@ class EpubConverter: The first is css_href2css_content. It is created to connect href of css to content of css The second is html_href2css_href. 
It is created to connect href of html to css files(hrefs of them) which are used on this html ...2... = key2value - """ + """ # dictionary: href of html to related css files html_href2css_href: defaultdict = defaultdict(list) css_href2css_content: dict = {} @@ -165,6 +164,7 @@ class EpubConverter: """ This function is designed to update html_href2html_body_soup And add to html_inline_style css_style_content + """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): @@ -191,8 +191,8 @@ class EpubConverter: :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx) :param lvl: level of depth - """ + """ if isinstance(element, Link): nav_point = NavPoint(element) if nav_point.id: @@ -215,7 +215,8 @@ class EpubConverter: sub_nodes = [] for elem in second: if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: - self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl)) + self.offset_sub_nodes.append( + self.build_adjacency_list_from_toc(elem, lvl)) else: sub_nodes.append( self.build_adjacency_list_from_toc(elem, lvl + 1)) @@ -239,8 +240,8 @@ class EpubConverter: else: assert 0, f'Error. 
Element is not tuple/Link/list instance: {type(element)}' - def is_toc_empty(self): - """ Function checks is toc empty """ + def is_toc_empty(self) -> bool: + """Function checks is toc empty""" # there is no toc in ebook or no top chapters if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): return True @@ -258,7 +259,7 @@ class EpubConverter: self.hrefs_added_to_toc.add(nav_point.href) def add_not_added_files_to_adjacency_list(self, not_added): - """ Function add files that not added to adjacency list """ + """Function add files that not added to adjacency list""" for i, file in enumerate(not_added): nav_point = NavPoint( Section(f'To check #{i}, filename: {file}', file)) @@ -295,19 +296,26 @@ class EpubConverter: new_anchor_span.string = "\xa0" return new_anchor_span - def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): + def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str: """ + Function used to find full path to file that is parsed from tag link TOC: a/b/c.xhtml - b/c.xhtml -> a/b/c.xhtml c.xhtml -> a/b/c.xhtml + Parameters + ---------- + cur_file_path: str + path to current file with tag link + href_in_link: str + filename got from tag link, like file1.xhtml + internal_link_tag: Tag + tag object that is parsed now - Used to find full path to file that is parsed from tag link + Returns + ------- + full_path[0]: s + prepared content - :param cur_file_path: path to current file with tag link - :param href_in_link: filename got from tag link, like file1.xhtml - :param internal_link_tag: tag object that is parsed now - :return: """ dir_name = os.path.dirname(cur_file_path) normed_path = os.path.normpath(os.path.join( @@ -331,6 +339,12 @@ class EpubConverter: Function - processing internal links in a book - make ids unique + Steps + ---------- + 1. rebuild ids to be unique in all documents + 2a. process anchor which is a whole xhtml file + 2b. 
process anchor which is an element in xhtml file + """ # 1. rebuild ids to be unique in all documents for toc_href in self.hrefs_added_to_toc: @@ -344,7 +358,7 @@ class EpubConverter: new_id = self.create_unique_id(toc_href, tag.attrs['id']) tag.attrs['id'] = new_id - # 2.a) process anchor which is a whole xhtml file + # 2a. process anchor which is a whole xhtml file internal_link_reg1 = re.compile( r'(^(?!https?://).+\.(htm|html|xhtml)$)') for toc_href in self.hrefs_added_to_toc: @@ -367,7 +381,7 @@ class EpubConverter: del internal_link_tag.attrs['href'] - # 2.b) process anchor which is an element in xhtml file + # 2b. process anchor which is an element in xhtml file internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)') for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] @@ -418,9 +432,9 @@ class EpubConverter: f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Old id={a_tag_id}') - def build_one_chapter(self, nav_point): + def build_one_chapter(self, nav_point: NavPoint): """ - Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) + Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) 3 cases: id wraps all chapter content, @@ -429,7 +443,13 @@ class EpubConverter: In all cases we know where chapter starts. 
Therefore chapter is all tags between chapter's id and id of the next chapter/subchapter + Parameters + ---------- + nav_point: NavPoint + Returns + ------- + None """ if nav_point.id: soup = self.html_href2html_body_soup[nav_point.href] @@ -446,7 +466,7 @@ class EpubConverter: self.build_one_chapter(sub_node) def define_chapters_content(self): - """ Function build chapters content starts from top level chapters """ + """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: for point in top_level_nav_points: @@ -483,8 +503,8 @@ class EpubConverter: self.logger.log(f'{indent}Chapter: {title} is prepared.') return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) - def convert_to_dict(self): - """ Function which convert list of html nodes to appropriate json structure. """ + def convert_to_dict(self) -> dict: + """Function which convert list of html nodes to appropriate json structure""" top_level_nav_points = self.adjacency_list[-1] top_level_chapters = [] @@ -502,7 +522,7 @@ class EpubConverter: if __name__ == "__main__": - filename = '9781641051217' + filename = '9781614382264' logger_object = BookLogger(name='epub', book_id=filename) json_converter = EpubConverter(f'../../epub/{filename}.epub', diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 8defe7a..6ec8c53 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -2,7 +2,7 @@ from src.book_solver import BookSolver from src.epub_converter.epub_converter import EpubConverter class EpubBook(BookSolver): - """ Class of .epub type book - child of BookSolver """ + """Class of .epub type book - child of BookSolver""" def __init__(self, book_id=0, access=None, main_logger=None): super().__init__(book_id, access, main_logger) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 
d06241a..d340586 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -9,7 +9,7 @@ from src.access import Access from src.livecarta_config import LiveCartaConfig -def save_image_locally(img_file_path, img_content, book_id): +def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): """Function saves all images locally""" folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( @@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id): return new_img_path -def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): +def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): """Function saves all images to Amazon web service""" link_path = access.send_image( img_file_path, doc_id=book_id, img_content=img_content) return link_path -def update_images_src_links(body_tag: Tag, +def update_images_src_links(body_tag: BeautifulSoup, href2img_content: dict, - path_to_html, + path_to_html: str, access=None, - path2aws_path=None, - book_id=None): + path2aws_path: dict = None, + book_id: str = None) -> dict: """Function makes dictionary image_src_path -> Amazon web service_path""" img_tags = body_tag.find_all('img') @@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup): table.attrs['border'] = '1' -def process_lists(body_tag): +def process_lists(body_tag: BeautifulSoup): """ - Function to process tags

  • . - Unwrap

    tags. - """ - li_tags = body_tag.find_all("li") + Function + - process tags

  • . + - unwrap

    tags. + Parameters + ---------- + body_tag: Tag, soup object + Returns + ------- + None + + """ + + li_tags = body_tag.find_all("li") for li_tag in li_tags: if li_tag.p: li_tag.attrs.update(li_tag.p.attrs) @@ -113,7 +122,7 @@ def process_lists(body_tag): def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): - """Function inserts span before tag to be removed(aren't supported by livecarta)""" + """Function inserts span before tag aren't supported by livecarta""" new_tag = main_tag.new_tag("span") new_tag.attrs['id'] = id_ or '' new_tag.attrs['class'] = class_ or '' @@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): tag.insert_before(new_tag) -def clean_headings_content(content: Tag, title: str): - def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag): +def clean_headings_content(content: BeautifulSoup, title: str): + def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): if tag_to_be_removed.attrs.get('id'): insert_span_with_attrs_before_tag(body_tag, tag_to_be_removed, @@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note

    Here is an example footnote1

    + """ footnotes = [] noterefs_tags = source_html_tag.find_all( @@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note return footnotes, new_noterefs_tags, new_footnotes_tags -def unwrap_structural_tags(body_tag): - """Main function that works with structure of html. Make changes inplace. +def unwrap_structural_tags(body_tag: BeautifulSoup): + """ + Main function that works with structure of html. Make changes inplace. + Parameters + ---------- + body_tag: Tag, soup object + Steps + ---------- 1. Extracts tags that are not needed - 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. This tag must have a body_tag as a parent. Otherwise, it is wrapped with some tags. Like:

    - 3. Headings that are not supported by livecarta converts to

    4. Wrapping NavigableString - :param body_tag: Tag, soup object - :return: None + + Returns + ------- + None + """ def preserve_class_in_aside_tag(tag_): @@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag): if not tag_.parent.attrs.get('class'): tag_.parent.attrs['class'] = tag_class - def preserve_class_in_section_tag(tag_) -> bool: + def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: """ - to save css style inherited from class, copy class to child

    + Function saves css style inherited from class, copies class to child

    returns True, if

    could be unwrapped + Parameters + ---------- + tag_: Tag, soup object + + Returns + ------- + None + """ # this is for Wiley books with boxes tag_class = tag_.attrs['class'] if not isinstance( @@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag): class_=tag_to_be_removed.attrs.get('class')) def replace_div_tag_with_table(): - """Function replace
    with : + """ + Function replace
    with
    : 1. Convert div with certain classes to tables 2. Add background color to div with background-color + """ for div in body_tag.find_all("div"): if div.attrs.get('class'): @@ -431,22 +458,22 @@ def unwrap_structural_tags(body_tag): return body_tag -def get_tags_between_chapter_marks(first_id, href, html_soup): +def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: """After processing on a first_id that corresponds to current chapter, from initial html_soup all tags from current chapter are extracted Parameters ---------- - first_id : + first_id: Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' - href : + href: Name of current chapter's file - html_soup : + html_soup: Tag, soup object Soup object of current file Returns ------- - tags : list [Tag, NavigableString] + tags: list [Tag, NavigableString] Chapter's tags """ @@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str: return text -def wrap_preformatted_span_with_table(main_tag, old_tag): +def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag: """Function wraps with
    """ - table = main_tag.new_tag("table") - table.attrs['border'] = '1px #ccc;' - table.attrs['style'] = 'width:100%;' - tbody = main_tag.new_tag("tbody") - tr = main_tag.new_tag("tr") - td = main_tag.new_tag("td") + table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag( + "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;' td.attrs['bgcolor'] = '#f5f5f5' # td.attrs['border-radius'] = '4px' - old_tag.wrap(td) + span_tag.wrap(td) td.wrap(tr) tr.wrap(tbody) tbody.wrap(table) return table -def preprocess_pre_tags(chapter_tag): - """Function preprocessing
     tags
    +def preprocess_pre_tags(chapter_tag: BeautifulSoup):
    +    """
    +    Function preprocessing <pre> tags
         Parameters
         ----------
    -    chapter_tag: BeautifulSoup
    +    chapter_tag: Tag, soup object
     
         Steps
         ----------
    -    1. cleaning \n
    -    2. heading removal
    -    3. processing tags
    -    4. class removal
    -    """
    +    1. Process NavigableString
    +    2. Process Tags and their children
     
    +    """
         for pre in chapter_tag.find_all("pre"):
             new_tag = BeautifulSoup(features='lxml').new_tag("span")
             new_tag.attrs = pre.attrs.copy()
    @@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
                                      "font-size: 14px; white-space: nowrap;"
             pre.replace_with(new_tag)
             table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
    +        # add 

    to save brs p_for_br = chapter_tag.new_tag("p") p_for_br.string = "\xa0" table.insert_after(p_for_br) -def preprocess_code_tags(chapter_tag: Tag): - """Function that - - transform , , tags into span - - add code style to this tags +def preprocess_code_tags(chapter_tag: BeautifulSoup): """ + Function + - transform , , tags into span + - add code style to this tags + Parameters + ---------- + chapter_tag: Tag, soup object + Returns + ------- + None + + """ for code in chapter_tag.find_all(re.compile("code|kbd|var")): code.name = "span" if code.parent.name == "pre": @@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag): code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;' - def prepare_title(title_of_chapter: str) -> str: """Function finalise processing/cleaning title""" title_str = BeautifulSoup(title_of_chapter, features='lxml').string @@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str: def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: - """Function finalise processing/cleaning content + """ + Function finalise processing/cleaning content Parameters ---------- title_str: str - content_tag: BeautifulSoup + content_tag: Tag, soup object remove_title_from_chapter: bool Steps ---------- - 1. cleaning \n + 1. find \n 2. heading removal 3. processing tags 4. class removal @@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro ------- content_tag: str prepared content - """ - # 0. cleaning \n + """ + # 1. find \n to_remove = [] for child in content_tag.contents: if isinstance(child, NavigableString): @@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro if s == '': to_remove.append(child) - # 1. heading removal + # 2. heading removal if remove_title_from_chapter: clean_headings_content(content_tag, title_str) - # 2. processing tags (

  • ,
  • , ,
    , )
    +    # 3. processing tags (
  • ,
  • , ,
    , )
         process_lists(content_tag)
         preprocess_table(content_tag)
         preprocess_code_tags(content_tag)
         preprocess_pre_tags(content_tag)
         preprocess_block_tags(content_tag)
     
    -    # 3. class removal
    +    # 4. class removal
         for tag in content_tag.find_all(recursive=True):
             if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                     'footnote-element']):