diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 9cb6421..c06a92c 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -63,10 +63,6 @@ def update_src_links_in_images(body_tag: Tag, return path2aws_path -def preprocess_figure(): - pass - - def preprocess_table(body_tag: BeautifulSoup): tables = body_tag.find_all("table") for table in tables: @@ -81,10 +77,7 @@ def preprocess_table(body_tag: BeautifulSoup): units = width_match.group(2) width = size+'px' - width = td.get('width') or width - - if width: - td.attrs['width'] = width + td.attrs['width'] = td.get('width') or width if td.attrs.get('style'): td.attrs['style'] = td.attrs['style'].replace('border:0;', '') @@ -151,7 +144,7 @@ def clean_headings_content(content: Tag, title: str): break -def _preprocessing_headings(body_tag): +def _heading_tag2p_tag(body_tag): """ Function to convert all lower level headings to p tags """ @@ -184,8 +177,8 @@ def replace_with_livecarta_anchor_tag(anchor, i): return new_tag -def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[ - list, list, list]: +def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \ + -> Tuple[list, list, list]: """ This function should be earlier that adding fonts in pipeline. @@ -248,6 +241,23 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note def unwrap_structural_tags(body_tag): + """ + Main function that works with structure of html. + Make changes inplace. + + 1. Extracts tags that are not needed + + 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. + Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. + This tag must have a body_tag as a parent. + Otherwise, it is wrapped with some tags. Like: +

+ + 3. Headings that are not supported by livecarta are converted to p tags

+ 4. Wrapping NavigableString + :param body_tag: Tag, soup object + :return: None + """ def _preserve_class_in_aside_tag(tag_): # to save css style inherited from class, copy class to aside tag (which is parent to tag_) @@ -362,8 +372,9 @@ def unwrap_structural_tags(body_tag): parents_marks_are_body = [x.parent == body_tag for x in marks] assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' - _preprocessing_headings(body_tag) + _heading_tag2p_tag(body_tag) + # wrap NavigableString with

for node in body_tag: if isinstance(node, NavigableString): content = str(node) @@ -378,19 +389,28 @@ def unwrap_structural_tags(body_tag): def get_tags_between_chapter_marks(first_id, href, html_soup): + """ + After processing on a first_id that corresponds to current chapter, + from initial html_soup all tags from current chapter are extracted + + :param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' + :param href: name of current chapter's file + :param html_soup: soup object of current file + :return: list [Tag, NavigableString]; chapter's tags + """ marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'}) if marked_tags: next_tag = marked_tags.next_sibling tags = [] while next_tag: - # TODO: why we hve there NavString - if not isinstance(next_tag, NavigableString) and\ (next_tag.attrs.get('class') == 'converter-chapter-mark'): break tags.append(next_tag) next_tag = next_tag.next_sibling + # remove tags between first_id and next found id + # save them in list for next steps tags = [tag.extract() for tag in tags] html_soup.smooth() @@ -513,6 +533,14 @@ def preprocess_code_tags(chapter_tag): def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]: + """ + Final processing/cleaning function. + + :param title: title of the chapter + :param chapter_tag: soup object + :param remove_title_from_chapter: bool + :return: tuple[str, str] + """ title_str = BeautifulSoup(title, features='lxml').string title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r' +', ' ', title_str).rstrip()