From 9fb7a7eda21891eb63d3f7ca2d61d0a6d3b1e7d5 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 20 Aug 2021 16:18:52 +0300 Subject: [PATCH] epub converter: add new type of blocks --- src/html_epub_preprocessor.py | 45 ++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index baf1b59..76f07b5 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -250,13 +250,24 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note def unwrap_structural_tags(body_tag): def _preserve_class_in_aside_tag(tag_): - # to save css style inherited from class, copy class to aside tag + # to save css style inherited from class, copy class to aside tag (which is parent to tag_) # this is for Wiley books with boxes tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0] if tag_.parent.name == 'aside': if not tag_.parent.attrs.get('class'): tag_.parent.attrs['class'] = tag_class + def _preserve_class_in_section_tag(tag_): + # to save css style inherited from class, copy class to child

+ # this is for Wiley books with boxes + tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0] + child_p_tag = tag_.find_all("p") + if len(child_p_tag) != 1: + return + child_p_tag = child_p_tag[0] + if not child_p_tag.attrs.get('class'): + child_p_tag.attrs['class'] = tag_class + def _add_table_to_abc_books(tag_, border, bg_color): wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color) @@ -292,6 +303,7 @@ def unwrap_structural_tags(body_tag): for s in body_tag.find_all("section"): if s.attrs.get('class'): _preserve_class_in_aside_tag(s) + _preserve_class_in_section_tag(s) _add_span_to_save_ids_for_links(s) s.unwrap() @@ -411,26 +423,36 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co return table +def _clean_wiley_block(block): + hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) + for hr in hrs: + hr.extract() + h = block.find(re.compile("h[1-9]")) + if h: + h.name = "p" + h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) + + def preprocess_block_tags(chapter_tag): for block in chapter_tag.find_all("blockquote"): if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: - hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) - for hr in hrs: - hr.extract() - - h = block.find(re.compile("h[1-9]")) - if h: - h.name = "p" - h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) + _clean_wiley_block(block) color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color - wrap_block_tag_with_table(chapter_tag, block, color) + wrap_block_tag_with_table(chapter_tag, block, bg_color=color) block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) block.unwrap() + for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): + _clean_wiley_block(future_block) + color = 
'#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None + color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color + wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) + def _prepare_formatted(text): + # replace <,> to save them as is in html code text = text.replace("<", "\x3C") text = text.replace(">", "\x3E") text = text.replace('\t', "\xa0 \xa0 ") #     @@ -443,7 +465,7 @@ def preprocess_pre_tags(chapter_tag): new_tag = BeautifulSoup(features='lxml').new_tag("span") new_tag.attrs = pre.attrs.copy() spans = pre.find_all("span") - to_add_br = len(spans) > 1 + to_add_br = len(spans) > 1 # if in

<pre> there are multiple <span>, we need to add <br>
after each content for child in pre.children: if isinstance(child, NavigableString): @@ -470,6 +492,7 @@ def preprocess_pre_tags(chapter_tag): def preprocess_code_tags(chapter_tag): + # function that emulates style of <code>, <kdb>, <var> for code in chapter_tag.find_all(re.compile("code|kdb|var")): code.name = 'span' if code.parent.name == "pre":