From 7963486d7cbe7df44dd8e2c27445c90db581c7ff Mon Sep 17 00:00:00 2001
From: shirshasa
Date: Thu, 8 Jul 2021 15:08:34 +0300
Subject: [PATCH] epub converter: add aside as table - block processing

---
Reviewer note: the two new functions were amended during review (list-valued
``class`` handling, docstrings), so the post-image blob id on the "index" line
below no longer matches the patched file exactly.

 src/html_epub_preprocessor.py | 70 +++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py
index ff50c4c..9e3497f 100644
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -129,6 +129,7 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
     new_tag = main_tag.new_tag("span")
     new_tag.attrs['id'] = id_ or ''
     new_tag.attrs['class'] = class_ or ''
+    new_tag.string = "\xa0"
     tag.insert_before(new_tag)
 
 
@@ -298,6 +299,11 @@ def unwrap_structural_tags(body_tag):
         div.unwrap()
 
     for s in body_tag.find_all("section"):
+        if s.attrs.get('class'):
+            class_ = s.attrs['class'] if not isinstance(s.attrs['class'], list) else s.attrs['class'][0]
+            if s.parent.name == 'aside':
+                if not s.parent.attrs.get('class'):
+                    s.parent.attrs['class'] = class_
         _add_span_to_save_ids_for_links(s)
         s.unwrap()
 
@@ -398,6 +404,69 @@ def wrap_span_with_table(main_tag, old_tag):
     return table
 
 
+def wrap_block_with_table(main_tag, old_tag, color=None):
+    """Wrap ``old_tag`` in a one-cell bordered table and return the table.
+
+    New tags are created with ``main_tag.new_tag``. When ``color`` is truthy
+    it becomes the cell's ``bgcolor``. A ``<br>`` is inserted after the table.
+    """
+    table = main_tag.new_tag("table")
+    table.attrs['border'] = '1px solid'
+    table.attrs['align'] = 'center'
+    table.attrs['style'] = 'width:95%;'
+    tbody = main_tag.new_tag("tbody")
+    tr = main_tag.new_tag("tr")
+    td = main_tag.new_tag("td")
+    # NOTE(review): kept as a plain attribute, exactly as before; border-radius
+    # is a CSS property, so this may need to move into ``style`` - confirm.
+    td.attrs['border-radius'] = '8px'
+    if color:
+        td.attrs['bgcolor'] = color
+    # Wrap inside-out: each new tag takes the place of the one it wraps.
+    old_tag.wrap(td)
+    td.wrap(tr)
+    tr.wrap(tbody)
+    tbody.wrap(table)
+    table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
+    return table
+
+
+def preprocess_block_tags(chapter_tag):
+    """Convert ``feature1``..``feature4`` blockquotes into one-cell tables.
+
+    For each such blockquote: ``<p>`` tags whose class matches ``.+ hr`` are
+    removed, the first tag whose name matches ``h[1-9]`` becomes a ``<p>``
+    preceded by a ``<br>``, and the blockquote is replaced by a table holding
+    its content (``feature1`` and ``feature2`` get a grey background).
+    """
+    feature_colors = {
+        'feature1': '#DDDDDD',
+        'feature2': '#EEEEEE',
+        'feature3': None,
+        'feature4': None,
+    }
+    for block in chapter_tag.find_all("blockquote"):
+        class_ = block.attrs.get('class')
+        # ``class`` can be a list (BeautifulSoup multi-valued attribute) or a
+        # plain string; use the first class, as unwrap_structural_tags does.
+        if isinstance(class_, list):
+            class_ = class_[0] if class_ else None
+        if class_ not in feature_colors:
+            continue
+
+        for hr in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
+            hr.extract()
+
+        h = block.find(re.compile("h[1-9]"))
+        if h:
+            h.name = "p"
+            h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
+
+        wrap_block_with_table(chapter_tag, block, feature_colors[class_])
+        block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
+        block.unwrap()
+
+
 def preprocess_pre_tags(chapter_tag):
     for pre in chapter_tag.find_all("pre"):
         new_tag = BeautifulSoup(features='lxml').new_tag("span")
@@ -451,6 +520,7 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr
     preprocess_table(chapter_tag)
     preprocess_code_tags(chapter_tag)
     preprocess_pre_tags(chapter_tag)
+    preprocess_block_tags(chapter_tag)
     # 2. class removal
     for tag in chapter_tag.find_all(recursive=True):
         if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']):