From e6f84165165956f9b9272f4d700e59e622d3caa4 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Fri, 3 Sep 2021 14:02:45 +0300 Subject: [PATCH] epub converter: fix --- src/html_epub_preprocessor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 10714cf..265ae61 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -302,6 +302,11 @@ def unwrap_structural_tags(body_tag): 'figure', 'footer', 'iframe', 'span', 'p' ] + # comments removal + for tag in body_tag.find_all(): + for element in tag(text=lambda text: isinstance(text, Comment)): + element.extract() + for div in body_tag.find_all("div"): if div.attrs.get('class'): div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0] @@ -500,7 +505,7 @@ def preprocess_pre_tags(chapter_tag): for child in pre.children: if isinstance(child, NavigableString): - cleaned_text = _prepare_formatted(pre.text) + cleaned_text = _prepare_formatted(str(child)) sub_strings = re.split('\r\n|\n|\r', cleaned_text) for string in sub_strings: new_tag.append(NavigableString(string)) @@ -519,8 +524,7 @@ def preprocess_pre_tags(chapter_tag): new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \ "font-size: 14px; white-space: nowrap;" - pre.insert_before(new_tag) - pre.extract() + pre.replace_with(new_tag) table = wrap_preformatted_span_with_table(chapter_tag, new_tag) p_for_br = chapter_tag.new_tag("p") p_for_br.string = "\xa0" @@ -570,10 +574,6 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', 'footnote-element']): del tag.attrs['class'] - # 3. comments removal - for tag in chapter_tag.find_all(): - for element in tag(text=lambda text: isinstance(text, Comment)): - element.extract() # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) title_str = clean_title_from_numbering(title_str) return title_str, str(chapter_tag)