From c4c776ea3ebaa12399d4bd65d4df8fecc3f07da7 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Wed, 1 Sep 2021 16:12:35 +0300 Subject: [PATCH] epub converter: add comments removal --- src/html_epub_preprocessor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index cb3972b..9cb6421 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -3,7 +3,7 @@ import pathlib import re from typing import List, Tuple -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag, Comment from access import Access from livecarta_config import LawCartaConfig @@ -538,7 +538,10 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', 'footnote-element']): del tag.attrs['class'] - + # 3. comments removal + comments = chapter_tag.findAll(text=lambda text: isinstance(text, Comment)) + for comment in comments: + comment.extract() # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) title_str = clean_title_from_numbering(title_str) return title_str, str(chapter_tag)