diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index cb3972b..9cb6421 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -3,7 +3,7 @@ import pathlib import re from typing import List, Tuple -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag, Comment from access import Access from livecarta_config import LawCartaConfig @@ -538,7 +538,10 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', 'footnote-element']): del tag.attrs['class'] - + # 3. comments removal + comments = chapter_tag.findAll(text=lambda text: isinstance(text, Comment)) + for comment in comments: + comment.extract() # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) title_str = clean_title_from_numbering(title_str) return title_str, str(chapter_tag)