diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 6d04ddc..0002c8e 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -323,7 +323,7 @@ def unwrap_structural_tags(body_tag): x.parent.unwrap() # warning! could reflect on formatting/internal links in some cases parents_marks_are_body = [x.parent == body_tag for x in marks] - assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level.' + assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' _preprocessing_headings(body_tag) @@ -340,16 +340,16 @@ def unwrap_structural_tags(body_tag): return body_tag -def get_tags_between_ids(first_id, href, html_soup): - h_marked = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'}) - if h_marked: - p = h_marked.next_sibling +def get_tags_between_chapter_marks(first_id, href, html_soup): + marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'}) + if marked_tags: + next_tag = marked_tags.next_sibling tags = [] - while p: - if p.name == 'tmp' and p.attrs.get('class') == 'converter-chapter-mark': + while next_tag: + if next_tag.attrs.get('class') == 'converter-chapter-mark': break - tags.append(p) - p = p.next_sibling + tags.append(next_tag) + next_tag = next_tag.next_sibling tags = [tag.extract() for tag in tags] html_soup.smooth() @@ -368,7 +368,7 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr to_remove = [] for child in chapter_tag.contents: if isinstance(child, NavigableString): - s = re.sub(r'([\n\t\xa0])', '', child.string) + s = re.sub(r'([\n\t])', '', child.string) if s == '': to_remove.append(child)