diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index e2fe136..627d1f1 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -96,13 +96,11 @@ def _wrap_strings_with_p(chapter_tag): for node in chapter_tag: if isinstance(node, NavigableString): content = str(node) - content = re.sub(r"([\n\t\xa0])", " ", content) - # remove spaces at the beginning and at the end of the string: - content = content.strip() + content = re.sub(r"([\s\xa0])", " ", content).strip() if content: - tag = chapter_tag.new_tag("p") - tag.append(str(node)) - node.replace_with(tag) + p_tag = chapter_tag.new_tag("p") + p_tag.append(str(node)) + node.replace_with(p_tag) def _remove_headings_content(content_tag, title_of_chapter: str): @@ -146,6 +144,7 @@ def _tags_to_correspond_livecarta_tag(chapter_tag): for key in reg_key: tags = chapter_tag.find_all(re.compile(key)) for tag in tags: + # todo can cause appearance of \n

...

->

\n

...

\n

(section) tag.name = to_replace_value def _unwrap_tags(chapter_tag): @@ -300,8 +299,6 @@ def _clean_wiley_block(block): h.insert_before(BeautifulSoup(features="lxml").new_tag("br")) - - def _preprocess_block_tags(chapter_tag: Tag): """Function preprocessing tags""" for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): @@ -323,6 +320,13 @@ def _preprocess_block_tags(chapter_tag: Tag): _wrap_tag_with_table(chapter_tag, future_block, bg_color=color) +def _class_removing(chapter_tag): + for tag in chapter_tag.find_all(recursive=True): + if tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] + + def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: """ Function finalise processing/cleaning content @@ -368,9 +372,6 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro _preprocess_div_tags(content_tag) _preprocess_block_tags(content_tag) - # 5. remove classes that were created by converter - for tag in content_tag.find_all(recursive=True): - if hasattr(tag, "attrs") and tag.attrs.get("class") \ - and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): - del tag.attrs["class"] + # 5. remove classes that weren't created by converter + _class_removing(content_tag) return str(content_tag)