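epub converter: fix <pre> handling (use each child's own text, replace the tag in place) and strip comments in unwrap_structural_tags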

2021-09-03 14:02:45 +03:00
parent 60b83ce650
commit e6f8416516
1 changed file with 7 additions and 7 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -302,6 +302,11 @@ def unwrap_structural_tags(body_tag):
        'figure', 'footer', 'iframe', 'span', 'p'
    ]
    # comments removal
    for tag in body_tag.find_all():
        for element in tag(text=lambda text: isinstance(text, Comment)):
            element.extract()
    for div in body_tag.find_all("div"):
        if div.attrs.get('class'):
            div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
@@ -500,7 +505,7 @@ def preprocess_pre_tags(chapter_tag):
        for child in pre.children:
            if isinstance(child, NavigableString):
-                cleaned_text = _prepare_formatted(pre.text)
+                cleaned_text = _prepare_formatted(str(child))
                sub_strings = re.split('\r\n|\n|\r', cleaned_text)
                for string in sub_strings:
                    new_tag.append(NavigableString(string))
@@ -519,8 +524,7 @@ def preprocess_pre_tags(chapter_tag):
        new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
                                 "font-size: 14px; white-space: nowrap;"
-        pre.insert_before(new_tag)
+        pre.replace_with(new_tag)
        pre.extract()
        table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
        p_for_br = chapter_tag.new_tag("p")
        p_for_br.string = "\xa0"
@@ -570,10 +574,6 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                'footnote-element']):
            del tag.attrs['class']
    # 3. comments removal
    for tag in chapter_tag.find_all():
        for element in tag(text=lambda text: isinstance(text, Comment)):
            element.extract()
    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)
    return title_str, str(chapter_tag)