epub converter: fix heading levels

2021-04-30 15:21:35 +03:00
parent d21b11f99a
commit b472c5b9f7
3 changed files with 49 additions and 23 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -1,7 +1,7 @@
 import os
 import pathlib
 import re
-from typing import List
+from typing import List, Tuple

 from bs4 import BeautifulSoup, NavigableString, Tag

@@ -87,7 +87,6 @@ def preprocess_table(body_tag: BeautifulSoup):

        if border_sizes:
            border_size = sum(border_sizes) / len(border_sizes)
-            print(border_size)
            table.attrs['border'] = f'{border_size:.2}'


@@ -108,7 +107,7 @@ def clean_headings_content(content: Tag, title: str):
    for child in content.contents:
        if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
            text = re.sub(r'([\n\t\xa0])', ' ', child.text)
-            text = re.sub(r' +', ' ', text).rstrip()
+            text = re.sub(r' +', ' ', text).strip()
            if title == text:
                child.extract()
            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
@@ -294,29 +293,30 @@ def get_tags_between_ids(first_id, href, html_soup):
    return tags


-def prepare_title_and_content(title, content_tag: BeautifulSoup):
+def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    # 0. cleaning \n
    to_remove = []
-    for child in content_tag.contents:
+    for child in chapter_tag.contents:
        if isinstance(child, NavigableString):
            s = re.sub(r'([\n\t\xa0])', '', child.string)
            if s == '':
                to_remove.append(child)

    [x.extract() for x in to_remove]
-    # 1. rule#1 for heading removal
-    clean_headings_content(content_tag, title_str)
-    _process_lists(content_tag)
-    _preprocessing_headings(content_tag)
-    preprocess_table(content_tag)
+    # 1. heading removal
+    if remove_title_from_chapter:
+        clean_headings_content(chapter_tag, title_str)
+    _process_lists(chapter_tag)
+    _preprocessing_headings(chapter_tag)
+    preprocess_table(chapter_tag)
    # 2. class removal
-    for tag in content_tag.find_all(recursive=True):
+    for tag in chapter_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class'):
            del tag.attrs['class']

    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)
-    return title_str, str(content_tag)
+    return title_str, str(chapter_tag)