epub converter: add css processing

2021-04-22 17:26:17 +03:00
parent 8f284651c4
commit e0e64a0c38
3 changed files with 229 additions and 31 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
    return link


-def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
+def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
    img_tags = body_tag.find_all('img')

    for img in img_tags:
@@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag):
        'figure', 'footer', 'iframe', 'span', 'p'
    ]

-    divs = body_tag.find_all("div")
-    for div in divs:
+    for div in body_tag.find_all("div"):
        if div.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
            if all(is_not_struct_tag):
@@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag):
                continue
        div.unwrap()

-    secs = body_tag.find_all("section")
-    for s in secs:
+    for s in body_tag.find_all("section"):
        s.unwrap()

-    articles = body_tag.find_all("article")
-    for s in articles:
+    for s in body_tag.find_all("article"):
        s.unwrap()

-    articles = body_tag.find_all("main")
-    for s in articles:
+    for s in body_tag.find_all("aside"):
+        s.name = 'blockquote'
+
+    for s in body_tag.find_all("main"):
        s.unwrap()

-    articles = body_tag.find_all("body")
-    for s in articles:
+    for s in body_tag.find_all("body"):
        s.unwrap()

-    articles = body_tag.find_all("html")
-    for s in articles:
+    for s in body_tag.find_all("html"):
        s.unwrap()

-    spans = body_tag.find_all("span")
    # not all cases, if span has <p>s and NavigableString, it won't unwrap
-    for s in spans:
-        if not s.string and s.contents:
+    for s in body_tag.find_all("span"):
+        if s.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
            if all(is_not_struct_tag):
                continue
        s.unwrap()

+    _preprocessing_headings(body_tag)
+
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
@@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
    _process_lists(content_tag)
    _preprocessing_headings(content_tag)

-    content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
+    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)
-    return title_str, content_str
+    return title_str, str(content_tag)