epub converter: add headers and image processing

This commit is contained in:
shirshasa
2021-04-15 14:45:41 +03:00
parent 880b045de0
commit 5e58cb3d92
2 changed files with 104 additions and 30 deletions

View File

@@ -1,6 +1,7 @@
import codecs
import json
import re
import os
from collections import defaultdict
from typing import Dict, Union
@@ -8,17 +9,31 @@ import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from ebooklib.epub import Link, Section
from ebooklib.utils import debug
from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
preprocess_image
# todo: https://docs.python.org/3/howto/unicode.html
class EpubBookAdapter:
def __init__(self, file):
self.file = file
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
self.href2img_bytes = {}
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
debug(x)
file_name = x.file_name
content = x.content
# todo: check how the image file path is computed inside ebooklib
self.href2img_bytes[file_name] = content
self.id_anchor_exist_in_nav_points = False
self.href2soup_html = self.build_href2soup_content()
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
# if content.opf has a toc attribute in its spine -> the ncx file can be located -> navMap extracted from it
# if it is absent, a nav tag is searched for in the manifest -> EpubNav; this is epub3 behaviour (not tested, todo)
self.href2ids = defaultdict(list)
@@ -164,11 +179,13 @@ class EpubBookAdapter:
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
title = node.title
if node.id:
content = self.id_anchor2soup[(node.href, node.id)]
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
else:
content = self.href2soup_html[node.href]
content_preprocessed = str(content) # todo self.preprocess_html(content, node.id)
content_preprocessed = re.sub(r'([\n\t\xa0])', ' ', content_preprocessed)
content: BeautifulSoup = self.href2soup_html[node.href]
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
sub_nodes = []
# warning: items that are not EpubHtmlItems won't be added to the chapter
if self.adjacency_list.get(node):
@@ -196,9 +213,6 @@ if __name__ == "__main__":
"content": l
}
output_file = open('output.out', 'w')
output_file.write(str(tmp))
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
json.dump(tmp, f, ensure_ascii=False)

View File

@@ -1,10 +1,47 @@
import os
import pathlib
import re
from bs4 import BeautifulSoup, NavigableString
from src.access import Access
def preprocess_image():
pass
def save_image_locally(img_file_path, img_content, book_id):
    """Write image bytes under ``<project-root>/json/img_<book_id>/`` and return the path.

    Args:
        img_file_path: original image path inside the epub; only its basename is kept.
        img_content: raw image bytes to persist.
        book_id: identifier used to name the per-book image folder.

    Returns:
        pathlib.Path of the file that was written.
    """
    # project root = parent of the directory containing this module
    root = pathlib.Path(__file__).resolve().parent.parent
    target_dir = root / f'json/img_{book_id}'
    # parents=True: a missing 'json/' folder no longer raises FileNotFoundError
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / os.path.basename(img_file_path)
    # context-managed write instead of bare open/close (original leaked the
    # handle if write() raised)
    with open(target_path, 'wb') as fh:
        fh.write(img_content)
    return target_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload image bytes via the given Access client and return the resulting link."""
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
def preprocess_image(body_tag, href2img_content, path_to_html, access=None):
    """Rewrite every ``<img src>`` in *body_tag* to point at a persisted copy.

    Each image referenced by the chapter HTML is looked up in the manifest
    mapping, saved either to AWS (when an Access client is given) or to a
    local folder, and the tag's ``src`` attribute is updated in place.

    Args:
        body_tag: BeautifulSoup tag whose <img> descendants are rewritten.
        href2img_content: mapping of manifest-relative image path -> raw bytes.
        path_to_html: path of the HTML file, used to resolve relative src values.
        access: optional Access client; ``None`` means images are saved locally.

    Raises:
        KeyError: if a referenced image is missing from *href2img_content*.
    """
    # loop-invariant: the html's folder does not change per image
    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all('img'):
        src = img.attrs.get('src')
        if not src:
            # <img> without a src attribute: nothing to resolve, leave as-is
            # (original crashed with TypeError on os.path.join(None))
            continue
        img_key = os.path.normpath(os.path.join(html_folder, src))
        if img_key not in href2img_content:
            # raise instead of assert: assert statements vanish under `python -O`
            raise KeyError(
                f'Image {src} in file {path_to_html} was not added to manifest.')
        img_bytes = href2img_content[img_key]
        # todo: pass the real book id instead of the 'book_id' placeholder
        if access is not None:
            new_location = save_image_to_aws(access, img_key, img_bytes, 'book_id')
        else:
            new_location = save_image_locally(img_key, img_bytes, 'book_id')
        img.attrs['src'] = str(new_location)
def preprocess_table():
@@ -15,8 +52,12 @@ def preprocess_quote():
pass
def clean_heading_in_content():
pass
def clean_heading_in_content(content, title: str):
    """Detach the child of *content* whose visible text equals the chapter title.

    Direct children are scanned in order; children whose text is empty after
    stripping newlines, tabs and non-breaking spaces are ignored. The first
    child whose text matches *title* exactly is extracted from the tree.
    """
    for node in content.contents:
        text = node.text
        # skip children that are blank once whitespace-like chars are removed
        if not text or not re.sub(r'([\n\t\xa0])', '', text):
            continue
        if text == title:
            node.extract()  # remove the heading duplicated from the title
            break
def preprocess_footnotes():
@@ -28,8 +69,18 @@ def add_fonts():
def unwrap_structural_tags(body_tag):
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span'
]
divs = body_tag.find_all("div")
for div in divs:
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
div.unwrap()
secs = body_tag.find_all("section")
@@ -48,19 +99,18 @@ def unwrap_structural_tags(body_tag):
for s in articles:
s.unwrap()
# articles = body_tag.find_all("html")
# for s in articles:
# s.unwrap()
articles = body_tag.find_all("html")
for s in articles:
s.unwrap()
spans = body_tag.find_all("span")
# does not cover all cases: a span containing both <p> tags and NavigableStrings is not unwrapped
for s in spans:
if not s.string and s.contents:
is_string = [isinstance(child, NavigableString) for child in s.contents]
if any(is_string):
pass
else:
s.unwrap()
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if all(is_not_struct_tag):
continue
s.unwrap()
for node in body_tag:
if isinstance(node, NavigableString):
@@ -75,15 +125,6 @@ def unwrap_structural_tags(body_tag):
return body_tag
def str2html_soup(html_text: str, element_id=None):
    """Return the stringified element with *element_id*, or the raw input text.

    NOTE(review): when element_id is falsy the *unparsed* input string is
    returned, not the parsed soup — presumably intentional; confirm. When
    element_id is given but absent, find() yields None and 'None' is returned.
    """
    soup = BeautifulSoup(html_text, features='lxml')
    if not element_id:
        return str(html_text)
    return str(soup.find(id=element_id))
def get_tags_between_ids(first_id, href, html_soup):
h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
if h_marked:
@@ -102,3 +143,22 @@ def get_tags_between_ids(first_id, href, html_soup):
assert 0, f'Warning: no match for {first_id, href}'
return tags
def prepare_title_and_content(title, content: BeautifulSoup):
    """Normalize a chapter title and its content tree for serialization.

    Steps: drop direct children of *content* that are pure whitespace, remove
    the heading duplicated from *title* (rule #1), then collapse newlines,
    tabs and non-breaking spaces to single spaces in both result strings.

    Args:
        title: raw chapter title, possibly containing HTML markup.
        content: BeautifulSoup tree of the chapter body; modified in place.

    Returns:
        ``(title_str, content_str)`` tuple of cleaned plain strings.
    """
    # compile once instead of three separate re.sub pattern lookups
    whitespace_re = re.compile(r'[\n\t\xa0]')
    parsed_title = BeautifulSoup(title, features='lxml').string
    if parsed_title is None:
        # .string is None when the title parses to more than one element;
        # fall back to the concatenated text instead of crashing below
        parsed_title = BeautifulSoup(title, features='lxml').get_text()
    # 0. collect then drop children that are nothing but whitespace
    to_remove = [
        child for child in content.contents
        if isinstance(child, NavigableString)
        and whitespace_re.sub('', child.string) == ''
    ]
    for child in to_remove:
        child.extract()  # plain loop: extract() is called for its side effect
    # 1. rule #1 for heading removal
    clean_heading_in_content(content, parsed_title)
    content_str = whitespace_re.sub(' ', str(content))
    title_str = whitespace_re.sub(' ', parsed_title)
    return title_str, content_str