forked from LiveCarta/BookConverter
epub converter: add files
This commit is contained in:
104
src/html_epub_preprocessor.py
Normal file
104
src/html_epub_preprocessor.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
|
||||
|
||||
def preprocess_image():
    """Stub: image preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def preprocess_table():
    """Stub: table preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def preprocess_quote():
    """Stub: quote preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def clean_heading_in_content():
    """Stub: in-content heading cleanup is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def preprocess_footnotes():
    """Stub: footnote preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def add_fonts():
    """Stub: font injection for the EPUB output is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def unwrap_structural_tags(body_tag):
    """Flatten structural wrapper elements inside *body_tag* in place.

    Three passes:
      1. Unwrap every ``<div>``, ``<section>``, ``<article>``, ``<main>`` and
         nested ``<body>``, promoting their children to the parent level.
      2. Unwrap ``<span>`` tags whose contents are element-only (no direct
         text children).
      3. Wrap stray top-level text nodes in ``<p>`` tags.

    :param body_tag: bs4 element whose subtree is flattened (mutated).
    :return: the same, mutated ``body_tag``.
    """
    # The original code repeated the same find_all/unwrap loop five times
    # (reusing the name `articles` for main/body); one loop over the tag
    # names preserves the exact pass order: div, section, article, main, body.
    for tag_name in ("div", "section", "article", "main", "body"):
        for wrapper in body_tag.find_all(tag_name):
            wrapper.unwrap()

    # Unwrap spans that contain only element children. Spans with any direct
    # NavigableString child are deliberately left alone — not all cases are
    # handled: if a span mixes <p> children with bare text, it won't unwrap.
    for span in body_tag.find_all("span"):
        if not span.string and span.contents:
            if not any(isinstance(child, NavigableString) for child in span.contents):
                span.unwrap()

    # Wrap remaining bare text nodes in <p> so every direct child is a tag.
    for node in body_tag:
        if isinstance(node, NavigableString):
            # Normalized copy used ONLY to decide whether the node is
            # non-empty; the original (un-normalized) text is what gets
            # wrapped, matching the previous behavior.
            content = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
            if content:
                # NOTE(review): new_tag() is defined on BeautifulSoup objects,
                # not plain Tag — this assumes body_tag is the soup itself;
                # confirm against callers.
                p_tag = body_tag.new_tag('p')
                p_tag.append(str(node))
                node.replace_with(p_tag)

    return body_tag
|
||||
|
||||
|
||||
def str2html_soup(html_text: str, element_id=None):
    """Return the serialized HTML of one element, or the input text itself.

    When *element_id* is given, the element with that ``id`` is located in
    the parsed document and its HTML is returned; otherwise the input text
    is returned unchanged.

    :param html_text: raw HTML string to parse.
    :param element_id: optional ``id`` attribute of the element to extract.
    :return: HTML string.
    """
    html_soup = BeautifulSoup(html_text, features='lxml')
    if element_id:
        element = html_soup.find(id=element_id)
        if element is None:
            # Bug fix: find() returns None for a missing id, and str(None)
            # previously produced the literal string "None"; fall back to
            # the original markup instead.
            return html_text
        return str(element)
    # html_text is already a str; the original wrapped it in a redundant str().
    return html_text
|
||||
|
||||
|
||||
def get_tags_between_ids(first_id, href, html_soup):
    """Extract the siblings between an 'internal-mark' heading and the next one.

    Finds the element whose ``id`` is *first_id* and whose class is
    ``internal-mark``, collects every following sibling up to (but not
    including) the next ``<h1 class="internal-mark">``, detaches those
    siblings from the tree, smooths the soup, and returns them.

    :param first_id: id attribute of the starting marker element.
    :param href: original href, used only in the failure message.
    :param html_soup: parsed document to search and mutate.
    :return: list of extracted bs4 nodes.
    """
    marker = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if marker:
        collected = []
        node = marker.next_sibling
        # Truthiness test (not `is not None`) kept on purpose: an empty
        # NavigableString sibling terminates the walk, as before.
        while node:
            # NOTE(review): bs4 usually exposes `class` as a list of tokens,
            # in which case == 'internal-mark' only matches when attrs were
            # set as a plain string — confirm how the marks are produced.
            if node.name == 'h1' and node.attrs.get('class') == 'internal-mark':
                break
            collected.append(node)
            node = node.next_sibling

        # Detach the collected nodes, then merge adjacent strings left behind.
        tags = [item.extract() for item in collected]
        html_soup.smooth()
    else:
        assert 0, f'Warning: no match for {first_id, href}'

    return tags
|
||||
Reference in New Issue
Block a user