epub converter: add files

This commit is contained in:
shirshasa
2021-04-14 14:29:19 +03:00
parent 4eb30bd80c
commit 880b045de0
3 changed files with 371 additions and 0 deletions

View File

@@ -0,0 +1,104 @@
import re
from bs4 import BeautifulSoup, NavigableString
def preprocess_image():
    """Placeholder, presumably for image preprocessing; nothing is implemented yet."""
def preprocess_table():
    """Placeholder, presumably for table preprocessing; nothing is implemented yet."""
def preprocess_quote():
    """Placeholder, presumably for quote preprocessing; nothing is implemented yet."""
def clean_heading_in_content():
    """Placeholder, presumably for heading clean-up in content; nothing is implemented yet."""
def preprocess_footnotes():
    """Placeholder, presumably for footnote preprocessing; nothing is implemented yet."""
def add_fonts():
    """Placeholder, presumably for adding fonts; nothing is implemented yet."""
def unwrap_structural_tags(body_tag):
    """Flatten structural wrappers in ``body_tag`` and wrap loose text in <p>.

    Three passes run in this order:

    1. Every <div>, <section>, <article>, <main> and <body> found under
       ``body_tag`` is unwrapped, i.e. replaced by its own children.
    2. A <span> is unwrapped only when it has children, its ``.string`` is
       falsy, and none of its direct children is a ``NavigableString``.
       Every other <span> is left untouched.
    3. Each ``NavigableString`` yielded by iterating ``body_tag`` that still
       has text after blanking newlines, tabs and non-breaking spaces is
       replaced by a new <p> holding that string.

    The tree is modified in place and the same ``body_tag`` object is
    returned.
    """
    # <html> is absent from this list: its unwrapping was commented out.
    for tag_name in ("div", "section", "article", "main", "body"):
        for wrapper in body_tag.find_all(tag_name):
            wrapper.unwrap()

    for span in body_tag.find_all("span"):
        if span.string or not span.contents:
            continue
        # A span mixing text with tags is kept so the text stays wrapped.
        has_direct_text = any(
            isinstance(child, NavigableString) for child in span.contents
        )
        if not has_direct_text:
            span.unwrap()

    for node in body_tag:
        if not isinstance(node, NavigableString):
            continue
        # The cleaned copy is used only to decide whether the node is blank.
        visible_text = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
        if visible_text:
            # NOTE(review): new_tag() is a method of the BeautifulSoup object,
            # so body_tag is presumably the soup itself - confirm with callers.
            paragraph = body_tag.new_tag('p')
            # The paragraph receives the node's original, uncleaned text.
            paragraph.append(str(node))
            node.replace_with(paragraph)
    return body_tag
def str2html_soup(html_text: str, element_id=None):
    """Return ``html_text`` as a string, optionally narrowed to one element.

    Args:
        html_text: HTML markup to read.
        element_id: ``id`` attribute of the element to extract. When falsy
            (the default), the input is returned without being parsed.

    Returns:
        ``str(html_text)`` when no ``element_id`` is given. Otherwise the
        serialized markup of the first element whose ``id`` equals
        ``element_id``, or an empty string when no such element exists.
    """
    if not element_id:
        # Nothing to look up, so skip the parse entirely.
        return str(html_text)

    html_soup = BeautifulSoup(html_text, features='lxml')
    element = html_soup.find(id=element_id)
    if element is None:
        # str(None) would leak the literal text "None" into the output.
        return ''
    return str(element)
def _is_internal_mark(node):
    """Tell whether ``node`` is an <h1> carrying the ``internal-mark`` class."""
    # Text nodes may not expose a tag name; treat them as "not a marker".
    if getattr(node, 'name', None) != 'h1':
        return False
    classes = node.attrs.get('class')
    # Parsed HTML stores ``class`` as a list of names, while a value assigned
    # by hand may be a plain string; accept both forms.
    if isinstance(classes, str):
        classes = classes.split()
    return 'internal-mark' in (classes or ())


def get_tags_between_ids(first_id, href, html_soup):
    """Detach and return the nodes that follow a marked element.

    The start marker is the element with ``id == first_id`` and class
    ``internal-mark``. Its following siblings are collected up to, but not
    including, the next <h1> with class ``internal-mark``, or through the
    last sibling when there is no such <h1>. The collected nodes are
    extracted from ``html_soup``, and ``html_soup.smooth()`` is then called
    to merge adjacent strings left behind.

    Args:
        first_id: ``id`` of the start marker.
        href: Used only in the error message.
        html_soup: Parsed document; it is searched and modified in place.

    Returns:
        List of the extracted nodes, in document order.

    Raises:
        AssertionError: No start marker matches ``first_id``.
    """
    h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if h_marked is None:
        # Raised explicitly (not via ``assert``) so the check survives ``-O``.
        raise AssertionError(f'Warning: no match for {first_id, href}')

    collected = []
    node = h_marked.next_sibling
    # Compare against None: an empty text node is falsy but must not end the
    # walk early.
    while node is not None and not _is_internal_mark(node):
        collected.append(node)
        node = node.next_sibling

    # Extract only after the walk so sibling links stay intact while iterating.
    tags = [tag.extract() for tag in collected]
    html_soup.smooth()
    return tags