forked from LiveCarta/BookConverter
epub converter: add files
This commit is contained in:
104
src/html_epub_preprocessor.py
Normal file
104
src/html_epub_preprocessor.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
|
||||
|
||||
def preprocess_image():
    """Stub: image preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def preprocess_table():
    """Stub: table preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def preprocess_quote():
    """Stub: quote preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def clean_heading_in_content():
    """Stub: in-content heading cleanup is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def preprocess_footnotes():
    """Stub: footnote preprocessing for EPUB HTML is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def add_fonts():
    """Stub: font injection for the EPUB output is not implemented yet."""
    return None
|
||||
|
||||
|
||||
def unwrap_structural_tags(body_tag):
    """Flatten structural wrapper elements inside *body_tag* in place.

    Three passes:
      1. Unwrap every ``<div>``, ``<section>``, ``<article>``, ``<main>`` and
         nested ``<body>``, promoting their children to the parent level.
      2. Unwrap ``<span>`` tags whose contents are element-only (no direct
         text children).
      3. Wrap stray top-level text nodes in ``<p>`` tags.

    :param body_tag: bs4 element whose subtree is flattened (mutated).
    :return: the same, mutated ``body_tag``.
    """
    # The original code repeated the same find_all/unwrap loop five times
    # (reusing the name `articles` for main/body); one loop over the tag
    # names preserves the exact pass order: div, section, article, main, body.
    for tag_name in ("div", "section", "article", "main", "body"):
        for wrapper in body_tag.find_all(tag_name):
            wrapper.unwrap()

    # Unwrap spans that contain only element children. Spans with any direct
    # NavigableString child are deliberately left alone — not all cases are
    # handled: if a span mixes <p> children with bare text, it won't unwrap.
    for span in body_tag.find_all("span"):
        if not span.string and span.contents:
            if not any(isinstance(child, NavigableString) for child in span.contents):
                span.unwrap()

    # Wrap remaining bare text nodes in <p> so every direct child is a tag.
    for node in body_tag:
        if isinstance(node, NavigableString):
            # Normalized copy used ONLY to decide whether the node is
            # non-empty; the original (un-normalized) text is what gets
            # wrapped, matching the previous behavior.
            content = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
            if content:
                # NOTE(review): new_tag() is defined on BeautifulSoup objects,
                # not plain Tag — this assumes body_tag is the soup itself;
                # confirm against callers.
                p_tag = body_tag.new_tag('p')
                p_tag.append(str(node))
                node.replace_with(p_tag)

    return body_tag
|
||||
|
||||
|
||||
def str2html_soup(html_text: str, element_id=None):
    """Return the serialized HTML of one element, or the input text itself.

    When *element_id* is given, the element with that ``id`` is located in
    the parsed document and its HTML is returned; otherwise the input text
    is returned unchanged.

    :param html_text: raw HTML string to parse.
    :param element_id: optional ``id`` attribute of the element to extract.
    :return: HTML string.
    """
    html_soup = BeautifulSoup(html_text, features='lxml')
    if element_id:
        element = html_soup.find(id=element_id)
        if element is None:
            # Bug fix: find() returns None for a missing id, and str(None)
            # previously produced the literal string "None"; fall back to
            # the original markup instead.
            return html_text
        return str(element)
    # html_text is already a str; the original wrapped it in a redundant str().
    return html_text
|
||||
|
||||
|
||||
def get_tags_between_ids(first_id, href, html_soup):
    """Extract the siblings between an 'internal-mark' heading and the next one.

    Finds the element whose ``id`` is *first_id* and whose class is
    ``internal-mark``, collects every following sibling up to (but not
    including) the next ``<h1 class="internal-mark">``, detaches those
    siblings from the tree, smooths the soup, and returns them.

    :param first_id: id attribute of the starting marker element.
    :param href: original href, used only in the failure message.
    :param html_soup: parsed document to search and mutate.
    :return: list of extracted bs4 nodes.
    """
    marker = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if marker:
        collected = []
        node = marker.next_sibling
        # Truthiness test (not `is not None`) kept on purpose: an empty
        # NavigableString sibling terminates the walk, as before.
        while node:
            # NOTE(review): bs4 usually exposes `class` as a list of tokens,
            # in which case == 'internal-mark' only matches when attrs were
            # set as a plain string — confirm how the marks are produced.
            if node.name == 'h1' and node.attrs.get('class') == 'internal-mark':
                break
            collected.append(node)
            node = node.next_sibling

        # Detach the collected nodes, then merge adjacent strings left behind.
        tags = [item.extract() for item in collected]
        html_soup.smooth()
    else:
        assert 0, f'Warning: no match for {first_id, href}'

    return tags
|
||||
Reference in New Issue
Block a user