forked from LiveCarta/BookConverter
epub converter: add css processing
This commit is contained in:
@@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
|
||||
return link
|
||||
|
||||
|
||||
def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
|
||||
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
|
||||
img_tags = body_tag.find_all('img')
|
||||
|
||||
for img in img_tags:
|
||||
@@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag):
|
||||
'figure', 'footer', 'iframe', 'span', 'p'
|
||||
]
|
||||
|
||||
divs = body_tag.find_all("div")
|
||||
for div in divs:
|
||||
for div in body_tag.find_all("div"):
|
||||
if div.contents:
|
||||
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
|
||||
if all(is_not_struct_tag):
|
||||
@@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag):
|
||||
continue
|
||||
div.unwrap()
|
||||
|
||||
secs = body_tag.find_all("section")
|
||||
for s in secs:
|
||||
for s in body_tag.find_all("section"):
|
||||
s.unwrap()
|
||||
|
||||
articles = body_tag.find_all("article")
|
||||
for s in articles:
|
||||
for s in body_tag.find_all("article"):
|
||||
s.unwrap()
|
||||
|
||||
articles = body_tag.find_all("main")
|
||||
for s in articles:
|
||||
for s in body_tag.find_all("aside"):
|
||||
s.name = 'blockquote'
|
||||
|
||||
for s in body_tag.find_all("main"):
|
||||
s.unwrap()
|
||||
|
||||
articles = body_tag.find_all("body")
|
||||
for s in articles:
|
||||
for s in body_tag.find_all("body"):
|
||||
s.unwrap()
|
||||
|
||||
articles = body_tag.find_all("html")
|
||||
for s in articles:
|
||||
for s in body_tag.find_all("html"):
|
||||
s.unwrap()
|
||||
|
||||
spans = body_tag.find_all("span")
|
||||
# not all cases, if span has <p>s and NavigableString, it won't unwrap
|
||||
for s in spans:
|
||||
if not s.string and s.contents:
|
||||
for s in body_tag.find_all("span"):
|
||||
if s.contents:
|
||||
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
|
||||
if all(is_not_struct_tag):
|
||||
continue
|
||||
s.unwrap()
|
||||
|
||||
_preprocessing_headings(body_tag)
|
||||
|
||||
for node in body_tag:
|
||||
if isinstance(node, NavigableString):
|
||||
content = str(node)
|
||||
@@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
||||
_process_lists(content_tag)
|
||||
_preprocessing_headings(content_tag)
|
||||
|
||||
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||
title_str = clean_title_from_numbering(title_str)
|
||||
return title_str, content_str
|
||||
return title_str, str(content_tag)
|
||||
|
||||
Reference in New Issue
Block a user