epub converter: add css processing

This commit is contained in:
shirshasa
2021-04-22 17:26:17 +03:00
parent 8f284651c4
commit e0e64a0c38
3 changed files with 229 additions and 31 deletions

View File

@@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
return link
def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
img_tags = body_tag.find_all('img')
for img in img_tags:
@@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag):
'figure', 'footer', 'iframe', 'span', 'p'
]
divs = body_tag.find_all("div")
for div in divs:
for div in body_tag.find_all("div"):
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
@@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag):
continue
div.unwrap()
secs = body_tag.find_all("section")
for s in secs:
for s in body_tag.find_all("section"):
s.unwrap()
articles = body_tag.find_all("article")
for s in articles:
for s in body_tag.find_all("article"):
s.unwrap()
articles = body_tag.find_all("main")
for s in articles:
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
s.unwrap()
articles = body_tag.find_all("body")
for s in articles:
for s in body_tag.find_all("body"):
s.unwrap()
articles = body_tag.find_all("html")
for s in articles:
for s in body_tag.find_all("html"):
s.unwrap()
spans = body_tag.find_all("span")
# Does not cover every case: a span that contains both <p> tags and NavigableString children will not be unwrapped.
for s in spans:
if not s.string and s.contents:
for s in body_tag.find_all("span"):
if s.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if all(is_not_struct_tag):
continue
s.unwrap()
_preprocessing_headings(body_tag)
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
@@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
_process_lists(content_tag)
_preprocessing_headings(content_tag)
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = clean_title_from_numbering(title_str)
return title_str, content_str
return title_str, str(content_tag)