This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/epub_converter/html_epub_preprocessor.py

638 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
import pathlib
from typing import Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.access import Access
from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
    """Save one image's bytes under ../json/img_<book_id>/ relative to this package.

    :param img_file_path: original image path; only its basename is reused
    :param img_content: raw image bytes
    :param book_id: id used to name the per-book image folder
    :return: pathlib.Path of the written file
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f'../json/img_{book_id}/'))
    # parents=True so a missing ../json folder does not break the first save
    new_path.mkdir(parents=True, exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager guarantees the handle is closed even if write() raises
    with open(new_img_path, 'wb') as f:
        f.write(img_content)
    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload one image through the Access client and return its remote link."""
    return access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
def update_images_src_links(body_tag: Tag,
                            href2img_content: dict,
                            path_to_html,
                            access=None,
                            path2aws_path=None,
                            book_id=None):
    """Point every <img src> in body_tag at its persisted copy (AWS or local).

    :param body_tag: soup tag whose <img> elements are rewritten in place
    :param href2img_content: mapping of root-relative image path -> raw bytes
    :param path_to_html: path of the html file the images are referenced from
    :param access: optional Access client; when given, images are sent to AWS
    :param path2aws_path: cache image_src_path -> AWS path, filled as we go
    :param book_id: id of the book the images belong to
    :return: the (possibly updated) path2aws_path cache
    """
    if access is not None and path2aws_path is None:
        # without this guard the `in` test below raises TypeError on None
        path2aws_path = {}
    # loop-invariant: all images are resolved relative to the html's folder
    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace('\\', '/')
        assert path_to_img_from_root in href2img_content, \
            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                new_folder = save_image_to_aws(
                    access, path_to_img_from_root, img_content, book_id)
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            # bug fix: the literal string 'book_id' was passed here, so every
            # book's images landed in the same img_book_id folder
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, book_id)
        img.attrs['src'] = str(new_folder)
        # epub-fixed sizes/styles would fight livecarta's own layout
        for attr in ('width', 'height', 'style'):
            if img.attrs.get(attr):
                del img.attrs[attr]
    return path2aws_path
def preprocess_table(body_tag: BeautifulSoup):
    """Normalize tables: carry px widths from inline style into the width
    attribute of td/th/tr cells, strip 'border:0;' styles, and force a
    visible border on borderless tables."""
    cell_pattern = re.compile("td|th|tr")
    for table in body_tag.find_all("table"):
        for cell in table.find_all(cell_pattern):
            style = cell.get('style')
            width = ''
            if style:
                # "[^-]" avoids matching e.g. "max-width"/"min-width"
                match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
                if match:
                    # pt values are relabelled px without conversion, as before
                    width = match.group(1) + 'px'
            cell.attrs['width'] = cell.get('width') or width
            if cell.attrs.get('style'):
                cell.attrs['style'] = cell.attrs['style'].replace('border:0;', '')
            elif cell.attrs.get('style') == '':
                del cell.attrs['style']
        if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
            table.attrs['border'] = '1'
def process_lists(body_tag):
    """Unwrap a leading <p> inside each <li>, merging its attributes into the <li>."""
    for item in body_tag.find_all("li"):
        inner_p = item.p
        if inner_p:
            item.attrs.update(inner_p.attrs)
            inner_p.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Insert a placeholder <span> before `tag`, preserving its id/class so
    internal links survive when the tag itself is removed (unsupported by livecarta)."""
    placeholder = main_tag.new_tag("span")
    placeholder.attrs['id'] = id_ or ''
    placeholder.attrs['class'] = class_ or ''
    placeholder.string = "\xa0"
    tag.insert_before(placeholder)
def clean_headings_content(content: Tag, title: str):
    """Remove the chapter-title heading (or exact-title text node) from `content`,
    first planting placeholder spans so ids used as link targets survive.

    :param content: chapter soup whose top-level children are scanned
    :param title: chapter title, compared case-insensitively
    """
    def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
        # ids on the removed tag and on its descendants are moved onto spans
        # inserted just before it, keeping internal anchors resolvable
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs.get(
                                                  'id'),
                                              class_=tag_to_be_removed.attrs.get('class'))
        for sub_tag in tag_to_be_removed.find_all():
            if sub_tag.attrs.get('id'):
                insert_span_with_attrs_before_tag(body_tag,
                                                  tag_to_be_removed,
                                                  id_=sub_tag.attrs['id'],
                                                  class_=sub_tag.attrs.get('class'))
    title = title.lower()
    # bug fix: iterate over a snapshot — content.contents is a live list, and
    # extract() inside the loop made iteration skip the next sibling
    for child in list(content.contents):
        if isinstance(child, NavigableString):
            text = child
        else:
            text = child.text
        if text and re.sub(r'([\n\t\xa0])', '', text):
            # normalize whitespace/nbsp before comparing against the title
            text = re.sub(r'([\n\t\xa0])', ' ', text)
            text = re.sub(r' +', ' ', text).strip()
            text = text.lower()
        if title == text:
            add_span_to_save_ids_for_links(child, content)
            child.extract()
        elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
            add_span_to_save_ids_for_links(child, content)
            child.extract()
            # the first heading containing the title ends the scan
            break
def heading_tag_to_p_tag(body_tag):
    """Demote headings below livecarta's supported depth to plain <p> tags."""
    min_unsupported = LiveCartaConfig.SUPPORTED_LEVELS + 1
    unsupported_headings = re.compile(f'^h[{min_unsupported}-9]$')
    for heading in body_tag.find_all(unsupported_headings):
        heading.name = 'p'
def clean_title_from_numbering(title: str):
    """Strip leading whitespace plus decimal ("1.2 ") or lettered ("A. B. ")
    numbering prefixes from a chapter title."""
    without_ws = re.sub(r'^(\s+)+', '', title)
    without_digits = re.sub(r'^(?:\.?\d+\.? ?)+', '', without_ws)
    # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
    # Roman-numeral stripping above stays disabled; letters like "A. " are removed
    return re.sub(r'^(?:[A-Za-z]\. ?)+', '', without_digits)
def replace_with_livecarta_anchor_tag(anchor, i):
    """Swap a noteref anchor for livecarta's <sup class="footnote-element"> marker.

    :param anchor: the <a> noteref tag to replace
    :param i: zero-based footnote index; rendered ids are 1-based
    :return: the new marker tag
    """
    number = i + 1
    marker = BeautifulSoup(features='lxml').new_tag('sup')
    marker['class'] = 'footnote-element'
    marker['data-id'] = number
    marker['id'] = f'footnote-{number}'
    marker.string = '*'
    # an existing <sup> around the anchor would nest sup-in-sup; flatten it first
    if anchor.parent.name == 'sup':
        anchor.parent.unwrap()
    anchor.replace_with(marker)
    return marker
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
        -> Tuple[list, list, list]:
    """
    Resolve epub noterefs to their footnote bodies and replace the anchors
    with livecarta footnote markers.

    Must run before fonts are added in the pipeline. Expected markup:
    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    :param source_html_tag: soup of the file containing the noteref anchors
    :param href2soup_html: mapping file name -> soup, for cross-file hrefs
    :param noteref_attr_name: attribute marking noterefs (default 'epub:type')
    :return: (footnote texts, new marker tags, footnote body tags)
    """
    footnotes = []
    noterefs_tags = source_html_tag.find_all(
        attrs={noteref_attr_name: 'noteref'})
    # noterefs without an href cannot be resolved; they are dropped below
    bad_noterefs_tags = set(
        [tag for tag in noterefs_tags if not tag.attrs.get('href')])
    noterefs_tags = [
        tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
    new_noterefs_tags = []
    new_footnotes_tags = []
    # side-effect comprehension: remove the unusable noterefs from the tree
    [tag.decompose() for tag in bad_noterefs_tags]

    def parse_a_tag_href(s: str) -> Tuple[str, str]:
        """ Returns name of file & id of an anchor """
        assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
        f, id_ = s.split('#')
        return f, id_

    def verify_footnote_tag(tags: list):
        """Validate candidate footnote tags for the current noteref.

        NOTE(review): closes over `href`, `target_html_tag`, `element_id` and
        `noteref_tag` from the enclosing loop — only valid when called from it.
        """
        assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
        if len(tags) == 0:
            # no tag carried an epub:type footnote marker; fall back to any
            # element with the anchored id and warn
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if len(anchored_tags):
                print(
                    f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
                return anchored_tags
            else:
                assert 0, f'Error, No element with id: {href} found.'
        return tags

    for i, noteref_tag in enumerate(noterefs_tags):
        href = noteref_tag.attrs['href']
        file, element_id = parse_a_tag_href(href)
        if not file:
            # href like "#n1": the footnote lives in the same file
            target_html_tag = source_html_tag
        else:
            target_html_tag = href2soup_html.get(file)
        if not target_html_tag:
            print(
                f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
            continue
        possible_footnote = 'note|footnote|endnote|rearenote'
        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
                                                               attrs={'epub:type': re.compile(possible_footnote)}))
        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
        footnote_tag = expected_footnote_tags[0]
        # for doc-endnote wrappers the whole <li>/<div> parent is the note body
        if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
            footnote_tag = footnote_tag.parent
        new_noterefs_tags.append(
            replace_with_livecarta_anchor_tag(noteref_tag, i))
        content = footnote_tag.text
        # footnote_tag.decompose()
        footnotes.append(content)
        # prefer the backlink element as the tag to keep, if one exists
        footnote_tag = footnote_tag.find(
            attrs={'role': 'doc-backlink'}) or footnote_tag
        new_footnotes_tags.append(footnote_tag)
    return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag):
    """
    Main function that works with structure of html.
    Make changes inplace.
    1. Extracts tags that are not needed
    2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
    Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
    This tag must have a body_tag as a parent.
    Otherwise, it is wrapped with some tags. Like:
    <p> <span id='123', class='converter-chapter-mark'> </span> </p>
    3. Headings that are not supported by livecarta converts to <p>
    4. Wrapping NavigableString
    :param body_tag: Tag, soup object
    :return: body_tag (also modified in place)
    """
    def _preserve_class_in_aside_tag(tag_):
        """ to save css style inherited from class, copy class to aside tag (which is parent to tag_) """
        # this is for Wiley books with boxes
        # NOTE(review): this helper is never called in the visible code
        tag_class = tag_.attrs['class'] if not isinstance(
            tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if tag_.parent.name == 'aside':
            if not tag_.parent.attrs.get('class'):
                tag_.parent.attrs['class'] = tag_class

    def preserve_class_in_section_tag(tag_) -> bool:
        """
        to save css style inherited from class, copy class to child <p>
        returns True, if <section> could be unwrapped
        """
        # this is for Wiley books with boxes
        tag_class = tag_.attrs['class'] if not isinstance(
            tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if 'feature' not in tag_class:
            return True
        child_p_tags = tag_.find_all("p")
        if len(child_p_tags) == 1:
            # a single <p> can carry the feature class; the section may go
            child_p_tag = child_p_tags[0]
            if not child_p_tag.attrs.get('class'):
                child_p_tag.attrs['class'] = tag_class
            return True
        elif len(child_p_tags) > 1:
            # several <p>s: keep the section itself as a <p>, do not unwrap
            tag_.name = 'p'
            return False
        else:
            return True

    def add_span_to_save_ids_for_links(tag_to_be_removed):
        # keep link targets alive: the id moves onto a placeholder span
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs['id'],
                                              class_=tag_to_be_removed.attrs.get('class'))

    def replace_div_tag_with_table():
        """Function replace <div> with <table>:
        1. Convert div with certain classes to tables
        2. Add background color to div with background-color
        """
        for div in body_tag.find_all("div"):
            if div.attrs.get('class'):
                div_class = div.attrs['class'] if not isinstance(
                    div.attrs['class'], list) else div.attrs['class'][0]
                # NOTE(review): C409/C441/... look like publisher-specific box
                # classes — confirm against the source epubs
                if div_class in ['C409', 'C409a']:
                    wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
                elif div_class in ['C441', 'C816']:
                    wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
            if div.attrs.get('style'):
                if 'background-color' in div.attrs['style']:
                    # slice out the 7-char color value after "background-color: "
                    end_index = div.attrs['style'].find(
                        'background-color') + len('background-color')
                    start_index_of_color = end_index + 2
                    bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
                    wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
            elif div.attrs.get('style') == '':
                del div.attrs['style']
            structural_tags_names = [
                'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
                'figure', 'footer', 'iframe', 'span', 'p'
            ]
            if div.contents:
                # a div holding only non-structural children becomes a <p>
                is_not_struct_tag = [
                    child.name not in structural_tags_names for child in div.contents]
                if all(is_not_struct_tag):
                    div.name = 'p'
                    continue
            add_span_to_save_ids_for_links(div)
            div.unwrap()

    # comments removal
    for tag in body_tag.find_all():
        for element in tag(text=lambda text: isinstance(text, Comment)):
            element.extract()
    replace_div_tag_with_table()
    for s in body_tag.find_all("section"):
        could_be_unwrapped = True
        if s.attrs.get('class'):
            could_be_unwrapped = preserve_class_in_section_tag(s)
        add_span_to_save_ids_for_links(s)
        if could_be_unwrapped:
            s.unwrap()
    for s in body_tag.find_all("article"):
        add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("figure"):
        s.name = 'p'
        # to center image inside this tag
        s.attrs['style'] = "text-align: center;"
    for s in body_tag.find_all("figcaption"):
        add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("aside"):
        s.name = 'blockquote'
    for s in body_tag.find_all("main"):
        add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("body"):
        add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("html"):
        add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("header"):
        s.name = 'span'
    # check marks for chapter starting are on the same 1 level
    marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
    parents_marks_are_body = [x.parent == body_tag for x in marks]
    # fix marks to be on 1 level
    if not all(parents_marks_are_body):
        for x in marks:
            while x.parent != body_tag:
                x.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases
        parents_marks_are_body = [x.parent == body_tag for x in marks]
    assert all(
        parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
    heading_tag_to_p_tag(body_tag)
    # wrap NavigableString with <p>
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
            content = re.sub(r'([\n\t\xa0])', ' ', content)
            content = content.strip()
            if content:
                tag = body_tag.new_tag('p')
                tag.append(str(node))
                node.replace_with(tag)
    return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup):
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted
    :param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
    :param href: name of current chapter's file
    :param html_soup: soup object of current file
    :return: list [Tag, NavigableString]; chapter's tags
    :raises AssertionError: when no mark with first_id exists in html_soup
    """
    marked_tags = html_soup.find(
        attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if not marked_tags:
        # bug fix: was `assert 0, ...` — stripped under `python -O`, which then
        # fell through to `return tags` and raised a confusing NameError
        raise AssertionError(f'Warning: no match for {first_id, href}')
    tags = []
    next_tag = marked_tags.next_sibling
    # collect siblings until the next chapter mark (or end of document)
    while next_tag:
        # NOTE(review): bs4 usually stores `class` as a list, so this string
        # equality only matches single-valued class attrs — confirm parser setup
        if not isinstance(next_tag, NavigableString) and \
                (next_tag.attrs.get('class') == 'converter-chapter-mark'):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """Wrap old_tag in a centered single-cell table (table>tbody>tr>td) and
    append a <br> after it; returns the new table tag."""
    cell = main_tag.new_tag("td")
    if bg_color:
        cell.attrs['bgcolor'] = bg_color
    row = main_tag.new_tag("tr")
    section = main_tag.new_tag("tbody")
    table = main_tag.new_tag("table")
    table.attrs['border'] = border
    table.attrs['align'] = 'center'
    table.attrs['style'] = f'width:{width}%;'
    # wrap inside-out: td around the tag, then tr, tbody, table
    old_tag.wrap(cell)
    cell.wrap(row)
    row.wrap(section)
    section.wrap(table)
    table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
    return table
def clean_wiley_block(block):
    """Strip Wiley '... hr' rule paragraphs from a feature box and demote its
    heading to a <p> preceded by a <br>."""
    for rule in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
        rule.extract()
    heading = block.find(re.compile("h[1-9]"))
    # bs4 truthiness: an empty heading tag is falsy and is left untouched
    if heading:
        heading.name = "p"
        heading.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def preprocess_block_tags(chapter_tag):
    """Render Wiley feature1..feature4 boxes (as <blockquote> or <p>) as
    shaded single-cell tables; feature1/feature2 get fixed grey backgrounds."""
    for block in chapter_tag.find_all("blockquote"):
        # NOTE(review): bs4 usually stores `class` as a list; this membership
        # test only matches when the attr is a plain string — confirm
        if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
            clean_wiley_block(block)
            color = '#DDDDDD' if block.attrs.get(
                'class') == 'feature1' else None
            color = '#EEEEEE' if block.attrs.get(
                'class') == 'feature2' else color
            wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
            block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
            # the blockquote itself is dissolved; the table now frames the box
            block.unwrap()
    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
        clean_wiley_block(future_block)
        color = '#DDDDDD' if future_block.attrs.get(
            'class') == 'feature1' else None
        color = '#EEEEEE' if future_block.attrs.get(
            'class') == 'feature2' else color
        wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
def prepare_formatted(text):
    """Prepare preformatted text for inline rendering: tabs and spaces become
    non-breaking spaces; other substitutions are kept exactly as before."""
    # order matters: tab expansion inserts plain spaces that the later
    # space->nbsp pass then converts
    replacements = (
        ("<", "\x3C"),             # NOTE: '\x3C' IS '<' — a no-op, preserved as-is
        (">", "\x3E"),             # likewise a no-op for '>'
        ('\t', "\xa0 \xa0 "),      # &nbsp; &nbsp;
        (' ', "\xa0"),
        ('𝑓', "\xf0\x9d\x91\x93"),  # NOTE: replacement looks like mojibake, preserved as-is
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def wrap_preformatted_span_with_table(main_tag, old_tag):
    """Wrap a preformatted <span> in a full-width, grey single-cell table."""
    cell = main_tag.new_tag("td")
    cell.attrs['bgcolor'] = '#f5f5f5'
    row = main_tag.new_tag("tr")
    section = main_tag.new_tag("tbody")
    table = main_tag.new_tag("table")
    table.attrs['border'] = '1px #ccc;'
    table.attrs['style'] = 'width:100%;'
    # wrap inside-out: td around the span, then tr, tbody, table
    old_tag.wrap(cell)
    cell.wrap(row)
    row.wrap(section)
    section.wrap(table)
    return table
def preprocess_pre_tags(chapter_tag):
    """Replace each <pre> with a monospace <span> wrapped in a grey table,
    escaping its text via prepare_formatted and inserting <br> line breaks."""
    for pre in chapter_tag.find_all("pre"):
        new_tag = BeautifulSoup(features='lxml').new_tag("span")
        new_tag.attrs = pre.attrs.copy()
        spans = pre.find_all("span")
        # if in <pre> there are multiple <span>, we need to add <br> after each content
        to_add_br = len(spans) > 1
        # NOTE(review): child.extract() below mutates pre's children while this
        # generator is being consumed — may skip siblings; confirm on real input
        for child in pre.children:
            if isinstance(child, NavigableString):
                cleaned_text = prepare_formatted(str(child))
                # each physical line of the pre text gets its own <br>
                sub_strings = re.split('\r\n|\n|\r', cleaned_text)
                for string in sub_strings:
                    new_tag.append(NavigableString(string))
                    new_tag.append(BeautifulSoup(
                        features='lxml').new_tag('br'))
            else:
                # tag child: escape its text nodes in place, then move it over
                for sub_child in child.children:
                    if isinstance(sub_child, NavigableString):
                        cleaned_text2 = prepare_formatted(str(sub_child))
                        sub_child.replace_with(NavigableString(cleaned_text2))
                    else:
                        sub_child.string = prepare_formatted(sub_child.text)
                cleaned_tag = child.extract()
                new_tag.append(cleaned_tag)
                if to_add_br:
                    new_tag.append(BeautifulSoup(
                        features='lxml').new_tag('br'))
        new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
                                 "font-size: 14px; white-space: nowrap;"
        pre.replace_with(new_tag)
        table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
        # trailing nbsp paragraph gives visual spacing after the code box
        p_for_br = chapter_tag.new_tag("p")
        p_for_br.string = "\xa0"
        table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag):
    """ Function that emulates style of <code>, <kbd>, <var> """
    # bug fix: 'kbd' was misspelled 'kdb' (not an HTML element), so keyboard
    # tags were never restyled; 'kdb' is kept too in case source markup has
    # the same typo
    for code in chapter_tag.find_all(re.compile("code|kbd|kdb|var")):
        code.name = 'span'
        # inside <pre>, monospace styling is applied by preprocess_pre_tags
        if code.parent.name == "pre":
            continue
        code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title(title_of_chapter: str) -> str:
    """Finalise a chapter title: strip markup, collapse whitespace, drop numbering.

    :param title_of_chapter: title text, possibly containing html markup
    :return: cleaned plain-text title
    """
    soup = BeautifulSoup(title_of_chapter, features='lxml')
    # bug fix: .string is None when the parsed title has more than one child
    # node, which crashed re.sub below; fall back to get_text()
    title_str = soup.string or soup.get_text()
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    title_str = clean_title_from_numbering(title_str)
    return title_str
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content
    0. removing whitespace-only text nodes
    1. heading removal
    2. processing tags
    3. class removal

    :param title_str: cleaned chapter title (used for heading removal)
    :param content_tag: chapter soup, mutated in place
    :param remove_title_from_chapter: drop the title heading from the body
    :return: the serialized chapter html
    """
    # 0. drop top-level text nodes that are only newlines/tabs
    to_remove = []
    for child in content_tag.contents:
        if isinstance(child, NavigableString):
            s = re.sub(r'([\n\t])', '', child.string)
            if s == '':
                to_remove.append(child)
    # bug fix: the collected nodes were gathered but never removed
    for child in to_remove:
        child.extract()
    # 1. heading removal
    if remove_title_from_chapter:
        clean_headings_content(content_tag, title_str)
    # 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
    process_lists(content_tag)
    preprocess_table(content_tag)
    preprocess_code_tags(content_tag)
    preprocess_pre_tags(content_tag)
    preprocess_block_tags(content_tag)
    # 3. class removal (link anchors and footnote markers keep their class)
    # NOTE(review): bs4 often stores class as a list, which never equals these
    # strings — confirm footnote-element classes survive on real input
    for tag in content_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                'footnote-element']):
            del tag.attrs['class']
    return str(content_tag)