This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/html_epub_preprocessor.py
2021-09-07 16:08:59 +03:00

581 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import pathlib
import re
from typing import List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from access import Access
from livecarta_config import LawCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / os.path.basename(img_file_path)
f = open(new_img_path, 'wb+')
f.write(img_content)
f.close()
return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
link = access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
return link
def update_src_links_in_images(body_tag: Tag,
href2img_content: dict,
path_to_html,
access=None,
path2aws_path=None):
img_tags = body_tag.find_all('img')
for img in img_tags:
path_to_img_from_html = img.attrs.get('src')
html_folder = os.path.dirname(path_to_html)
path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))
assert path_to_img_from_root in href2img_content, \
f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
img_content = href2img_content[path_to_img_from_root]
if access is not None:
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]
else:
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
path2aws_path[path_to_img_from_root] = new_folder
else:
new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder)
if img.attrs.get('width'):
del img.attrs['width']
if img.attrs.get('height'):
del img.attrs['height']
if img.attrs.get('style'):
del img.attrs['style']
return path2aws_path
def preprocess_table(body_tag: BeautifulSoup):
tables = body_tag.find_all("table")
for table in tables:
tds = table.find_all(re.compile("td|th|tr"))
for td in tds:
style = td.get('style')
width = ''
if style:
width_match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
if width_match:
size = width_match.group(1)
units = width_match.group(2)
width = size+'px'
td.attrs['width'] = td.get('width') or width
if td.attrs.get('style'):
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
if td.attrs.get('style') == '':
del td.attrs['style']
if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
table.attrs['border'] = '1'
def process_lists(body_tag):
"""
Function to process tags <li>.
Unwrap <p> tags.
"""
li_tags = body_tag.find_all("li")
for il_tag in li_tags:
if il_tag.p:
il_tag.attrs.update(il_tag.p.attrs)
il_tag.p.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or ''
new_tag.attrs['class'] = class_ or ''
new_tag.string = "\xa0"
tag.insert_before(new_tag)
def clean_headings_content(content: Tag, title: str):
def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
id_=tag_to_be_removed.attrs.get('id'),
class_=tag_to_be_removed.attrs.get('class'))
for sub_tag in tag_to_be_removed.find_all():
if sub_tag.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
id_=sub_tag.attrs['id'],
class_=sub_tag.attrs.get('class'))
title = title.lower()
for child in content.contents:
if isinstance(child, NavigableString):
text = child
else:
text = child.text
if text and re.sub(r'([\n\t\xa0])', '', text):
text = re.sub(r'([\n\t\xa0])', ' ', text)
text = re.sub(r' +', ' ', text).strip()
text = text.lower()
if title == text:
_add_span_to_save_ids_for_links(child, content)
child.extract()
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
_add_span_to_save_ids_for_links(child, content)
child.extract()
break
def _heading_tag2p_tag(body_tag):
"""
Function to convert all lower level headings to p tags
"""
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'
def clean_title_from_numbering(title: str):
"""
Function to remove digits from headers.
"""
title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
return title
def replace_with_livecarta_anchor_tag(anchor, i):
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
new_tag['id'] = f'footnote-{i + 1}'
new_tag.string = '*'
if anchor.parent.name == 'sup':
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]:
"""
This function should be earlier that adding fonts in pipeline.
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')])
noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
new_noterefs_tags = []
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str):
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
f, id_ = s.split('#')
return f, id_
def verify_footnote_tag(tags: list):
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
return anchored_tags
else:
assert 0, f'Error, No element with id: {href} found.'
return tags
for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs['href']
file, element_id = parse_a_tag_href(href)
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
continue
possible_footnote = 'note|footnote|endnote|rearenote'
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
attrs={'epub:type': re.compile(possible_footnote)}))
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(attrs={'role': 'doc-backlink'}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag):
"""
Main function that works with structure of html.
Make changes inplace.
1. Extracts tags that are not needed
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
This tag must have a body_tag as a parent.
Otherwise, it is wrapped with some tags. Like:
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
3. Headings that are not supported by livecarta converts to <p>
4. Wrapping NavigableString
:param body_tag: Tag, soup object
:return: None
"""
def _preserve_class_in_aside_tag(tag_):
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
if tag_.parent.name == 'aside':
if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class
def _preserve_class_in_section_tag(tag_) -> bool:
# to save css style inherited from class, copy class to child <p>
# this is for Wiley books with boxes
# returns True, if <section> could be unwrapped
tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
if 'feature' not in tag_class:
return True
child_p_tags = tag_.find_all("p")
if len(child_p_tags) == 1:
child_p_tag = child_p_tags[0]
if not child_p_tag.attrs.get('class'):
child_p_tag.attrs['class'] = tag_class
return True
elif len(child_p_tags) > 1:
tag_.name = 'p'
return False
else:
return True
def _add_table_to_abc_books(tag_, border, bg_color):
wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
def _add_span_to_save_ids_for_links(tag_to_be_removed):
if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
id_=tag_to_be_removed.attrs['id'],
class_=tag_to_be_removed.attrs.get('class'))
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p'
]
# comments removal
for tag in body_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
if div_class in ['C409', 'C409a']:
_add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9')
elif div_class in ['C441', 'C816']:
_add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8')
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
_add_span_to_save_ids_for_links(div)
div.unwrap()
for s in body_tag.find_all("section"):
could_be_unwrapped = True
if s.attrs.get('class'):
could_be_unwrapped = _preserve_class_in_section_tag(s)
_add_span_to_save_ids_for_links(s)
if could_be_unwrapped:
s.unwrap()
for s in body_tag.find_all("article"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("figure"):
s.name = 'p'
s.attrs['style'] = "text-align: center;" # to center image inside this tag
for s in body_tag.find_all("figcaption"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("body"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("html"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# check marks for chapter starting are on the same 1 level
marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
parents_marks_are_body = [x.parent == body_tag for x in marks]
# fix marks to be on 1 level
if not all(parents_marks_are_body):
for x in marks:
while x.parent != body_tag:
x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
_heading_tag2p_tag(body_tag)
# wrap NavigableString with <p>
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r'([\n\t\xa0])', ' ', content)
content = content.strip()
if content:
tag = body_tag.new_tag('p')
tag.append(str(node))
node.replace_with(tag)
return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup):
"""
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
:param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
:param href: name of current chapter's file
:param html_soup: soup object of current file
:return: list [Tag, NavigableString]; chapter's tags
"""
marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
if not isinstance(next_tag, NavigableString) and\
(next_tag.attrs.get('class') == 'converter-chapter-mark'):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
# remove tags between first_id and next found id
# save them in list for next steps
tags = [tag.extract() for tag in tags]
html_soup.smooth()
else:
assert 0, f'Warning: no match for {first_id, href}'
return tags
def wrap_preformatted_span_with_table(main_tag, old_tag):
table = main_tag.new_tag("table")
table.attrs['border'] = '1px #ccc;'
table.attrs['style'] = 'width:100%;'
tbody = main_tag.new_tag("tbody")
tr = main_tag.new_tag("tr")
td = main_tag.new_tag("td")
td.attrs['bgcolor'] = '#f5f5f5'
# td.attrs['border-radius'] = '4px'
old_tag.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
return table
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
table = main_tag.new_tag("table")
table.attrs['border'] = border
table.attrs['align'] = 'center'
table.attrs['style'] = f'width:{width}%;'
tbody = main_tag.new_tag("tbody")
tr = main_tag.new_tag("tr")
td = main_tag.new_tag("td")
# td.attrs['border-radius'] = '8px'
if bg_color:
td.attrs['bgcolor'] = bg_color
old_tag.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
return table
def _clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs:
hr.extract()
h = block.find(re.compile("h[1-9]"))
if h:
h.name = "p"
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def preprocess_block_tags(chapter_tag):
for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
_clean_wiley_block(block)
color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
block.unwrap()
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
_clean_wiley_block(future_block)
color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
def _prepare_formatted(text):
# replace <,> to save them as is in html code
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
text = text.replace(' ', "\xa0")
text = text.replace('𝑓', "\xf0\x9d\x91\x93")
return text
def preprocess_pre_tags(chapter_tag):
for pre in chapter_tag.find_all("pre"):
new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy()
spans = pre.find_all("span")
to_add_br = len(spans) > 1 # if in <pre> there are multiple <span>, we need to add <br> after each content
for child in pre.children:
if isinstance(child, NavigableString):
cleaned_text = _prepare_formatted(str(child))
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
for string in sub_strings:
new_tag.append(NavigableString(string))
new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
else:
for sub_child in child.children:
if isinstance(sub_child, NavigableString):
cleaned_text2 = _prepare_formatted(str(sub_child))
sub_child.replace_with(NavigableString(cleaned_text2))
else:
sub_child.string = _prepare_formatted(sub_child.text)
cleaned_tag = child.extract()
new_tag.append(cleaned_tag)
if to_add_br:
new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
"font-size: 14px; white-space: nowrap;"
pre.replace_with(new_tag)
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
p_for_br = chapter_tag.new_tag("p")
p_for_br.string = "\xa0"
table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag):
# function that emulates style of <code>, <kdb>, <var>
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
code.name = 'span'
if code.parent.name == "pre":
continue
code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
"""
Final processing/cleaning function.
:param title: title of the chapter
:param chapter_tag: soup object
:param remove_title_from_chapter: bool
:return: tuple[str, str]
"""
title_str = BeautifulSoup(title, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip()
# 0. cleaning \n
to_remove = []
for child in chapter_tag.contents:
if isinstance(child, NavigableString):
s = re.sub(r'([\n\t])', '', child.string)
if s == '':
to_remove.append(child)
[x.extract() for x in to_remove]
# 1. heading removal
if remove_title_from_chapter:
clean_headings_content(chapter_tag, title_str)
process_lists(chapter_tag)
preprocess_table(chapter_tag)
preprocess_code_tags(chapter_tag)
preprocess_pre_tags(chapter_tag)
preprocess_block_tags(chapter_tag)
# 2. class removal
for tag in chapter_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']):
del tag.attrs['class']
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = clean_title_from_numbering(title_str)
return title_str, str(chapter_tag)