This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/html_epub_preprocessor.py

507 lines
18 KiB
Python

import os
import pathlib
import re
from html import escape
from typing import List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag
from access import Access
from livecarta_config import LawCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
    """
    Function to store image bytes in the local ``json/img_<book_id>/`` folder.

    The folder is resolved relative to the parent of the directory that
    contains this module.

    :param img_file_path: path of the image inside the book; only its base name is used.
    :param img_content: raw bytes of the image.
    :param book_id: identifier used to build the target folder name.
    :return: pathlib.Path of the written file.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
    # parents=True: the intermediate ``json`` folder may not exist yet.
    new_path.mkdir(parents=True, exist_ok=True)

    new_img_path = new_path / os.path.basename(img_file_path)
    # The context manager closes the handle even when the write fails.
    with open(new_img_path, 'wb') as f:
        f.write(img_content)

    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """
    Function to upload image bytes through ``access.send_image_by_bytes``.

    :return: whatever ``send_image_by_bytes`` returns for the uploaded image.
    """
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
def update_src_links_in_images(body_tag: Tag,
                               href2img_content: dict,
                               path_to_html,
                               access=None,
                               path2aws_path=None,
                               book_id='book_id'):
    """
    Function to re-point every <img> ``src`` to a stored copy of the image.

    Each image is uploaded via ``access`` when it is given, otherwise it is
    saved locally. The ``width`` and ``height`` attributes are dropped.

    :param body_tag: tag whose <img> descendants are processed.
    :param href2img_content: mapping from normalised image path (relative to
        the book root) to image bytes.
    :param path_to_html: path of the html file, used to resolve relative ``src``.
    :param access: uploader object; ``None`` means "save locally".
    :param path2aws_path: cache of already uploaded images (path -> link);
        a new dict is created when omitted.
    :param book_id: identifier forwarded to the save functions; the default
        keeps the value that used to be hard-coded.
    :return: the (possibly updated) ``path2aws_path`` cache.
    :raises AssertionError: if an image is not present in ``href2img_content``.
    """
    if path2aws_path is None:
        # The default used to crash on the ``in`` lookup below.
        path2aws_path = {}

    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        if not path_to_img_from_html:
            # Nothing to resolve; os.path.join would fail on a missing src.
            continue

        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))

        assert path_to_img_from_root in href2img_content, \
            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'

        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, book_id)
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            new_folder = save_image_locally(path_to_img_from_root, img_content, book_id)

        img.attrs['src'] = str(new_folder)
        if img.attrs.get('width'):
            del img.attrs['width']
        if img.attrs.get('height'):
            del img.attrs['height']

    return path2aws_path
def preprocess_figure():
    """
    Placeholder: figure preprocessing is not implemented, nothing is done.
    """
    return None
def preprocess_table(body_tag: BeautifulSoup):
    """
    Function to move table sizing from inline CSS to html attributes.

    For every <td>/<th>/<tr> of every table:
      - a border width found in ``style`` is collected,
      - a ``width`` found in ``style`` is copied to the ``width`` attribute
        (an existing ``width`` attribute wins; the number is always written
        with a ``px`` suffix, whatever unit the style used),
      - ``border:0;`` is removed from ``style`` and an empty ``style`` is dropped.
    The table ``border`` attribute is set to the average collected border
    width, or to ``'1'`` when no border width was found.
    """
    # Tried in this order; the first pattern that matches wins.
    border_patterns = [
        re.compile(r"border: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-top-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-left-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-right-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-bottom-width: ?(\d+\.?\d*)(p[tx])"),
    ]
    # Look-behind instead of ``[^-]`` so that a style starting with "width:"
    # matches too, while "min-width"/"border-top-width" are still rejected.
    width_pattern = re.compile(r"(?<![-\w])width: ?(\d+\.?\d*)(p[tx])")

    for table in body_tag.find_all("table"):
        # An explicit list matches exact tag names; a regex is applied with
        # search() and would also pick up e.g. <thead> or <strong>.
        cells = table.find_all(['td', 'th', 'tr'])

        border_sizes = []
        for td in cells:
            style = td.get('style')
            width = ''

            if style:
                border_match = next(
                    (found for found in (pattern.search(style) for pattern in border_patterns) if found),
                    None)
                if border_match:
                    border_sizes.append(float(border_match.group(1)))

                width_match = width_pattern.search(style)
                if width_match:
                    width = width_match.group(1) + 'px'

            width = td.get('width') or width
            if width:
                td.attrs['width'] = width

            if td.attrs.get('style'):
                td.attrs['style'] = td.attrs['style'].replace('border:0;', '')

            if td.attrs.get('style') == '':
                del td.attrs['style']

        if border_sizes:
            border_size = sum(border_sizes) / len(border_sizes)
            # round() instead of the ':.2' format spec: the latter keeps two
            # significant digits and switches to exponent notation for
            # values >= 10 (12.5 -> '1.2e+01').
            table.attrs['border'] = str(round(border_size, 2))
        else:
            table.attrs['border'] = '1'
def _process_lists(body_tag):
    """
    Function to process tags <li>.
    The first <p> found inside an <li> is unwrapped and its attributes
    are merged into the <li>.
    """
    for item in body_tag.find_all("li"):
        paragraph = item.p
        if not paragraph:
            continue
        item.attrs.update(paragraph.attrs)
        paragraph.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """
    Function to put a <span> holding a non-breaking space right before `tag`.

    The span is created with ``main_tag.new_tag`` and receives `id_` and
    `class_` (an empty string replaces a falsy value).
    """
    placeholder = main_tag.new_tag("span")
    placeholder.attrs.update({'id': id_ or '', 'class': class_ or ''})
    placeholder.string = "\xa0"
    tag.insert_before(placeholder)
def clean_headings_content(content: Tag, title: str):
    """
    Function to remove the chapter title from the beginning of the content.

    Only the first child with non-blank text is examined. It is removed when
    its normalised, lower-cased text equals the title, or when it is an
    h1-h3 heading whose text contains the title. Ids of the removed element
    and of its descendants are kept on placeholder spans inserted before it.
    """
    def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
        # Plain text nodes have neither attributes nor descendants; reading
        # ``.attrs`` on them would raise AttributeError.
        if not isinstance(tag_to_be_removed, Tag):
            return

        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs.get('id'),
                                              class_=tag_to_be_removed.attrs.get('class'))

        for sub_tag in tag_to_be_removed.find_all():
            if sub_tag.attrs.get('id'):
                insert_span_with_attrs_before_tag(body_tag,
                                                  tag_to_be_removed,
                                                  id_=sub_tag.attrs['id'],
                                                  class_=sub_tag.attrs.get('class'))

    title = title.lower()
    for child in content.contents:
        if isinstance(child, NavigableString):
            text = child
        else:
            text = child.text

        # Skip children that hold only whitespace / non-breaking spaces.
        if text and re.sub(r'([\n\t\xa0])', '', text):
            text = re.sub(r'([\n\t\xa0])', ' ', text)
            text = re.sub(r' +', ' ', text).strip()
            text = text.lower()

            is_top_heading = getattr(child, 'name', None) in ['h1', 'h2', 'h3']
            if title == text or ((title in text) and is_top_heading):
                _add_span_to_save_ids_for_links(child, content)
                child.extract()
            # The first meaningful child decides; later children are not checked.
            break
def _preprocessing_headings(body_tag):
    """
    Function to rename headings deeper than LawCartaConfig.SUPPORTED_LEVELS to <p>.
    """
    pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
    too_deep_heading = re.compile(pattern)
    for heading in body_tag.find_all(too_deep_heading):
        heading.name = 'p'
def clean_title_from_numbering(title: str):
    """
    Function to strip leading numbering from a title.

    Removed from the start, in this order: whitespace, digit groups such as
    "1.2. ", single-letter markers such as "A. ". Roman numerals are kept.
    """
    leading_patterns = (
        r'^(\s+)+',               # leading whitespace
        r'^(?:\.?\d+\.? ?)+',     # "1", "1.", "1.2. ", ".3 " ...
        r'^(?:[A-Za-z]\. ?)+',    # "A. ", "b."
    )
    for leading in leading_patterns:
        title = re.sub(leading, '', title)
    return title
def replace_with_livecarta_anchor_tag(anchor, i):
    """
    Function to swap `anchor` for a <sup class="footnote-element">*</sup> marker.

    The marker is numbered ``i + 1`` in both ``data-id`` and ``id``.
    """
    number = i + 1
    marker = BeautifulSoup(features='lxml').new_tag('sup')
    marker.attrs.update({
        'class': 'footnote-element',
        'data-id': number,
        'id': f'footnote-{i + 1}',
    })
    marker.string = '*'
    anchor.replace_with(marker)
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]:
    """
    Function to replace footnote references with markers and collect footnote texts.

    This function should be earlier that adding fonts in pipeline.

    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    :param source_html_tag: html whose noterefs are processed (modified in place).
    :param href2soup_html: mapping from file path to parsed html, used when a
        noteref points into another file; treated as empty when omitted.
    :param noteref_attr_name: attribute whose value ``noteref`` marks a reference.
    :return: footnote contents; the marker numbered k refers to element k-1.
    :raises AssertionError: for an href without ``#``, for several footnote
        candidates with one id, or when no element with the id exists.
    """
    if href2soup_html is None:
        href2soup_html = {}

    footnotes = []
    all_noterefs = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
    noterefs_tags = [tag for tag in all_noterefs if tag.attrs.get('href')]
    # References without href cannot be resolved and are removed. A list is
    # used on purpose: a set would merge tags that compare equal and leave
    # some of them in the document.
    bad_noterefs_tags = [tag for tag in all_noterefs if not tag.attrs.get('href')]
    for tag in bad_noterefs_tags:
        tag.decompose()

    def parse_a_tag_href(s: str):
        """Split an href into (file, element id) at the first '#'."""
        if '#' not in s:
            raise AssertionError(f'Error. Unexpected href: {s} in a tag. Href must contain an id.')
        f, id_ = s.split('#', 1)
        return f, id_

    def verify_footnote_tag(tags: list, href, element_id, target_html_tag, noteref_tag):
        """Return exactly the candidate(s) to use, falling back to any tag with the id."""
        if len(tags) > 1:
            raise AssertionError(f'Error, Multiple id: {href}.\n{tags}')
        if len(tags) == 0:
            # No tag typed as a footnote: accept any element carrying the id.
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if len(anchored_tags):
                print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
                return anchored_tags
            raise AssertionError(f'Error, No element with id: {href} found.')

        return tags

    def get_footnote_tags2str(t):
        """Concatenate direct text children and the inner html of direct tag children."""
        parts = []
        for child in t.children:
            if type(child) is NavigableString:
                parts.append(str(child))
            else:
                parts.append(child.decode_contents())
        return ''.join(parts).strip()

    possible_footnote = 'note|footnote|endnote|rearenote'
    for noteref_tag in noterefs_tags:
        href = noteref_tag.attrs['href']
        file, element_id = parse_a_tag_href(href)
        if not file:
            target_html_tag = source_html_tag
        else:
            target_html_tag = href2soup_html.get(file)
            if not target_html_tag:
                print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
                continue

        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
                                                               attrs={'epub:type': re.compile(possible_footnote)}))
        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags, href, element_id,
                                                     target_html_tag, noteref_tag)
        footnote_tag = expected_footnote_tags[0]
        # Number by the amount of collected footnotes, not by loop position:
        # a skipped noteref must not shift markers away from their texts.
        replace_with_livecarta_anchor_tag(noteref_tag, len(footnotes))
        content = get_footnote_tags2str(footnote_tag)
        footnote_tag.decompose()
        footnotes.append(content)

    return footnotes
def unwrap_structural_tags(body_tag):
    """
    Function to flatten structural markup so that chapter marks become direct
    children of `body_tag`.

    Passes, in order: div, section, article, figure, figcaption, aside, main,
    body, html, header. Ids of unwrapped tags are kept on placeholder spans.
    Afterwards parents of chapter marks are unwrapped until the marks sit
    directly under `body_tag`, too-deep headings are converted, and top-level
    non-blank text is wrapped into <p>.

    :return: the same `body_tag`, modified in place.
    :raises AssertionError: if a chapter mark is still nested after the fix.
    """
    def _add_span_to_save_ids_for_links(tag_to_be_removed):
        removed_id = tag_to_be_removed.attrs.get('id')
        if removed_id:
            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs['id'],
                                              class_=tag_to_be_removed.attrs.get('class'))

    def _unwrap_all(tag_name):
        # Keep the id (if any) on a span, then dissolve the wrapper.
        for found in body_tag.find_all(tag_name):
            _add_span_to_save_ids_for_links(found)
            found.unwrap()

    structural_tags_names = [
        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
        'figure', 'footer', 'iframe', 'span', 'p'
    ]

    # Spans are left in place: there is no span-unwrapping pass.

    for div in body_tag.find_all("div"):
        children = div.contents
        # A div holding no structural children is really a paragraph.
        if children and all(child.name not in structural_tags_names for child in children):
            div.name = 'p'
            continue
        _add_span_to_save_ids_for_links(div)
        div.unwrap()

    for section in body_tag.find_all("section"):
        section_class = section.attrs.get('class')
        if section_class:
            class_ = section_class[0] if isinstance(section_class, list) else section_class
            parent = section.parent
            # Hand the section class over to a class-less <aside> parent.
            if parent.name == 'aside' and not parent.attrs.get('class'):
                parent.attrs['class'] = class_
        _add_span_to_save_ids_for_links(section)
        section.unwrap()

    _unwrap_all("article")

    for figure in body_tag.find_all("figure"):
        figure.name = 'p'
        figure.attrs['style'] = "text-align: center;"

    _unwrap_all("figcaption")

    for aside in body_tag.find_all("aside"):
        aside.name = 'blockquote'

    for wrapper_name in ("main", "body", "html"):
        _unwrap_all(wrapper_name)

    for header in body_tag.find_all("header"):
        header.name = 'span'

    # check marks for chapter starting are on the same 1 level
    marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})

    # fix marks to be on 1 level
    if not all(mark.parent == body_tag for mark in marks):
        for mark in marks:
            while mark.parent != body_tag:
                mark.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases

        still_nested = [mark for mark in marks if not (mark.parent == body_tag)]
        assert not still_nested, 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'

    _preprocessing_headings(body_tag)

    # Bare top-level text becomes a paragraph.
    for node in body_tag:
        if not isinstance(node, NavigableString):
            continue
        visible = re.sub(r'([\n\t\xa0])', ' ', str(node)).strip()
        if visible:
            paragraph = body_tag.new_tag('p')
            paragraph.append(str(node))
            node.replace_with(paragraph)

    return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup):
    """
    Function to cut out the nodes between a chapter mark and the next one.

    The mark is the element with id `first_id` and class
    ``converter-chapter-mark``. All following siblings up to (not including)
    the next chapter mark are extracted from `html_soup` and returned.

    :param first_id: id of the chapter mark that opens the chapter.
    :param href: used only in the error message.
    :param html_soup: parsed html; modified in place.
    :return: list of extracted nodes (tags and text nodes).
    :raises AssertionError: if no chapter mark with `first_id` exists.
    """
    mark_class = 'converter-chapter-mark'

    def _is_chapter_mark(node):
        if isinstance(node, NavigableString):
            return False
        node_class = node.attrs.get('class')
        # ``class`` is a list for parsed html and a plain string when it was
        # assigned in code; the ``find`` below accepts both, so do the same.
        if isinstance(node_class, (list, tuple)):
            return mark_class in node_class
        return node_class == mark_class

    marked_tags = html_soup.find(attrs={'id': first_id, 'class': mark_class})
    if not marked_tags:
        # Explicit raise: an ``assert`` is stripped under ``python -O``.
        raise AssertionError(f'Warning: no match for {first_id, href}')

    tags = []
    next_tag = marked_tags.next_sibling
    # ``is not None``: an empty text node is falsy and must not end the walk.
    while next_tag is not None:
        if _is_chapter_mark(next_tag):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    tags = [tag.extract() for tag in tags]
    html_soup.smooth()

    return tags
def wrap_span_with_table(main_tag, old_tag):
    """
    Function to wrap `old_tag` into a full-width one-cell table with a grey cell.

    :param main_tag: object providing ``new_tag``.
    :param old_tag: tag that ends up inside the <td>.
    :return: the created <table> tag.
    """
    table, tbody, tr, td = (main_tag.new_tag(name) for name in ("table", "tbody", "tr", "td"))
    table.attrs.update({'border': '1px solid #ccc;', 'style': 'width:100%;'})
    td.attrs.update({'bgcolor': '#f5f5f5', 'border-radius': '4px'})

    # Wrap from the inside out: td > tr > tbody > table.
    innermost = old_tag
    for wrapper in (td, tr, tbody, table):
        innermost.wrap(wrapper)
        innermost = wrapper
    return table
def wrap_block_with_table(main_tag, old_tag, color=None):
    """
    Function to wrap `old_tag` into a centered one-cell table followed by <br>.

    :param main_tag: object providing ``new_tag``.
    :param old_tag: tag that ends up inside the <td>.
    :param color: optional ``bgcolor`` of the cell.
    :return: the created <table> tag.
    """
    table, tbody, tr, td = (main_tag.new_tag(name) for name in ("table", "tbody", "tr", "td"))
    table.attrs.update({'border': '1px solid', 'align': 'center', 'style': 'width:95%;'})
    td.attrs['border-radius'] = '8px'
    if color:
        td.attrs['bgcolor'] = color

    # Wrap from the inside out: td > tr > tbody > table.
    innermost = old_tag
    for wrapper in (td, tr, tbody, table):
        innermost.wrap(wrapper)
        innermost = wrapper

    line_break = BeautifulSoup(features='lxml').new_tag("br")
    table.insert_after(line_break)
    return table
def preprocess_block_tags(chapter_tag):
    """
    Function to turn <blockquote> tags of class feature1..feature4 into tables.

    Inside such a block: <p> tags whose class matches ".+ hr" are removed and
    the first heading becomes a <p> preceded by <br>. The block is wrapped in
    a table (grey shades for feature1/feature2), followed by <br>, and the
    <blockquote> itself is unwrapped.
    """
    feature_classes = ['feature1', 'feature2', 'feature3', 'feature4']

    for block in chapter_tag.find_all("blockquote"):
        block_class = block.attrs.get('class')
        if block_class not in feature_classes:
            continue

        for hr in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
            hr.extract()

        heading = block.find(re.compile("h[1-9]"))
        if heading:
            heading.name = "p"
            heading.insert_before(BeautifulSoup(features='lxml').new_tag("br"))

        if block_class == 'feature2':
            color = '#EEEEEE'
        elif block_class == 'feature1':
            color = '#DDDDDD'
        else:
            color = None

        wrap_block_with_table(chapter_tag, block, color)
        block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
        block.unwrap()
def preprocess_pre_tags(chapter_tag):
    """
    Function to replace every <pre> with a monospace <span> inside a table.

    Text children are converted line by line (tabs and spaces are replaced
    as below, line breaks become <br>); tag children are moved over
    unchanged, in their original position. The span gets the attributes of
    the <pre> with ``style`` overridden, is wrapped by
    ``wrap_span_with_table`` and followed by a <p> holding a non-breaking space.
    """
    for pre in chapter_tag.find_all("pre"):
        new_tag = BeautifulSoup(features='lxml').new_tag("span")
        new_tag.attrs = pre.attrs.copy()

        ends_with_text = False
        # Iterate over a snapshot: moving a child out of <pre> while walking
        # the live child list makes the iterator skip the next child.
        for child in list(pre.children):
            if isinstance(child, NavigableString):
                # Use this child's own text. ``pre.text`` is the text of the
                # whole <pre> and duplicated content for mixed children.
                text = str(child)
                text = text.replace('\t', "\xa0 \xa0 \xa0 ")
                text = text.replace(' ', "\xa0 ")
                elements = re.split('\r\n|\n|\r', text)
                for position, line in enumerate(elements):
                    if position:
                        # <br> only where the source had a line break, so
                        # inline tags stay on the same line as their text.
                        new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
                    new_tag.append(NavigableString(line))
                ends_with_text = True
            else:
                new_tag.append(child.extract())
                ends_with_text = False

        if ends_with_text:
            # Keep the closing <br> that text-only <pre> blocks always had.
            new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))

        new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
                                 "font-size: 14px; white-space: pre-wrap;"
        pre.insert_before(new_tag)
        pre.extract()

        table = wrap_span_with_table(chapter_tag, new_tag)
        p_for_br = chapter_tag.new_tag("p")
        p_for_br.string = "\xa0"
        table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag):
    """
    Function to turn inline code-like tags (<code>, <kbd>, <var>) into styled <span>.
    """
    # "kbd" is the html keyboard-input element; the previous "kdb" spelling
    # never matched it. A list matches exact tag names, whereas a regex is
    # applied with search() and matches any tag name containing the text.
    for code in chapter_tag.find_all(['code', 'kbd', 'var']):
        code.name = 'span'
        code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    """
    Function to produce the final title string and chapter html.

    Steps: normalise whitespace in the title; drop top-level text nodes that
    hold only newlines/tabs; optionally remove the title from the chapter;
    run the list, table, code, pre and block preprocessors; delete ``class``
    attributes (a class equal to ``link-anchor`` is kept); strip numbering
    from the title.

    :param title: chapter title, may contain html markup.
    :param chapter_tag: parsed chapter content, modified in place.
    :param remove_title_from_chapter: whether to cut the title out of the content.
    :return: (cleaned title, chapter html as string).
    """
    title_soup = BeautifulSoup(title, features='lxml')
    title_str = title_soup.string
    if title_str is None:
        # ``.string`` is None when the title has several child nodes (e.g.
        # inline markup) or is empty; fall back to the concatenated text.
        title_str = title_soup.get_text()
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()

    # 0. cleaning \n
    to_remove = []
    for child in chapter_tag.contents:
        if isinstance(child, NavigableString):
            s = re.sub(r'([\n\t])', '', child.string)
            if s == '':
                to_remove.append(child)
    # Extract after the scan so the child list is not changed while iterating.
    for blank_node in to_remove:
        blank_node.extract()

    # 1. heading removal
    if remove_title_from_chapter:
        clean_headings_content(chapter_tag, title_str)

    _process_lists(chapter_tag)
    preprocess_table(chapter_tag)
    preprocess_code_tags(chapter_tag)
    preprocess_pre_tags(chapter_tag)
    preprocess_block_tags(chapter_tag)

    # 2. class removal
    for tag in chapter_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']):
            del tag.attrs['class']

    title_str = clean_title_from_numbering(title_str)
    return title_str, str(chapter_tag)