This repository was archived on 2026-04-06. You can view files and clone it, but you cannot open issues or pull requests, or push commits.
Files
BookConverter/src/html_epub_preprocessor.py
2021-07-02 12:31:33 +03:00

407 lines
14 KiB
Python

import os
import pathlib
import re
from typing import List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag
from access import Access
from livecarta_config import LawCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
    """Save raw image bytes into the project's json/img_<book_id>/ folder.

    Args:
        img_file_path: original image path; only its basename is kept.
        img_content: raw image bytes to write.
        book_id: identifier used to name the per-book image folder.

    Returns:
        pathlib.Path of the newly written image file.
    """
    # Project root = two levels above this source file.
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(folder_path) / f'json/img_{book_id}'
    # parents=True: robustness fix -- the original crashed with FileNotFoundError
    # when the intermediate json/ folder did not exist yet.
    new_path.mkdir(parents=True, exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # write_bytes replaces the open/write/close triple and cannot leak the handle.
    new_img_path.write_bytes(img_content)
    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload image bytes through the Access client and return the resulting link."""
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
def update_src_links_in_images(body_tag: Tag,
                               href2img_content: dict,
                               path_to_html,
                               access=None,
                               path2aws_path=None):
    """Rewrite every <img src> in *body_tag* to point at a saved copy of the image.

    Args:
        body_tag: parsed HTML body; its <img> tags are modified in place.
        href2img_content: maps image path (relative to the EPUB root) -> raw bytes.
        path_to_html: path of the HTML file the images are referenced from; used
            to resolve relative src attributes.
        access: optional Access client; when given, images are uploaded to AWS,
            otherwise they are written locally via save_image_locally.
        path2aws_path: optional cache mapping image path -> already-uploaded link,
            reused across calls to avoid duplicate uploads.

    Returns:
        The (possibly updated) path2aws_path cache.

    Raises:
        AssertionError: when a referenced image is missing from href2img_content.
    """
    if path2aws_path is None:
        # Robustness fix: the AWS branch below indexed into this dict
        # unconditionally, which raised TypeError when the caller relied on
        # the default None.
        path2aws_path = {}
    img_tags = body_tag.find_all('img')
    for img in img_tags:
        path_to_img_from_html = img.attrs.get('src')
        html_folder = os.path.dirname(path_to_html)
        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))
        assert path_to_img_from_root in href2img_content, \
            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                # NOTE(review): the literal 'book_id' looks like a placeholder
                # rather than a real id -- confirm against callers.
                new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
        img.attrs['src'] = str(new_folder)
        # Strip fixed dimensions so the frontend can scale images responsively.
        if img.attrs.get('width'):
            del img.attrs['width']
        if img.attrs.get('height'):
            del img.attrs['height']
    return path2aws_path
def preprocess_figure():
    """Placeholder for future <figure> preprocessing; currently a no-op."""
def preprocess_table(body_tag: BeautifulSoup):
    """Normalise <table> markup inside *body_tag* (modified in place).

    For each <td>: copy a CSS width declaration onto the HTML width attribute
    (an existing width attribute wins) and drop 'border:0;' from the inline
    style.  For each <table>: set the border attribute to the average of the
    cells' CSS border widths, or '1' when none are declared.
    """
    for table in body_tag.find_all("table"):
        border_sizes = []
        for td in table.find_all("td"):
            style = td.get('style')
            width = ''
            if style:
                # Any of the border shorthand / per-side width declarations counts.
                border_match = (re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
                                or re.search(r"border-top-width: ?(\d+\.?\d*)(p[tx])", style)
                                or re.search(r"border-left-width: ?(\d+\.?\d*)(p[tx])", style)
                                or re.search(r"border-right-width: ?(\d+\.?\d*)(p[tx])", style)
                                or re.search(r"border-bottom-width: ?(\d+\.?\d*)(p[tx])", style))
                if border_match:
                    border_sizes.append(float(border_match.group(1)))
                # [^-] keeps this from matching border-width / min-width etc.
                width_match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    # NOTE(review): pt values are emitted as px too -- confirm intended.
                    width = width_match.group(1) + 'px'
            width = td.get('width') or width
            if width:
                td.attrs['width'] = width
            if td.attrs.get('style'):
                td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
        if border_sizes:
            border_size = sum(border_sizes) / len(border_sizes)
            # Bug fix: ':.2' means 2 *significant digits* (12.0 -> '1.2e+01');
            # ':.2f' gives the intended fixed two decimal places.
            table.attrs['border'] = f'{border_size:.2f}'
        else:
            table.attrs['border'] = '1'
def _process_lists(body_tag):
    """Flatten <li> items: unwrap the first nested <p>, hoisting its
    attributes onto the parent <li>."""
    for item in body_tag.find_all("li"):
        paragraph = item.p
        if paragraph:
            item.attrs.update(paragraph.attrs)
            paragraph.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Insert an empty <span> carrying *id_* and *class_* immediately before
    *tag*, so link anchors survive when *tag* is later removed."""
    placeholder = main_tag.new_tag("span")
    placeholder.attrs['id'] = id_ if id_ else ''
    placeholder.attrs['class'] = class_ if class_ else ''
    tag.insert_before(placeholder)
def clean_headings_content(content: Tag, title: str):
    """Remove the chapter-title heading from *content* so it is not duplicated.

    A top-level child whose normalised text equals *title* (case-insensitive)
    is removed; additionally, the first h1-h3 whose text merely *contains* the
    title is removed.  Ids of removed tags (and their descendants) are
    preserved in placeholder <span>s so internal links keep working.
    """
    def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
        # Preserve the removed tag's own id...
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs.get('id'),
                                              class_=tag_to_be_removed.attrs.get('class'))
        # ...and the ids of all of its descendants.
        for sub_tag in tag_to_be_removed.find_all():
            if sub_tag.attrs.get('id'):
                insert_span_with_attrs_before_tag(body_tag,
                                                  tag_to_be_removed,
                                                  id_=sub_tag.attrs['id'],
                                                  class_=sub_tag.attrs.get('class'))
    title = title.lower()
    # Bug fix: iterate over a snapshot of the children.  extract() and the
    # placeholder <span>s inserted by the helper both mutate content.contents,
    # which made the live-list iteration skip siblings.
    for child in list(content.contents):
        if isinstance(child, NavigableString):
            text = child
        else:
            text = child.text
        if text and re.sub(r'([\n\t\xa0])', '', text):
            # Collapse whitespace/nbsp before comparing against the title.
            text = re.sub(r'([\n\t\xa0])', ' ', text)
            text = re.sub(r' +', ' ', text).strip()
            text = text.lower()
            if title == text:
                _add_span_to_save_ids_for_links(child, content)
                child.extract()
            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
                _add_span_to_save_ids_for_links(child, content)
                child.extract()
                break
def _preprocessing_headings(body_tag):
    """Demote unsupported heading levels (h<SUPPORTED_LEVELS+1> .. h9) to <p>."""
    demote_pattern = re.compile(f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$')
    for heading in body_tag.find_all(demote_pattern):
        heading.name = 'p'
def clean_title_from_numbering(title: str):
    """Strip leading numbering from a heading title.

    Removes, in order: leading whitespace, decimal numbering such as
    "1. " / "2.3 ", and letter numbering such as "A. ".
    (Roman-numeral chapter numbering removal is intentionally disabled.)
    """
    without_leading_ws = re.sub(r'^(\s+)+', '', title)
    without_digits = re.sub(r'^(?:\.?\d+\.? ?)+', '', without_leading_ws)
    # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
    return re.sub(r'^(?:[A-Za-z]\. ?)+', '', without_digits)
def replace_with_livecarta_anchor_tag(anchor, i):
    """Swap a noteref anchor for LiveCarta's footnote marker.

    The marker is <sup class="footnote-element" data-id="i+1" id="footnote-i+1">*</sup>.
    """
    number = i + 1
    marker = BeautifulSoup(features='lxml').new_tag('sup')
    marker.attrs['class'] = 'footnote-element'
    marker.attrs['data-id'] = number
    marker.attrs['id'] = f'footnote-{number}'
    marker.string = '*'
    anchor.replace_with(marker)
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]:
    """Replace EPUB noteref anchors with LiveCarta footnote markers and collect
    the footnote bodies.

    Must run before fonts are added in the pipeline.

    Example input:
    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    Args:
        source_html_tag: document (or fragment) containing the noteref anchors.
        href2soup_html: maps file path -> parsed soup, used when an href points
            into another file of the EPUB.
        noteref_attr_name: attribute that marks norefs (default 'epub:type').

    Returns:
        List of footnote inner-HTML strings, in noteref order.  Matched
        footnote elements are removed from their soup.
    """
    footnotes = []
    noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
    # Norefs without an href cannot be resolved to a footnote body: drop them.
    bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')])
    noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
    [tag.decompose() for tag in bad_noterefs_tags]

    def parse_a_tag_href(s: str):
        # 'file.xhtml#id' -> ('file.xhtml', 'id'); '#id' -> ('', 'id').
        assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
        f, id_ = s.split('#')
        return f, id_

    def verify_footnote_tag(tags: list):
        # NOTE: this closure reads href, target_html_tag, element_id and
        # noteref_tag from the enclosing loop below -- only call it from there.
        assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
        if len(tags) == 0:
            # Fallback: accept any element with the id even without an
            # epub:type footnote marker, but warn about it.
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if len(anchored_tags):
                print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
                return anchored_tags
            else:
                assert 0, f'Error, No element with id: {href} found.'
        return tags

    def get_footnote_tags2str(t):
        # Serialize the footnote's children to an HTML string, keeping bare
        # text nodes as-is and inner tags as their decoded contents.
        unicode_string = ''
        for child in t.children:
            if type(child) is NavigableString:
                unicode_string += str(child)
            else:
                unicode_string += child.decode_contents()
        return unicode_string.strip()

    for i, noteref_tag in enumerate(noterefs_tags):
        href = noteref_tag.attrs['href']
        file, element_id = parse_a_tag_href(href)
        if not file:
            # Same-document reference.
            target_html_tag = source_html_tag
        else:
            target_html_tag = href2soup_html.get(file)
            if not target_html_tag:
                print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
                continue
        possible_footnote = 'note|footnote|endnote|rearenote'
        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
                                                               attrs={'epub:type': re.compile(possible_footnote)}))
        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
        footnote_tag = expected_footnote_tags[0]
        replace_with_livecarta_anchor_tag(noteref_tag, i)
        content = get_footnote_tags2str(footnote_tag)
        footnote_tag.decompose()
        footnotes.append(content)
    return footnotes
def unwrap_structural_tags(body_tag):
    """Flatten structural HTML containers in *body_tag* (modified in place).

    divs become <p> (when they hold no structural children) or are unwrapped;
    section/article/figcaption/main/body/html are unwrapped; figure -> centred
    <p>; aside -> blockquote; header -> span.  Ids of unwrapped tags are
    preserved via placeholder <span>s.  Finally, chapter-mark anchors are
    hoisted to the top level and stray text nodes are wrapped in <p>.

    Returns the same *body_tag*.
    """
    def _add_span_to_save_ids_for_links(tag_to_be_removed):
        # Keep the id of a tag that is about to be unwrapped, so internal
        # links targeting it still resolve.
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs['id'],
                                              class_=tag_to_be_removed.attrs.get('class'))

    structural_tags_names = [
        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
        'figure', 'footer', 'iframe', 'span', 'p'
    ]
    # should be before other tags processing, not to remove converter empty tags with id
    # for s in body_tag.find_all("span"):
    #     if (s.attrs.get('epub:type') == 'pagebreak') or s.attrs.get('id'):
    #         continue
    #     if s.contents:
    #         is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
    #         if all(is_not_struct_tag):
    #             continue
    #
    #     _add_span_to_save_ids_for_links(s)
    #     s.unwrap()

    # A div with only non-structural children becomes a paragraph; otherwise
    # it is dissolved into its parent (id preserved via placeholder span).
    for div in body_tag.find_all("div"):
        if div.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
            if all(is_not_struct_tag):
                div.name = 'p'
                continue
        _add_span_to_save_ids_for_links(div)
        div.unwrap()
    for s in body_tag.find_all("section"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("article"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    # Figures become centred paragraphs; their captions are dissolved.
    for s in body_tag.find_all("figure"):
        s.name = 'p'
        s.attrs['style'] = "text-align: center;"
    for s in body_tag.find_all("figcaption"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("aside"):
        s.name = 'blockquote'
    for s in body_tag.find_all("main"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("body"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("html"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("header"):
        s.name = 'span'
    # check marks for chapter starting are on the same 1 level
    marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
    parents_marks_are_body = [x.parent == body_tag for x in marks]
    # fix marks to be on 1 level
    if not all(parents_marks_are_body):
        for x in marks:
            while x.parent != body_tag:
                x.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases
        parents_marks_are_body = [x.parent == body_tag for x in marks]
        assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
    _preprocessing_headings(body_tag)
    # Wrap stray top-level text nodes (after whitespace normalisation) in <p>.
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
            content = re.sub(r'([\n\t\xa0])', ' ', content)
            content = content.strip()
            if content:
                tag = body_tag.new_tag('p')
                tag.append(str(node))
                node.replace_with(tag)
    return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup):
    """Extract the sibling tags between the chapter mark with id *first_id*
    and the next chapter mark (exclusive).

    The collected tags are removed from *html_soup* via extract().

    Raises:
        AssertionError: when no mark with id *first_id* exists in the soup.
    """
    marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if marked_tags:
        next_tag = marked_tags.next_sibling
        tags = []
        while next_tag:
            # TODO: why do we have NavigableStrings between siblings here?
            # NOTE(review): bs4 normally stores 'class' as a *list*, so this
            # string equality may never match -- confirm how the converter
            # marks are emitted.
            if not isinstance(next_tag, NavigableString) and\
                    (next_tag.attrs.get('class') == 'converter-chapter-mark'):
                break
            tags.append(next_tag)
            next_tag = next_tag.next_sibling
        # Detach the collected siblings and merge adjacent text nodes.
        tags = [tag.extract() for tag in tags]
        html_soup.smooth()
    else:
        assert 0, f'Warning: no match for {first_id, href}'
    return tags
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    """Normalise a chapter title and clean the chapter HTML for import.

    Args:
        title: raw (possibly HTML) chapter title.
        chapter_tag: parsed chapter content; cleaned in place.
        remove_title_from_chapter: when truthy, strip the duplicated title
            heading out of the chapter body.

    Returns:
        Tuple of (cleaned title string, chapter HTML string).
    """
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    # 0. drop whitespace-only text nodes between top-level tags
    to_remove = []
    for child in chapter_tag.contents:
        if isinstance(child, NavigableString):
            s = re.sub(r'([\n\t])', '', child.string)
            if s == '':
                to_remove.append(child)
    [x.extract() for x in to_remove]
    # 1. heading removal
    if remove_title_from_chapter:
        clean_headings_content(chapter_tag, title_str)
    _process_lists(chapter_tag)
    preprocess_table(chapter_tag)
    # 2. class removal (keep 'link-anchor', which internal links rely on)
    for tag in chapter_tag.find_all(recursive=True):
        classes = tag.attrs.get('class') if hasattr(tag, 'attrs') else None
        # Bug fix: bs4 stores class as a list, so the old check
        # `classes not in ['link-anchor']` was always True and stripped the
        # link-anchor class too.  Membership works for both list and str.
        if classes and 'link-anchor' not in classes:
            del tag.attrs['class']
    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)
    return title_str, str(chapter_tag)