# forked from LiveCarta/BookConverter
import os
import pathlib
import re
from typing import List

from bs4 import BeautifulSoup, NavigableString, Tag

from access import Access
from livecarta_config import LawCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
    """Save raw image bytes under <project_root>/json/img_<book_id>/.

    :param img_file_path: original path of the image; only its basename is kept.
    :param img_content: raw image bytes to write.
    :param book_id: id used to name the per-book image folder.
    :return: pathlib.Path of the written file.
    """
    # Project root: two directory levels up from this module.
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    new_path = pathlib.Path(folder_path) / f'json/img_{book_id}'
    # parents=True also creates the intermediate 'json' folder on a fresh checkout.
    new_path.mkdir(parents=True, exist_ok=True)

    new_img_path = new_path / os.path.basename(img_file_path)
    # write_bytes replaces the unmanaged open/write/close triple and always
    # closes the file, even if the write raises.
    new_img_path.write_bytes(img_content)

    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload the image bytes through the Access client and return the link it reports."""
    return access.send_image_by_bytes(img_file_path, img_content, book_id)
def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
    """Rewrite every <img src> in body_tag to point at a stored copy of the image.

    :param body_tag: parsed HTML fragment; its <img> tags are rewritten in place.
    :param href2img_content: maps image path (relative to the book root) -> raw bytes.
    :param path_to_html: path of the HTML file the images were referenced from;
        used to resolve relative src attributes.
    :param access: optional Access client; when given images go to AWS,
        otherwise they are saved locally.
    :raises AssertionError: if an <img> src is missing from href2img_content.
    """
    img_tags = body_tag.find_all('img')
    # Loop-invariant: every image resolves relative to the same HTML file,
    # so compute the folder once instead of per image.
    html_folder = os.path.dirname(path_to_html)

    for img in img_tags:
        path_to_img_from_html = img.attrs.get('src')
        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html))

        assert path_to_img_from_root in href2img_content, \
            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'

        img_content = href2img_content[path_to_img_from_root]
        # NOTE(review): 'book_id' is a literal placeholder string here, not the
        # real book id — confirm this is intended.
        if access is not None:
            new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
        else:
            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')

        img.attrs['src'] = str(new_folder)
def preprocess_figure():
    """Placeholder for <figure> preprocessing; intentionally a no-op for now."""
    return None
def preprocess_table(body_tag: BeautifulSoup):
    """Normalize <table> markup: strip every <td> attribute except width, and
    lift an averaged border size from inline td styles onto the <table> tag.

    :param body_tag: parsed HTML fragment; modified in place.
    """
    # Precompiled once, outside the loops. Border declarations are tried in
    # priority order: shorthand first, then the per-side widths.
    border_patterns = [
        re.compile(r"border: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-top-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-left-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-right-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-bottom-width: ?(\d+\.?\d*)(p[tx])"),
    ]
    # The [^-] guard keeps this from matching 'border-...-width' or 'max-width'.
    width_pattern = re.compile(r"[^-]width: ?(\d+\.?\d*)(p[tx])")

    for table in body_tag.find_all("table"):
        border_sizes = []
        for td in table.find_all("td"):
            style = td.get('style')
            width = ''
            if style:
                border_match = None
                for pattern in border_patterns:
                    border_match = pattern.search(style)
                    if border_match:
                        break
                if border_match:
                    border_sizes.append(float(border_match.group(1)))

                width_match = width_pattern.search(style)
                if width_match:
                    # NOTE(review): sizes given in pt are also labelled 'px'
                    # here — confirm this coercion is intended.
                    width = width_match.group(1) + 'px'

            # An explicit width attribute wins over one parsed from style.
            width = td.get('width') or width

            td.attrs = {}
            if width:
                td.attrs['width'] = width

        if border_sizes:
            border_size = sum(border_sizes) / len(border_sizes)
            # Fixed-point :.2f instead of the old :.2 general format, which
            # emitted scientific notation ('1.2e+01') for averages >= 10;
            # the leftover debug print() is removed.
            table.attrs['border'] = f'{border_size:.2f}'
def _process_lists(body_tag):
    """Flatten <li> contents.

    For each <li> that starts with a <p>, merge the <p>'s attributes into the
    <li> itself and unwrap the <p> so its children sit directly in the list item.
    """
    for li_tag in body_tag.find_all("li"):
        paragraph = li_tag.p
        if paragraph:
            li_tag.attrs.update(paragraph.attrs)
            paragraph.unwrap()
def clean_headings_content(content: Tag, title: str):
    """Remove children of ``content`` that duplicate the chapter title.

    A child is removed when its normalized text equals ``title`` exactly, or
    when it is an h1-h3 heading whose text contains ``title`` (first such
    heading only).

    Fix: iterate over a snapshot of the children. ``.contents`` is a live
    list, so calling ``extract()`` while iterating it directly skips the
    sibling that follows each removed node.
    """
    for child in list(content.contents):
        # Skip children that are empty once \n, \t and nbsp are dropped.
        if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
            # Normalize whitespace the same way the title was normalized.
            text = re.sub(r'([\n\t\xa0])', ' ', child.text)
            text = re.sub(r' +', ' ', text).rstrip()
            if title == text:
                child.extract()
            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
                child.extract()
                # Only the first heading containing the title is removed.
                break
def _preprocessing_headings(body_tag):
    """Demote every heading below the supported level to a plain <p> tag."""
    first_unsupported = LawCartaConfig.SUPPORTED_LEVELS + 1
    heading_re = re.compile(f'^h[{first_unsupported}-9]$')
    for heading in body_tag.find_all(heading_re):
        heading.name = 'p'
def clean_title_from_numbering(title: str):
    """Strip leading whitespace and leading numbering from a heading title.

    Handles digit numbering ('1.', '2.3 ') and letter numbering ('A. ', 'b. ').
    """
    leading_patterns = (
        r'^\s+',                 # leading whitespace run
        r'^(?:\.?\d+\.? ?)+',    # digit numbering: '1.', '2.3 ', ...
        # r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ',  # roman-numeral chapter numbering
        r'^(?:[A-Za-z]\. ?)+',   # letter numbering: 'A. ', 'b. ', ...
    )
    for pattern in leading_patterns:
        title = re.sub(pattern, '', title)
    return title
def replace_with_livecarta_anchor_tag(anchor, i):
    """Replace a footnote anchor with LiveCarta's <sup class="footnote-element"> marker."""
    marker = BeautifulSoup(features='lxml').new_tag('sup')
    marker.attrs.update({
        'class': 'footnote-element',
        'data-id': i + 1,
        'id': f'footnote-{i + 1}',
    })
    marker.string = '*'
    anchor.replace_with(marker)
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]:
    """Extract EPUB footnotes: replace each noteref anchor with a LiveCarta
    marker, collect the footnote bodies as HTML strings, and remove the
    original footnote elements from the document.

    This function should run earlier than the font-adding step in the pipeline.

    Expected markup, e.g.:
    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    :param source_html_tag: document (or fragment) containing the noterefs.
    :param href2soup_html: maps file name -> parsed soup, for cross-file hrefs.
    :param noteref_attr_name: attribute that flags an anchor as a noteref.
    :return: footnote bodies as raw HTML strings, in noteref document order.
    :raises AssertionError: on malformed hrefs, duplicate ids, or missing targets.
    """
    footnotes = []
    noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
    # Noterefs without an href cannot be resolved; drop them from the document.
    bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')])
    noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
    [tag.decompose() for tag in bad_noterefs_tags]

    def parse_a_tag_href(s: str):
        # Split 'file.xhtml#id' into (file, id); file is '' for same-file refs.
        assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
        f, id_ = s.split('#')
        return f, id_

    def verify_footnote_tag(tags: list):
        # NOTE: closes over the enclosing loop's href / element_id /
        # target_html_tag / noteref_tag — only valid inside that loop.
        assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
        if len(tags) == 0:
            # No epub:type-marked footnote found; fall back to any element
            # carrying the id, with a warning.
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if len(anchored_tags):
                print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
                return anchored_tags
            else:
                assert 0, f'Error, No element with id: {href} found.'

        return tags

    def get_footnote_tags2str(t):
        # Serialize the footnote's children to HTML, keeping bare text as-is
        # and inlining the contents of nested tags.
        unicode_string = ''
        for child in t.children:
            if type(child) is NavigableString:
                unicode_string += str(child)
            else:
                unicode_string += child.decode_contents()

        return unicode_string.strip()

    def remove_internal_links_with_text(t):
        # Drop internal back-links inside the footnote body (e.g. the
        # 'return to text' anchor pointing back at the noteref).
        for tag_a in t.find_all('a', {'href': re.compile('(^.+\.(html|xhtml)#.+)|(^#.+)')}):
            tag_a.decompose()

    for i, noteref_tag in enumerate(noterefs_tags):
        href = noteref_tag.attrs['href']
        file, element_id = parse_a_tag_href(href)
        if not file:
            # Same-file reference.
            target_html_tag = source_html_tag
        else:
            target_html_tag = href2soup_html[file]

        # Any epub:type containing one of these words counts as a footnote.
        possible_footnote = 'note|footnote|endnote|rearenote'
        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
                                                               attrs={'epub:type': re.compile(possible_footnote)}))

        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
        footnote_tag = expected_footnote_tags[0]
        replace_with_livecarta_anchor_tag(noteref_tag, i)
        remove_internal_links_with_text(footnote_tag)
        content = get_footnote_tags2str(footnote_tag)

        # The footnote body now lives in `footnotes`; remove it from the soup.
        footnote_tag.decompose()
        footnotes.append(content)

    return footnotes
def add_fonts():
    """Placeholder for font injection; intentionally a no-op for now."""
    return None
def unwrap_structural_tags(body_tag):
    """Flatten structural containers (div/section/article/main/body/html/span)
    so the fragment becomes a flat sequence of content tags; leaf divs and
    stray top-level text become <p> tags, asides become blockquotes.

    :param body_tag: parsed fragment; modified in place and also returned.
    """
    # Tags treated as structural wrappers rather than content.
    structural_tags_names = [
        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
        'figure', 'footer', 'iframe', 'span', 'p'
    ]

    for div in body_tag.find_all("div"):
        if div.contents:
            # A div with no structural children is a leaf: keep it, as a <p>.
            # (NavigableString children have name None, which is not in the
            # list, so bare text counts as non-structural.)
            is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
            if all(is_not_struct_tag):
                div.name = 'p'
                continue
        # Otherwise drop the wrapper and keep its children.
        div.unwrap()

    for s in body_tag.find_all("section"):
        s.unwrap()

    for s in body_tag.find_all("article"):
        s.unwrap()

    # Asides are kept as content, rendered as blockquotes.
    for s in body_tag.find_all("aside"):
        s.name = 'blockquote'

    for s in body_tag.find_all("main"):
        s.unwrap()

    for s in body_tag.find_all("body"):
        s.unwrap()

    for s in body_tag.find_all("html"):
        s.unwrap()

    # not all cases, if span has <p>s and NavigableString, it won't unwrap
    for s in body_tag.find_all("span"):
        if s.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
            if all(is_not_struct_tag):
                # Leaf span: leave it inline.
                continue
        s.unwrap()

    # Demote unsupported heading levels to <p>.
    _preprocessing_headings(body_tag)

    # Wrap remaining top-level bare text nodes in <p> tags.
    # NOTE(review): new_tag is a BeautifulSoup (soup) method — this assumes
    # body_tag is a soup object rather than a plain Tag; confirm with callers.
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
            content = re.sub(r'([\n\t\xa0])', ' ', content)
            content = content.strip()
            # Only wrap text that is non-empty after whitespace normalization.
            if content:
                tag = body_tag.new_tag('p')
                tag.append(str(node))
                node.replace_with(tag)

    return body_tag
def get_tags_between_ids(first_id, href, html_soup):
    """Extract all siblings between the 'internal-mark' element with id
    ``first_id`` and the next internal-mark <h1> (or the end of the document).

    :param first_id: id of the marker element that starts the range.
    :param href: original href; used only in the error message.
    :param html_soup: soup to search; matched tags are removed from it.
    :return: list of extracted tags, detached from the soup.
    :raises AssertionError: if no marker with first_id is found.
    """
    h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'})
    if h_marked:
        p = h_marked.next_sibling
        tags = []
        # Walk forward until the next section marker.
        # NOTE(review): bs4 normally exposes `class` as a list, so the string
        # comparison below relies on how the marker was created — verify.
        while p:
            if p.name == 'h1' and p.attrs.get('class') == 'internal-mark':
                break
            tags.append(p)
            p = p.next_sibling

        # Detach only after the walk, so sibling links stay valid while iterating.
        tags = [tag.extract() for tag in tags]
        # Merge adjacent text nodes left behind by the extraction.
        html_soup.smooth()

    else:
        assert 0, f'Warning: no match for {first_id, href}'

    return tags
def prepare_title_and_content(title, content_tag: BeautifulSoup):
    """Normalize a chapter title and its content markup for export.

    Cleans whitespace, removes title-duplicating headings, flattens lists,
    demotes unsupported headings, normalizes tables and strips all class
    attributes.

    :param title: raw title markup/text.
    :param content_tag: parsed chapter content; modified in place.
    :return: (cleaned title string, content serialized back to an HTML string).
    """
    # Collapse \n / \t / nbsp into single spaces in the title.
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()

    # 0. drop top-level text nodes that are pure whitespace
    to_remove = [
        child for child in content_tag.contents
        if isinstance(child, NavigableString)
        and re.sub(r'([\n\t\xa0])', '', child.string) == ''
    ]
    # Mutation in a plain loop, not a side-effect list comprehension.
    for node in to_remove:
        node.extract()

    # 1. rule#1 for heading removal
    clean_headings_content(content_tag, title_str)
    _process_lists(content_tag)
    _preprocessing_headings(content_tag)
    preprocess_table(content_tag)

    # 2. class removal (find_all yields Tags, which always have .attrs —
    # the old hasattr() check was redundant)
    for tag in content_tag.find_all(recursive=True):
        if tag.attrs.get('class'):
            del tag.attrs['class']

    title_str = clean_title_from_numbering(title_str)
    return title_str, str(content_tag)