import os import pathlib import re from typing import List from bs4 import BeautifulSoup, NavigableString, Tag from access import Access from livecarta_config import LawCartaConfig def save_image_locally(img_file_path, img_content, book_id): folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/')) new_path.mkdir(exist_ok=True) new_img_path = new_path / os.path.basename(img_file_path) f = open(new_img_path, 'wb+') f.write(img_content) f.close() return new_img_path def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): link = access.send_image_by_bytes(img_file_path, img_content, book_id) return link def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None): img_tags = body_tag.find_all('img') for img in img_tags: path_to_img_from_html = img.attrs.get('src') html_folder = os.path.dirname(path_to_html) path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html)) assert path_to_img_from_root in href2img_content, \ f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' img_content = href2img_content[path_to_img_from_root] if access is not None: new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id') else: new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id') img.attrs['src'] = str(new_folder) def preprocess_figure(): pass def preprocess_table(body_tag: BeautifulSoup): tables = body_tag.find_all("table") for table in tables: tds = table.find_all("td") border_sizes = [] for td in tds: style = td.get('style') width = '' if style: border_match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) or\ re.search(r"border-top-width: ?(\d+\.?\d*)(p[tx])", style) or\ re.search(r"border-left-width: ?(\d+\.?\d*)(p[tx])", style) or \ re.search(r"border-right-width: ?(\d+\.?\d*)(p[tx])", style) or \ re.search(r"border-bottom-width: ?(\d+\.?\d*)(p[tx])", style) if border_match: size = border_match.group(1) units = border_match.group(2) border_sizes.append(float(size)) width_match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) if width_match: size = width_match.group(1) units = width_match.group(2) width = size+'px' width = td.get('width') or width td.attrs = {} if width: td.attrs['width'] = width if border_sizes: border_size = sum(border_sizes) / len(border_sizes) print(border_size) table.attrs['border'] = f'{border_size:.2}' def _process_lists(body_tag): """ Function to process tags
  • . Unwrap

    tags. """ li_tags = body_tag.find_all("li") for il_tag in li_tags: if il_tag.p: il_tag.attrs.update(il_tag.p.attrs) il_tag.p.unwrap() def clean_headings_content(content: Tag, title: str): for child in content.contents: if child.text and re.sub(r'([\n\t\xa0])', '', child.text): text = re.sub(r'([\n\t\xa0])', ' ', child.text) text = re.sub(r' +', ' ', text).rstrip() if title == text: child.extract() elif (title in text) and (child.name in ['h1', 'h2', 'h3']): child.extract() break def _preprocessing_headings(body_tag): """ Function to convert all lower level headings to p tags """ pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = body_tag.find_all(re.compile(pattern)) for tag in header_tags: tag.name = 'p' def clean_title_from_numbering(title: str): """ Function to remove digits from headers. """ title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) return title def replace_with_livecarta_anchor_tag(anchor, i): new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' new_tag['data-id'] = i + 1 new_tag['id'] = f'footnote-{i + 1}' new_tag.string = '*' anchor.replace_with(new_tag) def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> List[str]: """ This function should be earlier that adding fonts in pipeline.

    Here is an example footnote1

    """ footnotes = [] noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'}) bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')]) noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags] [tag.decompose() for tag in bad_noterefs_tags] def parse_a_tag_href(s: str): assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' f, id_ = s.split('#') return f, id_ def verify_footnote_tag(tags: list): assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' if len(tags) == 0: anchored_tags = list(target_html_tag.find_all(id=element_id)) if len(anchored_tags): print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') return anchored_tags else: assert 0, f'Error, No element with id: {href} found.' return tags def get_footnote_tags2str(t): unicode_string = '' for child in t.children: if type(child) is NavigableString: unicode_string += str(child) else: unicode_string += child.decode_contents() return unicode_string.strip() def remove_internal_links_with_text(t): for tag_a in t.find_all('a', {'href': re.compile('(^.+\.(html|xhtml)#.+)|(^#.+)')}): tag_a.decompose() for i, noteref_tag in enumerate(noterefs_tags): href = noteref_tag.attrs['href'] file, element_id = parse_a_tag_href(href) if not file: target_html_tag = source_html_tag else: target_html_tag = href2soup_html[file] possible_footnote = 'note|footnote|endnote|rearenote' expected_footnote_tags = list(target_html_tag.find_all(id=element_id, attrs={'epub:type': re.compile(possible_footnote)})) expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) footnote_tag = expected_footnote_tags[0] replace_with_livecarta_anchor_tag(noteref_tag, i) remove_internal_links_with_text(footnote_tag) content = get_footnote_tags2str(footnote_tag) footnote_tag.decompose() footnotes.append(content) return footnotes def add_fonts(): pass def unwrap_structural_tags(body_tag): structural_tags_names = [ 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' ] for div in body_tag.find_all("div"): if div.contents: is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] if all(is_not_struct_tag): div.name = 'p' continue div.unwrap() for s in body_tag.find_all("section"): s.unwrap() for s in body_tag.find_all("article"): s.unwrap() for s in body_tag.find_all("aside"): s.name = 'blockquote' for s in body_tag.find_all("main"): s.unwrap() for s in body_tag.find_all("body"): s.unwrap() for s in body_tag.find_all("html"): s.unwrap() # not all cases, if span has

    s and NavigableString, it won't unwrap for s in body_tag.find_all("span"): if s.contents: is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents] if all(is_not_struct_tag): continue s.unwrap() _preprocessing_headings(body_tag) for node in body_tag: if isinstance(node, NavigableString): content = str(node) content = re.sub(r'([\n\t\xa0])', ' ', content) content = content.strip() if content: tag = body_tag.new_tag('p') tag.append(str(node)) node.replace_with(tag) return body_tag def get_tags_between_ids(first_id, href, html_soup): h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'}) if h_marked: p = h_marked.next_sibling tags = [] while p: if p.name == 'h1' and p.attrs.get('class') == 'internal-mark': break tags.append(p) p = p.next_sibling tags = [tag.extract() for tag in tags] html_soup.smooth() else: assert 0, f'Warning: no match for {first_id, href}' return tags def prepare_title_and_content(title, content_tag: BeautifulSoup): title_str = BeautifulSoup(title, features='lxml').string title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r' +', ' ', title_str).rstrip() # 0. cleaning \n to_remove = [] for child in content_tag.contents: if isinstance(child, NavigableString): s = re.sub(r'([\n\t\xa0])', '', child.string) if s == '': to_remove.append(child) [x.extract() for x in to_remove] # 1. rule#1 for heading removal clean_headings_content(content_tag, title_str) _process_lists(content_tag) _preprocessing_headings(content_tag) preprocess_table(content_tag) # 2. class removal for tag in content_tag.find_all(recursive=True): if hasattr(tag, 'attrs') and tag.attrs.get('class'): del tag.attrs['class'] # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) title_str = clean_title_from_numbering(title_str) return title_str, str(content_tag)