change structure of project

This commit is contained in:
Kiryl
2021-09-30 13:08:09 +03:00
parent 61d85f6c22
commit af1b6138a9
5 changed files with 35 additions and 18 deletions

View File

@@ -0,0 +1,492 @@
import re
from typing import List
import cssutils
from bs4 import BeautifulSoup
from ebooklib import epub
from premailer import transform
from itertools import takewhile
from logging import CRITICAL
from src.livecarta_config import LiveCartaConfig
from src.util.color_reader import str2hex
cssutils.log.setLevel(CRITICAL)
# Relative font sizes in ascending order; convert_font_size() compares em / percent
# values against this scale. Entry N of sizes_pr pairs with entry N of sizes_px.
sizes_pr = [
    -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94,
    1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94,
    2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94,
    3.0, 4.0, 5.0,
]
# Pixel sizes paired with sizes_pr: '10px' twice, then every size up to '50px',
# then the two largest sizes.
sizes_px = ['10px'] + [f'{pixels}px' for pixels in range(10, 51)] + ['64px', '72px']
# list-style-type values that are kept as they are; any other value becomes 'disc'.
list_types = [
    'circle', 'disc', 'armenian', 'decimal', 'decimal-leading-zero', 'georgian',
    'lower-alpha', 'lower-latin', 'lower-roman',
    'upper-alpha', 'upper-latin', 'upper-roman',
    'none',
]
def convert_font_size(value):
    """Convert a CSS ``font-size`` value to the pixel size written to the output.

    * ``pt`` values keep their number and get the ``px`` unit, unless the number
      equals ``LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE`` - then '' is returned.
    * ``%`` and ``em`` values are looked up on the ``sizes_pr`` scale and the paired
      entry of ``sizes_px`` is returned ('100%' and sizes above 5 give '').
    * Any other value, and any value whose number cannot be parsed, gives ''.

    :param value: font-size value as a string, e.g. '12pt', '120%', '1.5em'
    :return: pixel size string such as '18px', or '' when the property should be dropped
    """
    if 'pt' in value:
        try:
            # float() instead of int(): sizes like '10.5pt' are legal css
            points = float(value.replace('pt', ''))
        except ValueError:
            return ''
        if points == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
            return ''
        return value.replace('pt', 'px')

    if value == '100%':
        return ''

    try:
        if '%' in value:
            relative_size = float(value.replace('%', '')) / 100.0
        elif 'em' in value:
            relative_size = float(value.replace('em', ''))
        else:
            return ''
    except ValueError:
        return ''

    if relative_size > 5:
        return ''

    # sizes_pr is ascending, so the number of entries below the value
    # gives the position of the closest smaller size
    smaller_sizes = list(takewhile(lambda size: relative_size > size, sizes_pr))
    if not smaller_sizes:
        # value is not above the lowest entry of the scale (e.g. a negative size)
        return ''
    return sizes_px[len(smaller_sizes) - 1]
def convert_indents(value):
    """Convert an indent given in %, em or pt into whole, non-negative pixels.

    The digits of the matched number are multiplied by 6 for ``%``, by 30 for ``em``
    (only the digits before a decimal point are used) and taken as they are for
    ``pt``. The sign is dropped. A value without one of these units, or without any
    digits to convert, is returned unchanged.

    :param value: css value of text-indent / margin-left, e.g. '2em', '-10%', '23pt'
    :return: the value with the matched size replaced by '<n>px'
    """
    # 30px = 3.2% = 1.25em = 23pt
    found = re.search(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)', value)
    if not found:
        return value

    if found.group(1):
        matched_text, number_text, px_per_unit = found.group(1), found.group(1), 6
    elif found.group(2):
        # group 3 is the part of the em value that precedes the decimal point
        matched_text, number_text, px_per_unit = found.group(2), found.group(3), 30
    else:
        # the pattern has four groups; the pt size is group 4
        matched_text, number_text, px_per_unit = found.group(4), found.group(4), 1

    digits = ''.join(filter(str.isdigit, number_text))
    if not digits:
        # nothing numeric to convert (e.g. '.5em'), leave the value as it is
        return value
    return value.replace(matched_text, str(int(digits) * px_per_unit) + 'px')
"""
LIVECARTA_STYLE_ATTRS = { css property: [allowed values] }
CSS properties that can be converted to the LiveCarta style convention.
An empty list means that any value of the property can be converted.
A non-empty list means that only the listed property-value combinations can be converted.
"""
# every text-align value except the LiveCarta default one
_convertible_align_styles = [
    align for align in LiveCartaConfig.ALIGN_STYLES
    if align != LiveCartaConfig.DEFAULT_ALIGN_STYLE
]
# every font of the correspondence table except the LiveCarta default one
_convertible_font_names = [
    font for font in LiveCartaConfig.font_correspondence_table.keys()
    if font != LiveCartaConfig.DEFAULT_FONT_NAME
]
LIVECARTA_STYLE_ATTRS = {
    'text-indent': [],
    'font-variant': ['small-caps'],
    'text-align': _convertible_align_styles,
    'align': [],
    'font': [],
    'font-family': _convertible_font_names,
    'font-size': [],
    # the following values are later replaced by tags
    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
    'font-style': ['italic'],  # <i>
    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
    'background': [],
    'width': [],
    'border-top-width': [],
    'border-right-width': [],
    'border-left-width': [],
    'border-bottom-width': [],
    'border': [],
    'list-style-type': [],
    'list-style-image': [],
    'margin-left': [],
}
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning: whenever LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING must be updated as well
so that it keeps following the LiveCarta style convention.
"""
def get_bg_color(x):
    """Return the background color converted by ``str2hex``, or '' when it is white."""
    hex_color = str2hex(x)
    if hex_color in ['#ffffff', '#fff', 'white']:
        # white background is not written
        return ''
    return hex_color
def get_text_color(x):
    """Return the text color converted by ``str2hex``, or '' when it is black."""
    hex_color = str2hex(x)
    if hex_color in ['#000000', '#000', 'black']:
        # black text is not written
        return ''
    return hex_color
def _keep_value(value):
    """Mapping for properties whose value is written unchanged."""
    return value


def _blank_if_zero(value):
    """Mapping for border properties: a bare '0' becomes '', other values are kept."""
    return '' if value == '0' else value


def _to_livecarta_font(name):
    """Look the font up in the correspondence table, retrying with the capitalized name."""
    fonts = LiveCartaConfig.font_correspondence_table
    return fonts.get(name) or fonts.get(name.capitalize())


def _to_known_list_type(value):
    """Keep a list-style-type listed in ``list_types``; anything else becomes 'disc'."""
    return value if value in list_types else 'disc'


LIVECARTA_STYLE_ATTRS_MAPPING = {
    'text-indent': convert_indents,
    'font-variant': _keep_value,
    'text-align': _keep_value,
    'font': lambda _value: '',
    'font-family': _to_livecarta_font,
    'font-size': convert_font_size,
    'color': get_text_color,
    'background-color': get_bg_color,
    'background': get_bg_color,
    'border': _blank_if_zero,
    'border-top-width': _blank_if_zero,
    'border-right-width': _blank_if_zero,
    'border-left-width': _blank_if_zero,
    'border-bottom-width': _blank_if_zero,
    'list-style-type': _to_known_list_type,
    'list-style-image': lambda _value: 'disc',
    'margin-left': convert_indents,
}
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600"> foo </p> -> <p><strong>foo</strong></p>
"""
# (property, value) pairs that are expressed with a tag instead of an inline style.
# The insertion order is kept: it is the order in which the pairs are looked up.
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
    **{('font-weight', weight): 'strong' for weight in ('bold', '600', '700', '800', '900')},
    ('font-style', 'italic'): 'i',
    **{
        (decoration_property, decoration): tag_name
        for decoration_property in ('text-decoration', 'text-decoration-line')
        for decoration, tag_name in (('underline', 'u'), ('line-through', 's'))
    },
    ('vertical-align', 'super'): 'sup',
}
def check_style_to_be_tag(style) -> List[tuple]:
    """Collect the (property, value) pairs of ``style`` that are expressed with a tag.

    A pair of LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG is reported when the text
    'property:value' occurs in the style string. The returned pairs are the ones
    to be removed from the style string, in the order of that table.
    """
    return [
        (css_property, css_value)
        for css_property, css_value in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG
        if f'{css_property}:{css_value}' in style
    ]
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
    """Rewrite one property of a css rule according to the LiveCarta tables.

    The property is blanked when its name is not in LIVECARTA_STYLE_ATTRS or when
    the table restricts its values and the value is not among them. Otherwise the
    value (with double quotes stripped) goes through the function registered in
    LIVECARTA_STYLE_ATTRS_MAPPING, if there is one.

    :param css_rule: rule whose ``style`` is updated in place
    :param style_type: property object of that rule (``name`` and ``value`` are read)
    """
    if style_type.name not in LIVECARTA_STYLE_ATTRS:
        # property not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[style_type.name] = ''
        return

    value_without_quotes = style_type.value.replace('\"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS[style_type.name]
    if allowed_values and value_without_quotes not in allowed_values:
        # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file
        css_rule.style[style_type.name] = ''
        return

    if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING:
        # function that converts our data
        convert = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name]
        css_rule.style[style_type.name] = convert(value_without_quotes)
def build_css_content(css_content):
    """Parse css text, convert every property of its style rules and return the css text.

    Only rules of type STYLE_RULE are touched; each of their properties is passed to
    update_css_style_types_to_livecarta_convention().
    """
    sheet = cssutils.parseString(css_content, validate=False)
    style_rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in style_rules:
        for css_property in rule.style:
            update_css_style_types_to_livecarta_convention(rule, css_property)
    return sheet._getCssText().decode()
class TagStyleConverter:
    """Move the style computed for one tag onto the tag of the document being converted.

    ``tag_with_initial_style`` is the tag of the converted document; it is renamed and
    wrapped in place. ``tag_with_ultimate_style`` is the same tag after the rules of the
    css file were inlined into its ``style`` attribute.
    """

    # preprocess_style() stores declarations as 'name: value; ', so the patterns accept
    # optional whitespace; values may contain a hyphen (e.g. lower-alpha).
    _TEXT_ALIGN_RE = re.compile(r'text-align:\s*[\w-]+\s*;?\s*')
    _LIST_STYLE_TYPE_RE = re.compile(r'list-style-type:\s*[\w-]+\s*;?\s*')

    def __init__(self, tag_with_initial_style, tag_with_ultimate_style):
        self.tag_with_initial_style = tag_with_initial_style  # tag with inline style to be updated with style attribute
        self.tag_initial_name = tag_with_initial_style.name
        self.tag_with_ultimate_style = tag_with_ultimate_style  # tag with inline style + style parsed from css file
        self.style = self.preprocess_style()

    @staticmethod
    def remove_white_if_no_bgcolor(style_, tag):
        """Drop a white text color when no background color accompanies it.

        The style is returned unchanged when it mentions a background itself, when it
        has no white text color, or when a span/td/tr/p ancestor has a background.
        Otherwise the white color is removed from the style and appended to every
        descendant that has a background.
        """
        if 'background' in style_:
            return style_

        # if text color is white, check that we have bg-color
        has_white_text = any(white in style_ for white in ('color:#ffffff', 'color:#fff', 'color:white'))
        if not has_white_text:
            return style_

        # if bg color is inherited, just return style as is
        # white bg color not need to be checked as we do not write 'white bg color'
        tags_with_bg = ('span', 'td', 'tr', 'p')
        for parent_tag in tag.parents:
            parent_style = parent_tag.attrs.get('style')
            if parent_style and 'background' in parent_style and parent_tag.name in tags_with_bg:
                return style_

        for child in tag.find_all():
            child_style = child.attrs.get('style')
            if child_style and 'background' in child_style:
                child.attrs['style'] = child_style + '; color:#fff; '

        # for child with bg color we added white text color, so this tag don't need white color
        for white_declaration in ('color:#fff;', 'color:#ffffff;', 'color:white;'):
            style_ = style_.replace(white_declaration, '')
        return style_

    @staticmethod
    def _digits_to_int(text) -> int:
        """Return the digits found in ``text`` as an int, or 0 when there are none."""
        digits = ''.join(filter(str.isdigit, str(text)))
        return int(digits) if digits else 0

    @staticmethod
    def _split_declarations(style: str) -> list:
        """Split a style string into space-free 'name:value' items, skipping empty ones."""
        return [declaration for declaration in style.replace(' ', '').split(';') if declaration]

    @staticmethod
    def process_indents_in_px(split_style: list) -> str:
        """Rebuild a style string from 'name:value' items and merge the indents.

        text-indent and margin-left values go through convert_indents(). The two are
        then combined into one text-indent: ``abs(margin_left - text_indent)``.

        :param split_style: list of 'name:value' strings
        :return: style string made of 'name: value; ' declarations
        """
        clean_style = ''
        for declaration in split_style:
            # split on the first colon only: values such as url(http://...) contain colons
            name, colon, value = declaration.partition(':')
            if not colon:
                # not a 'name:value' declaration, nothing to write
                continue
            if name in ('text-indent', 'margin-left'):
                value = convert_indents(value)
            clean_style += name + ': ' + value + '; '

        has_margin_left = re.search(r'(margin-left:( *-*\w+);*)', clean_style)
        has_text_indent = re.search(r'(text-indent:( *-*\w+);*)', clean_style)
        to_int = TagStyleConverter._digits_to_int

        # formula_of_indent: indent = abs(margin_left - text_indent)
        if has_margin_left:
            num_ml = to_int(has_margin_left.group(2))
            if has_text_indent:
                num_ti = to_int(has_text_indent.group(2))
                clean_style = clean_style.replace(has_text_indent.group(1),
                                                  'text-indent: ' + str(abs(num_ml - num_ti)) + 'px; ')
                return clean_style.replace(has_margin_left.group(1), '')
            return clean_style.replace(has_margin_left.group(1), 'text-indent: ' + str(num_ml) + 'px; ')

        if has_text_indent:
            return clean_style.replace(has_text_indent.group(1),
                                       'text-indent: ' + str(to_int(has_text_indent.group(2))) + 'px; ')
        return clean_style

    def preprocess_style(self):
        """Build the style string to be written: the inlined style plus the leftovers
        of the initial inline style that are not already part of it."""
        ultimate_style = (self.tag_with_ultimate_style.attrs.get('style') or '') + ';'
        ultimate_style = self.remove_white_if_no_bgcolor(ultimate_style, self.tag_with_ultimate_style)
        ultimate_style = ultimate_style.replace('background:', 'background-color:')
        ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')

        # make for repetition check and convert to px
        split_ultimate_style = self._split_declarations(ultimate_style)
        ultimate_style = self.process_indents_in_px(split_ultimate_style)

        initial_style = self.tag_with_initial_style.attrs.get('style')
        if initial_style:
            split_initial_style = self._split_declarations(initial_style)
            # repetition check - if tag had already had inline style, add this to style parsed from css
            for repeated in set(split_ultimate_style) & set(split_initial_style):
                split_initial_style.remove(repeated)
            if split_initial_style:
                # if initial style is not empty - start convert and add to ultimate style
                ultimate_style += self.process_indents_in_px(split_initial_style)
        return ultimate_style

    def change_attrs_with_corresponding_tags(self):
        """Replace the styles listed in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG with tags.

        The initial tag is renamed to the first matching tag, further matches wrap it,
        and a new tag with the initial name, the initial attributes and the remaining
        style wraps the result. Without matches the style is written on the tag itself.

        :return: the initial tag (after a match it is the innermost, renamed tag)
        """
        # adds <b>, <u>, <sup>, etc
        # self.style is written as 'name: value; ', the lookup needs 'name:value'
        compact_style = re.sub(r'\s*([:;])\s*', r'\1', self.style)
        to_remove = check_style_to_be_tag(compact_style)
        new_tags = []
        for i, (attr, value) in enumerate(to_remove):
            declaration = rf'{re.escape(attr)}\s*:\s*{re.escape(value)}\s*;'
            self.style = re.sub(declaration, '', self.style).strip()

            tag_name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
            if i == 0:
                self.tag_with_initial_style.name = tag_name
                new_tags.append(self.tag_with_initial_style)
            else:
                new_tag = BeautifulSoup(features='lxml').new_tag(tag_name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)

        top_tag = self.tag_with_initial_style
        if new_tags:
            tmp_attrs = top_tag.attrs.copy()
            top_tag.attrs = {}
            top_tag2 = BeautifulSoup(features='lxml').new_tag(self.tag_initial_name)
            top_tag2.attrs = tmp_attrs
            if self.style:
                top_tag2.attrs['style'] = self.style
            new_tags[-1].wrap(top_tag2)
        else:
            top_tag.attrs['style'] = self.style
        return top_tag

    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
        """Turn a styled <p> into <p><span>: text-align / text-indent stay on the new
        <p>, the other declarations stay on the span (list-style-type is dropped)."""
        if tag.name != 'p' or not tag.attrs.get('style'):
            return
        styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                               if attr not in ['text-align', 'text-indent']]
        # NOTE(review): this is a substring test, so 'align' is also found inside 'text-align'
        if not any(attr in tag.attrs['style'] for attr in styles_cant_be_in_p):
            return

        tag.name = 'span'
        p_tag = BeautifulSoup(features='lxml').new_tag('p')
        span_style = tag.attrs['style']
        p_style = ''
        possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)')
        while True:
            has_p_style_attrs = possible_p_attrs_regexp.search(span_style)
            if not has_p_style_attrs:
                break
            # the whole match is either the text-align or the text-indent declaration
            p_style += has_p_style_attrs.group(0)
            span_style = span_style.replace(has_p_style_attrs.group(0), '')

        p_tag.attrs['style'] = p_style
        tag.attrs['style'] = TagStyleConverter._LIST_STYLE_TYPE_RE.sub('', span_style)
        tag.wrap(p_tag)

    @staticmethod
    def add_span_to_save_style_attrs_in_li(t):
        """Turn a styled <li> into <li><span>: text-align / list-style-type move to the
        new <li>, the other declarations stay on the span."""
        if t.name != 'li' or not t.attrs.get('style'):
            return
        styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS
                                if attr not in ['text-align', 'list-style-type']]
        if not any(attr in t.attrs['style'] for attr in styles_cant_be_in_li):
            return

        t.name = 'span'
        li_tag = BeautifulSoup(features='lxml').new_tag('li')
        old_style = t.attrs['style']
        new_style = ''
        for possible_li_attrs_regexp in (TagStyleConverter._TEXT_ALIGN_RE,
                                         TagStyleConverter._LIST_STYLE_TYPE_RE):
            has_li_style_attrs = possible_li_attrs_regexp.search(old_style)
            if has_li_style_attrs:
                new_style += has_li_style_attrs.group(0)
                old_style = old_style.replace(has_li_style_attrs.group(0), '')

        li_tag.attrs['style'] = new_style
        t.attrs['style'] = old_style
        t.wrap(li_tag)

    @staticmethod
    def add_span_to_save_style_attrs_in_ul_ol(t):
        """Turn a styled <ul>/<ol> into a list tag wrapping a span: list-style-type
        moves to the new list tag, the other declarations stay on the span."""
        if t.name not in ['ul', 'ol'] or not t.attrs.get('style'):
            return
        styles_cant_be_in_list = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
        if not any(attr in t.attrs['style'] for attr in styles_cant_be_in_list):
            return

        # keep the kind of list: an <ol> must not become a <ul>
        list_tag = BeautifulSoup(features='lxml').new_tag(t.name)
        t.name = 'span'
        old_style = t.attrs['style']
        # stays empty when the style has no list-style-type declaration
        new_style = ''
        has_list_style_attrs = TagStyleConverter._LIST_STYLE_TYPE_RE.search(old_style)
        if has_list_style_attrs:
            new_style = has_list_style_attrs.group(0)
            old_style = old_style.replace(new_style, '')

        list_tag.attrs['style'] = new_style
        t.attrs['style'] = old_style
        t.wrap(list_tag)

    @staticmethod
    def add_span_to_save_style_attrs(t):
        """Turn a styled heading (h1-h9) into <hN><span>; the span keeps the style
        without list-style-type."""
        if not (re.search('(^h[1-9]$)', t.name) and t.attrs.get('style')):
            return
        new_tag = BeautifulSoup(features='lxml').new_tag(t.name)
        t.name = 'span'
        t.wrap(new_tag)
        t.attrs['style'] = TagStyleConverter._LIST_STYLE_TYPE_RE.sub('', t.attrs['style'])

    def convert_initial_tag(self):
        """Run every conversion step on the initial tag and return it."""
        self.tag_with_initial_style = self.change_attrs_with_corresponding_tags()
        self.wrap_span_in_p_to_save_style_attrs(self.tag_with_initial_style)
        self.add_span_to_save_style_attrs_in_li(self.tag_with_initial_style)
        self.add_span_to_save_style_attrs_in_ul_ol(self.tag_with_initial_style)
        self.add_span_to_save_style_attrs(self.tag_with_initial_style)
        return self.tag_with_initial_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
    """Write the styles of ``css_text`` onto the tags of ``html_soup``.

    The css is inlined into a copy of the document with premailer's ``transform``.
    Every p/span/li/ul/ol/td/th/h1-h9 tag is paired with its inlined copy through a
    temporary ``livecarta_id`` attribute; tags whose copy ends up with a style are
    converted by TagStyleConverter.

    :param html_soup: document to convert, changed in place
    :param css_text: css rules applying to the document
    :return: the same ``html_soup`` object
    """
    css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
    could_have_style_in_livecarta_regexp = re.compile(
        '(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + '(^h[1-9]$)')
    tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp)
    for i, tag in enumerate(tags_with_possible_style_attr):
        tag.attrs['livecarta_id'] = i

    # here we add css styles to inline style
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')

    # one pass over the inlined document instead of a whole-tree search per tag;
    # the ids are text after the document was parsed again
    inline_tags_by_id = {}
    for inline_tag in inline_soup.find_all(attrs={'livecarta_id': True}):
        inline_tags_by_id.setdefault(inline_tag.attrs['livecarta_id'], inline_tag)

    for i, tag_with_initial_style in enumerate(tags_with_possible_style_attr):
        del tag_with_initial_style.attrs['livecarta_id']
        tag_with_ultimate_style = inline_tags_by_id.get(str(i))
        if tag_with_ultimate_style is None:
            # the tag is missing from the inlined document, there is no style to apply
            continue
        if tag_with_ultimate_style.attrs.get('style'):
            style_converter = TagStyleConverter(tag_with_initial_style, tag_with_ultimate_style)
            style_converter.convert_initial_tag()
    return html_soup
if __name__ == '__main__':
    # manual run: convert one chapter of a local epub and print the result
    book = epub.read_epub('../../epub/9781627222174.epub')
    raw_css = book.get_item_with_href('css/epub.css').get_content().decode()
    cleaned_css = build_css_content(raw_css)
    chapter_html = book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
    chapter_soup = BeautifulSoup(chapter_html, features='lxml')
    print(convert_html_soup_with_css_style(chapter_soup, cleaned_css))

View File

@@ -0,0 +1,465 @@
import codecs
import json
import logging
import os
import re
from os.path import dirname, normpath, join
from collections import defaultdict
from typing import Dict, Union, List
from itertools import chain
import ebooklib
from bs4 import BeautifulSoup, Tag
from ebooklib import epub
from ebooklib.epub import Link, Section
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
update_src_links_in_images, preprocess_footnotes
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger
class EpubConverter:
def __init__(self, file, access=None, logger=None):
    """Read the epub and run the preprocessing steps in order: images, html
    documents, css, footnotes, table of contents, internal links, chapters.

    :param file: epub to read, passed to ``epub.read_epub``
    :param access: passed on when image links are updated
    :param logger: object with a ``log(message)`` method; may be None
    """
    self.file = file
    self.access = access
    self.logger: BookLogger = logger
    # logger defaults to None, progress messages are then dropped instead of failing
    log = logger.log if logger else (lambda message: None)
    self.ebooklib_book = epub.read_epub(file)

    self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}  # main container for all epub .xhtml files
    self.href2subchapter_ids = defaultdict(list)  # enumerate all subchapter id for each file
    self.hrefs_added_to_toc = set()  # enumerate all file paths that where added to TOC

    # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
    # key = -1 for top level NavPoints
    self.adjacency_list: Dict[Union[NavPoint, int], Union[list, None]] = {}

    # container for all chapters soup objects
    # here soup object is only part of the .xhtml file
    self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}

    self.internal_anchors = set()
    self.id_anchor_exist_in_nav_points = False  # flag to be updated while ebooklib.toc is parsed
    self.href2img_bytes = {}  # file path to bytes
    self.old_image_path2aws_path = {}  # file path from <a> to generated aws path
    self.footnotes_contents: List[str] = []  # to be sent on server as is
    self.noterefs: List[Tag] = []  # start of the footnote
    self.footnotes: List[Tag] = []  # end of the footnote

    log('Image processing.')
    for image in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
        self.href2img_bytes[image.file_name] = image.content

    log('HTML files reading.')
    self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()

    log('CSS files processing.')
    self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
    log('CSS styles adding.')
    self.add_css_styles_to_html_soup()

    log('Footnotes processing.')
    for href in self.html_href2html_body_soup:
        content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
                                                                 self.html_href2html_body_soup)
        self.footnotes_contents.extend(content)
        self.noterefs.extend(noterefs)
        self.footnotes.extend(footnotes_tags)

    # number the footnotes over the whole book and link each end to its start
    for number, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes), start=1):
        noteref.attrs['data-id'] = number
        noteref.attrs['id'] = f'footnote-{number}'
        footnote.attrs['href'] = f'#footnote-{number}'

    log(f'Added {len(self.footnotes_contents)} footnotes.')

    log('TOC processing.')
    self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
    # build simple toc from spine if needed
    if self.is_toc_empty():
        self.build_adjacency_list_from_spine()
    not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
    log(f'Html documents not added to TOC: {not_added}.')
    self.add_not_added_files_to_adjacency_list(not_added)

    log('Html internal links and structure processing.')
    self.label_chapters_ids_with_tmp_id()
    self.process_html_soup_structure_to_line()  # used only after parsed toc, ids from toc needed
    self.process_internal_links()

    log('Building chapters content.')
    self.define_chapters_content()
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
    """Map the file name of every document item of the book to the soup of its body."""
    # using EpubElements
    # for now just for HTML objects, as it is simplest chapter
    documents = self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    return {
        # html.parser closes tags if needed
        document.file_name: BeautifulSoup(document.get_body_content(), features='html.parser')
        for document in documents
    }
def get_css_content(self, css_href, html_href):
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.'
css_content: str = css_obj.get_content().decode()
return css_content
def build_html_and_css_relations(self):
    """Collect the css used by every html document of the book.

    Two dictionaries are returned (...2... = key2value):
    html_href2css_href - href of an html file to the list of css keys used by it,
    css_href2css_content - css key to the converted css text.
    For a linked css file the key is the href written in the <link> tag. For a
    <style> tag the key is built from the html href and the position of the tag,
    so that inline styles of different documents do not replace each other.
    """
    html_href2css_href: defaultdict = defaultdict(list)  # dictionary: href of html to related css files
    css_href2css_content: dict = {}

    for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        html_href = item.file_name
        soup_html_content = BeautifulSoup(item.content, features='lxml')

        # check if file links to css file
        for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
            if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
                continue
            css_href = tag.attrs.get('href')
            if not css_href:
                # a <link> without href points to no file
                continue
            html_href2css_href[html_href].append(css_href)
            if css_href not in css_href2css_content:
                # css_href not in css_href2css_content, add to this dict
                css_href2css_content[css_href] = build_css_content(
                    self.get_css_content(css_href, html_href))

        for i, tag in enumerate(soup_html_content.find_all('style')):
            style_key = f'{html_href}#style{i}'
            html_href2css_href[html_href].append(style_key)
            # tag.string is None for a <style> tag without a single text child
            css_href2css_content[style_key] = build_css_content(tag.string or '')

    return html_href2css_href, css_href2css_content
def add_css_styles_to_html_soup(self):
'''
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
'''
for href in self.html_href2html_body_soup:
if self.html_href2css_href.get(href):
css =''
for key in self.html_href2css_href[href]:
css += self.css_href2css_content[key]
content: BeautifulSoup = self.html_href2html_body_soup[href]
content = convert_html_soup_with_css_style(content, css)
self.html_href2html_body_soup[href] = content
def build_manifest_id2html_href(self):
    """Map the id of every document item of the book to its file name."""
    documents = self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    return {document.id: document.file_name for document in documents}
def build_adjacency_list_from_toc(self, element, lvl=0):
    """
    self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc

    key = -1 if root, value = None if leaf

    :param element: [Link, tuple, list] - element that appears in TOC( usually parsed from nav.ncx)
    :param lvl: level of depth
    :return: the NavPoint built for a Link / tuple element, None for the top level list
    :raises AssertionError: when the element (or the first item of a tuple) has an unexpected type
    """
    if isinstance(element, Link):
        # todo: check if link exists
        nav_point, children = NavPoint(element), None
    elif isinstance(element, tuple):
        first, children = element
        if not isinstance(first, Section):
            # raised explicitly: an assert statement is skipped when python runs with -O
            raise AssertionError(f'Error. First item of TOC tuple is not Section instance: {type(first)}')
        nav_point = NavPoint(first)
    elif isinstance(element, list) and (lvl == 0):
        self.adjacency_list[-1] = [self.build_adjacency_list_from_toc(i, lvl + 1) for i in element]
        return None
    else:
        raise AssertionError(f'Error. Element is not tuple/Link instance: {type(element)}')

    if nav_point.id:
        self.id_anchor_exist_in_nav_points = True
        self.href2subchapter_ids[nav_point.href].append(nav_point.id)

    # a Link is a leaf (None); a tuple carries the list of its sub nodes
    if children is None:
        sub_nodes = None
    else:
        sub_nodes = [self.build_adjacency_list_from_toc(i, lvl + 1) for i in children]
    self.adjacency_list[nav_point] = sub_nodes
    self.hrefs_added_to_toc.add(nav_point.href)
    return nav_point
def is_toc_empty(self):
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True
return False
def build_adjacency_list_from_spine(self):
    """Rebuild the adjacency list as a flat TOC: one top level NavPoint per spine
    entry, with the file name of the entry used as both title and href."""
    manifest_id2href = self.build_manifest_id2html_href()
    top_level_nav_points = []
    self.adjacency_list = {-1: top_level_nav_points}
    for item_id, _ in self.ebooklib_book.spine:
        href = manifest_id2href[item_id]
        nav_point = NavPoint(Section(href, href))
        top_level_nav_points.append(nav_point)
        self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
    """Append a top level NavPoint for every file of ``not_added`` and register
    the file as added to the TOC."""
    top_level_nav_points = self.adjacency_list[-1]
    for i, file in enumerate(not_added):
        section = Section(f'To check #{i}, filename: {file}', file)
        top_level_nav_points.append(NavPoint(section))
        self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_tmp_id(self):
    """Insert a <tmp class="converter-chapter-mark"> tag carrying the chapter id
    before every tag whose id is a subchapter id of its file."""
    for href, soup in self.html_href2html_body_soup.items():
        for chapter_id in self.href2subchapter_ids[href]:
            chapter_start = soup.find(id=chapter_id)
            chapter_mark = soup.new_tag('tmp')
            chapter_mark.attrs['class'] = 'converter-chapter-mark'
            chapter_mark.attrs['id'] = chapter_id
            chapter_start.insert_before(chapter_mark)
def process_html_soup_structure_to_line(self):
    """Replace every soup of html_href2html_body_soup with the result of
    unwrap_structural_tags() for it."""
    # go to line structure
    for href, soup in self.html_href2html_body_soup.items():
        self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
@staticmethod
def _create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
@staticmethod
def _create_new_anchor_span(soup, id_):
    """Create a <span class="link-anchor"> with the given id and a no-break space as text."""
    anchor = soup.new_tag("span")
    anchor.attrs.update({'id': id_, 'class': 'link-anchor'})
    anchor.string = "\xa0"
    return anchor
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
"""
TOC: a/b/c.xhtml
b/c.xhtml -> a/b/c.xhtml
c.xhtml -> a/b/c.xhtml
Used to find full path to file that is parsed from tag link
:param cur_file_path: path to current file with tag link
:param href_in_link: filename got from tag link, like file1.xhtml
:param internal_link_tag: tag object that is parsed now
:return:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.')
internal_link_tag.attrs['converter-mark'] = 'bad-link'
return None
if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.')
return full_path[0]
def process_internal_links(self):
    """Replace links between the documents of the book with placeholders.

    1. Every id of the documents added to the TOC is rewritten with
       _create_unique_id(); tags whose class is 'converter-chapter-mark' or
       'footnote-element' keep their id.
    2. a) A link to a whole file gets a placeholder; a link-anchor span is inserted
          at the beginning of the target file.
       b) A link to an element of a file gets a placeholder; a link-anchor span is
          inserted before the target element, which loses its id.
    In both cases the href of a processed link is removed. Links whose target is not
    found are marked with converter-mark="bad-link". Links starting with http(s)://
    are not processed.
    """
    # the logger may be None, messages are dropped then
    log = self.logger.log if self.logger else (lambda message: None)

    # 1. rebuild ids to be unique in all documents
    for toc_href in self.hrefs_added_to_toc:
        for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
            if tag.attrs.get('class') in ('converter-chapter-mark', 'footnote-element'):
                continue
            tag.attrs['id'] = self._create_unique_id(toc_href, tag.attrs['id'])

    # 2.a) process anchor which is a whole xhtml file
    internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
    for toc_href in self.hrefs_added_to_toc:
        soup = self.html_href2html_body_soup[toc_href]
        for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
            a_tag_href = internal_link_tag.attrs['href']
            # find full path
            a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
            if not a_tag_href_matched_to_toc:
                continue
            new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
            internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
            if new_id not in self.internal_anchors:
                anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                new_anchor_span = self._create_new_anchor_span(soup, new_id)
                anchor_soup.insert(0, new_anchor_span)  # insert a new span to the begin of the file
                self.internal_anchors.add(new_id)

            del internal_link_tag.attrs['href']

    # 2.b) process anchor which is an element in xhtml file
    # external links are excluded here the same way as in 2.a)
    internal_link_reg2 = re.compile(r'(^(?!https?://).+\.(html|xhtml)\#.+)|(^\#.+)')
    for toc_href in self.hrefs_added_to_toc:
        soup = self.html_href2html_body_soup[toc_href]
        for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
            # split on the first '#' only, the rest belongs to the id
            a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#', 1)
            # find full path
            if a_tag_href:
                a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
                                                                              internal_link_tag)
            else:
                a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
            if not a_tag_href_matched_to_toc:
                continue

            new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
            anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
            anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
            anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id})  # if link is a footnote

            if not anchor_tags:
                internal_link_tag.attrs['converter-mark'] = 'bad-link'
                log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
                    f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
                    f' Old id={a_tag_id}')
                continue

            if len(anchor_tags) > 1:
                log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
                    f'{anchor_tags}\n'
                    f' While processing {internal_link_tag}')

            anchor_tag = anchor_tags[0]
            assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
            # if anchor is found we could add placeholder for link creation on server side.
            internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
            # create span to have cyclic links, link has 1 type of class, anchor another
            if anchor_tag.attrs['id'] not in self.internal_anchors:
                new_anchor_span = self._create_new_anchor_span(soup, new_id)
                anchor_tag.insert_before(new_anchor_span)
                self.internal_anchors.add(new_id)
                del anchor_tag.attrs['id']
            del internal_link_tag.attrs['href']
def build_one_chapter(self, nav_point):
"""
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases:
id wraps all chapter content,
id wraps chapter's content + subchapters' content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
and id of the next chapter/subchapter
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser')
for tag in chapter_tags:
new_tree.append(tag)
self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
self.build_one_chapter(sub_node)
def define_chapters_content(self):
    """
    Build the chapter content for every top-level nav point.

    Top-level nav points are the ones stored under key -1 of self.adjacency_list.
    Nothing is built unless self.id_anchor_exist_in_nav_points is truthy.
    """
    root_points = self.adjacency_list[-1]
    if not self.id_anchor_exist_in_nav_points:
        return
    for root_point in root_points:
        self.build_one_chapter(root_point)
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
    """
    Convert one nav point, together with everything below it, into a ChapterItem.

    :param nav_point: TOC entry to convert
    :param lvl: depth of the entry; the recursion passes lvl + 1 to the children
    :return: ChapterItem built from the prepared title, the prepared content
             and the list of converted sub-chapters
    """
    title = nav_point.title
    # an entry with an id has its own pre-cut content, otherwise the whole file body is used
    if nav_point.id:
        chapter_soup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
    else:
        chapter_soup = self.html_href2html_body_soup[nav_point.href]

    # image links are rewritten in place; the returned path cache is kept for the next chapter
    self.old_image_path2aws_path = update_src_links_in_images(
        chapter_soup,
        self.href2img_bytes,
        path_to_html=nav_point.href,
        access=self.access,
        path2aws_path=self.old_image_path2aws_path)

    # the title is removed from the content only for levels within SUPPORTED_LEVELS
    within_supported_levels = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
    prepared_title, prepared_content = prepare_title_and_content(
        title, chapter_soup, remove_title_from_chapter=within_supported_levels)

    # note from the original author: items that are not EpubHtmlItems won't be added to the chapter
    sub_chapters = [self.node2livecarta_chapter_item(child_point, lvl + 1)
                    for child_point in self.adjacency_list.get(nav_point) or ()]

    if self.logger:
        indent = ' ' * lvl
        self.logger.log(f'{indent}Chapter: {title} is prepared.')
    return ChapterItem(prepared_title, prepared_content, sub_chapters)
def convert_to_dict(self):
    """
    Convert the whole book into a plain dictionary.

    Every top-level nav point (key -1 of self.adjacency_list) is first turned into
    a chapter item; afterwards each item is serialised with its ``to_dict()``.

    :return: dict with the chapters under "content" and self.footnotes_contents under "footnotes"
    """
    chapter_items = [self.node2livecarta_chapter_item(root_point)
                     for root_point in self.adjacency_list[-1]]
    chapters_as_dicts = [item.to_dict() for item in chapter_items]

    self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
    self.logger.log('End conversion.')

    return {
        "content": chapters_as_dicts,
        "footnotes": self.footnotes_contents
    }
if __name__ == "__main__":
    # Manual run: convert one hard-coded epub and dump the result to tmp.json.
    # Log records go both to a stream handler and to ../epub.log (truncated on start).
    epub_logger = logging.getLogger('epub')
    epub_logger.addHandler(logging.StreamHandler())
    epub_logger.addHandler(logging.FileHandler('../epub.log', mode='w+'))

    book_logger = BookLogger(name='epub', main_logger=epub_logger, book_id=0)
    converter = EpubConverter('../../epub/9781634252221.epub',
                              logger=book_logger)
    book_dict = converter.convert_to_dict()

    # ensure_ascii=False keeps non-ASCII characters readable in the utf-8 output file
    with codecs.open('tmp.json', 'w', encoding='utf-8') as out_file:
        json.dump(book_dict, out_file, ensure_ascii=False)

View File

@@ -0,0 +1,16 @@
from src.epub_converter.epub_converter import EpubConverter
from src.book_solver import BookSolver
class EpubBook(BookSolver):
    """BookSolver specialisation that converts epub files with EpubConverter."""

    def __init__(self, book_id=0, access=None, main_logger=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """Forward all arguments to BookSolver and mark the book type as 'epub'."""
        super().__init__(book_id, access, main_logger, logging_format)
        self.book_type = 'epub'

    def get_converted_book(self):
        """
        Convert the file at self.file_path and return the resulting dictionary.

        self.file_path, self.access, self.logger_object and self.status_wrapper are
        read here but not assigned in this class - presumably BookSolver provides them.
        The status wrapper is switched to "generating" once the conversion has finished.
        """
        converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object)
        book_content = converter.convert_to_dict()
        self.status_wrapper.set_generating()
        return book_content

View File

@@ -0,0 +1,580 @@
import os
import pathlib
import re
from typing import Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.access import Access
from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
    """
    Write an image into the local ``img_<book_id>`` folder and return its new path.

    The target folder is ``../json/img_<book_id>/`` relative to the parent of the
    directory that contains this module. It is created on demand, including any
    missing parent folder.

    :param img_file_path: original path of the image; only its base name is kept
    :param img_content: raw bytes of the image
    :param book_id: identifier used in the name of the target folder
    :return: pathlib.Path of the written file
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    target_dir = pathlib.Path(os.path.join(folder_path, f'../json/img_{book_id}/'))
    # parents=True: do not fail when the intermediate 'json' folder does not exist yet
    target_dir.mkdir(parents=True, exist_ok=True)

    new_img_path = target_dir / os.path.basename(img_file_path)
    # the context manager closes the file even if the write raises
    with open(new_img_path, 'wb') as img_file:
        img_file.write(img_content)
    return new_img_path
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """
    Send an image through ``access.send_image`` and return its result.

    :param access: object providing ``send_image``
    :param img_file_path: path of the image, passed through unchanged
    :param img_content: raw bytes of the image
    :param book_id: forwarded to ``send_image`` as ``doc_id``
    :return: whatever ``access.send_image`` returns (used by the caller as the new image link)
    """
    return access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
def update_src_links_in_images(body_tag: Tag,
                               href2img_content: dict,
                               path_to_html,
                               access=None,
                               path2aws_path=None):
    """
    Store every image referenced from ``body_tag`` and point its ``src`` to the stored copy.

    With ``access`` the image is sent with save_image_to_aws (once per image path,
    later occurrences reuse the link cached in ``path2aws_path``); without it the image
    is written with save_image_locally. The ``width``, ``height`` and ``style``
    attributes of each processed <img> are dropped when they are non-empty.

    :param body_tag: tag whose <img> descendants are updated in place
    :param href2img_content: mapping from image path (relative to the book root) to image bytes
    :param path_to_html: path of the html file, used to resolve relative ``src`` values
    :param access: optional object used for uploading; None means "save locally"
    :param path2aws_path: cache {image path: uploaded link}; a new dict is used when None
    :return: the (possibly extended) ``path2aws_path`` cache
    :raises AssertionError: if an image is referenced but missing from ``href2img_content``
    """
    if path2aws_path is None:
        # the original default of None broke the membership test below
        path2aws_path = {}

    html_folder = os.path.dirname(path_to_html)  # same for every image of the file
    for img in body_tag.find_all('img'):
        path_to_img_from_html = img.attrs.get('src')
        if not path_to_img_from_html:
            # an <img> without src has nothing to resolve or store
            continue

        path_to_img_from_root = os.path.normpath(
            os.path.join(html_folder, path_to_img_from_html)).replace('\\', '/')
        if path_to_img_from_root not in href2img_content:
            # explicit raise keeps the check alive when asserts are disabled (-O)
            raise AssertionError(
                f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.')
        img_content = href2img_content[path_to_img_from_root]

        # NOTE(review): the literal string 'book_id' is passed as the book id in both
        # branches, not an actual identifier - confirm this is intended.
        if access is None:
            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
        elif path_to_img_from_root in path2aws_path:
            new_folder = path2aws_path[path_to_img_from_root]
        else:
            new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
            path2aws_path[path_to_img_from_root] = new_folder

        img.attrs['src'] = str(new_folder)
        for attr_name in ('width', 'height', 'style'):
            if img.attrs.get(attr_name):
                del img.attrs[attr_name]
    return path2aws_path
def preprocess_table(body_tag: BeautifulSoup):
    """
    Normalise the tables inside ``body_tag`` in place.

    For every <td>, <th> and <tr> of every table:
      * a ``width`` attribute is set - the existing one if present, otherwise the
        number found in a ``width: <n>px`` / ``width: <n>pt`` style declaration
        followed by ``px`` (the number is not converted), otherwise an empty string;
      * ``border:0;`` is removed from the style, and an emptied style is deleted.
    A table without a border, or with border ``0`` / ``0px``, gets ``border="1"``.

    :param body_tag: soup object to process
    """
    # (?<!-) skips max-width, border-width etc. but, unlike the former [^-],
    # still matches a declaration placed at the very start of the style
    width_in_style = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(?:p[tx])")

    for table in body_tag.find_all("table"):
        # a list of names matches exactly; a regex such as "td|th|tr" is searched
        # inside the tag name and would also pick up e.g. <strong> or <thead>
        for cell in table.find_all(["td", "th", "tr"]):
            style = cell.get('style')
            width = ''
            if style:
                width_match = width_in_style.search(style)
                if width_match:
                    width = width_match.group(1) + 'px'
            cell.attrs['width'] = cell.get('width') or width

            if cell.attrs.get('style'):
                cell.attrs['style'] = cell.attrs['style'].replace('border:0;', '')
            if cell.attrs.get('style') == '':
                del cell.attrs['style']

        if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
            table.attrs['border'] = '1'
def process_lists(body_tag):
    """
    Unwrap the first <p> found inside each <li>.

    The attributes of that <p> are copied onto the <li> (overriding equal keys)
    before the <p> tag itself is removed; its children stay in place.
    """
    for list_item in body_tag.find_all("li"):
        paragraph = list_item.p
        if not paragraph:
            continue
        list_item.attrs.update(paragraph.attrs)
        paragraph.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """
    Insert a <span> holding a single non-breaking space right before ``tag``.

    :param main_tag: object whose ``new_tag`` creates the span
    :param tag: element the span is placed in front of
    :param id_: value for the span's ``id``; an empty string is used when falsy
    :param class_: value for the span's ``class``; an empty string is used when falsy
    """
    placeholder = main_tag.new_tag("span")
    for attr_name, attr_value in (('id', id_), ('class', class_)):
        placeholder.attrs[attr_name] = attr_value or ''
    placeholder.string = "\xa0"
    tag.insert_before(placeholder)
def clean_headings_content(content: Tag, title: str):
    """
    Remove the chapter title from the beginning of the chapter content, in place.

    Only the first direct child that contains visible text is examined. It is
    removed when its normalised, lower-cased text equals the title, or when it is an
    <h1>/<h2>/<h3> whose text contains the title. Before removal, ids found on the
    child and on its descendants are preserved in placeholder spans so that links
    pointing at them keep a target.

    :param content: chapter content; must provide ``new_tag`` for the placeholder spans
    :param title: chapter title to look for (compared case-insensitively)
    """
    def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
        if isinstance(tag_to_be_removed, NavigableString):
            # bare text has neither attributes nor descendants, so there is no id to keep
            return
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs.get('id'),
                                              class_=tag_to_be_removed.attrs.get('class'))
        for sub_tag in tag_to_be_removed.find_all():
            if sub_tag.attrs.get('id'):
                # spans go before the tag being removed, not before the nested tag
                insert_span_with_attrs_before_tag(body_tag,
                                                  tag_to_be_removed,
                                                  id_=sub_tag.attrs['id'],
                                                  class_=sub_tag.attrs.get('class'))

    title = title.lower()
    for child in content.contents:
        text = child if isinstance(child, NavigableString) else child.text
        if not (text and re.sub(r'([\n\t\xa0])', '', text)):
            # nothing but newlines/tabs/nbsp - keep looking for the first real text
            continue

        text = re.sub(r'([\n\t\xa0])', ' ', text)
        text = re.sub(r' +', ' ', text).strip().lower()
        is_title = title == text
        is_heading_with_title = (title in text) and (child.name in ['h1', 'h2', 'h3'])
        if is_title or is_heading_with_title:
            _add_span_to_save_ids_for_links(child, content)
            child.extract()
        # only the first child with text is a candidate for the title
        break
def _heading_tag2p_tag(body_tag):
    """
    Rename every heading deeper than LiveCartaConfig.SUPPORTED_LEVELS to <p>.

    The affected tag names are h<SUPPORTED_LEVELS + 1> up to h9.
    """
    first_unsupported_level = LiveCartaConfig.SUPPORTED_LEVELS + 1
    deep_heading_name = re.compile(f'^h[{first_unsupported_level}-9]$')
    for heading in body_tag.find_all(deep_heading_name):
        heading.name = 'p'
def clean_title_from_numbering(title: str):
    """
    Strip leading whitespace and leading numbering from a title.

    The prefixes are removed in this order:
      1. leading whitespace;
      2. numeric numbering such as "1.", "1.2 " or ".3 ";
      3. single-letter numbering such as "A. " or "b.".
    """
    leading_prefixes = (
        r'^(\s+)+',
        r'^(?:\.?\d+\.? ?)+',
        r'^(?:[A-Za-z]\. ?)+',
    )
    for prefix in leading_prefixes:
        title = re.sub(prefix, '', title)
    return title
def replace_with_livecarta_anchor_tag(anchor, i):
    """
    Replace a footnote reference with a ``<sup class="footnote-element">*</sup>`` marker.

    :param anchor: tag to be replaced; a <sup> directly wrapping it is unwrapped first
    :param i: zero-based index; the marker gets ``data-id`` i + 1 and id ``footnote-<i + 1>``
    :return: the newly created <sup> tag
    """
    footnote_number = i + 1
    marker = BeautifulSoup(features='lxml').new_tag('sup')
    marker_attrs = (
        ('class', 'footnote-element'),
        ('data-id', footnote_number),
        ('id', f'footnote-{footnote_number}'),
    )
    for attr_name, attr_value in marker_attrs:
        marker[attr_name] = attr_value
    marker.string = '*'

    wrapper = anchor.parent
    if wrapper.name == 'sup':
        wrapper.unwrap()
    anchor.replace_with(marker)
    return marker
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
        -> Tuple[list, list, list]:
    """
    Collect footnotes and replace their references with livecarta footnote markers.

    This function should run earlier in the pipeline than the step that adds fonts.
    Example of the expected markup:
    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    References without ``href`` are removed from the tree. A reference whose file is
    not present in ``href2soup_html`` is reported with ``print`` and skipped.
    Markers are numbered consecutively (1-based) in the order of the returned lists.

    :param source_html_tag: tag that is searched for footnote references
    :param href2soup_html: mapping {file href: soup} used for references into other files;
                           None is treated as an empty mapping
    :param noteref_attr_name: attribute whose value 'noteref' marks a footnote reference
    :return: three parallel lists - footnote texts, new reference markers and, for each
             footnote, its ``role="doc-backlink"`` descendant or the footnote tag itself
    :raises AssertionError: if an href has no '#', if several typed footnotes share the id,
                            or if no element with the id exists
    """
    possible_footnote = 'note|footnote|endnote|rearenote'

    def parse_a_tag_href(s: str):
        if '#' not in s:
            raise AssertionError(f'Error. Unexpected href: {s} in a tag. Href must contain an id.')
        # split at the first '#' only, so a further '#' inside the id cannot break unpacking
        f, _, id_ = s.partition('#')
        return f, id_

    def find_footnote_tag(target_html_tag, element_id, href, noteref_tag):
        # preferred: element with the id that is also typed as a note
        typed_tags = list(target_html_tag.find_all(id=element_id,
                                                   attrs={'epub:type': re.compile(possible_footnote)}))
        if len(typed_tags) > 1:
            raise AssertionError(f'Error, Multiple id: {href}.\n{typed_tags}')
        if typed_tags:
            return typed_tags[0]
        # fallback: any element with the id
        anchored_tags = list(target_html_tag.find_all(id=element_id))
        if anchored_tags:
            print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
            return anchored_tags[0]
        raise AssertionError(f'Error, No element with id: {href} found.')

    footnotes = []
    new_noterefs_tags = []
    new_footnotes_tags = []

    all_noterefs = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
    noterefs_tags = [tag for tag in all_noterefs if tag.attrs.get('href')]
    # plain loop instead of a set: tags with equal content compare (and hash) equal,
    # so a set would keep - and decompose - only one of several identical bad references
    for tag in all_noterefs:
        if not tag.attrs.get('href'):
            tag.decompose()

    for noteref_tag in noterefs_tags:
        href = noteref_tag.attrs['href']
        file, element_id = parse_a_tag_href(href)
        if not file:
            target_html_tag = source_html_tag
        else:
            target_html_tag = (href2soup_html or {}).get(file)
            if target_html_tag is None:
                print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
                continue

        footnote_tag = find_footnote_tag(target_html_tag, element_id, href, noteref_tag)
        if footnote_tag.parent.attrs.get('role') == 'doc-endnote':
            footnote_tag = footnote_tag.parent

        # number by the amount of footnotes collected so far: skipped references
        # must not leave gaps between the marker ids and the returned lists
        new_noterefs_tags.append(replace_with_livecarta_anchor_tag(noteref_tag, len(footnotes)))
        footnotes.append(footnote_tag.text)
        new_footnotes_tags.append(footnote_tag.find(attrs={'role': 'doc-backlink'}) or footnote_tag)

    return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag):
    """
    Main function that works with the structure of html. Makes changes in place.

    1. Removes all comments and unwraps/renames tags that carry structure only
       (div, section, article, figure, figcaption, aside, main, body, html, header).
       Ids of unwrapped tags are preserved in placeholder spans.
    2. Checks that the marks pointing to the start of a chapter are direct children
       of ``body_tag``. A mark is a tag with 'class': 'converter-chapter-mark', added
       while the TOC was parsed. A mark that is wrapped, like
           <p> <span id='123', class='converter-chapter-mark'> </span> </p>
       is lifted by unwrapping its ancestors.
    3. Converts headings that are not supported by livecarta to <p>.
    4. Wraps top-level text nodes that contain visible text into <p>.

    :param body_tag: soup object; its ``new_tag`` is used to create new tags
    :return: the same ``body_tag`` object
    :raises AssertionError: if a chapter mark is still not a direct child of ``body_tag``
    """
    structural_tags_names = [
        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
        'figure', 'footer', 'iframe', 'span', 'p'
    ]

    def _preserve_class_in_section_tag(tag_) -> bool:
        # to save css style inherited from class, copy class to child <p>
        # this is for Wiley books with boxes
        # returns True, if <section> could be unwrapped
        tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if 'feature' not in tag_class:
            return True
        child_p_tags = tag_.find_all("p")
        if len(child_p_tags) == 1:
            child_p_tag = child_p_tags[0]
            if not child_p_tag.attrs.get('class'):
                child_p_tag.attrs['class'] = tag_class
            return True
        if len(child_p_tags) > 1:
            # several paragraphs: keep the box as one <p> instead of unwrapping it
            tag_.name = 'p'
            return False
        return True

    def _add_table_to_abc_books(tag_, border, bg_color):
        wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)

    def _add_span_to_save_ids_for_links(tag_to_be_removed):
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs['id'],
                                              class_=tag_to_be_removed.attrs.get('class'))

    def _unwrap_all(tag_name):
        # unwrap every tag with this name, keeping its id in a placeholder span
        for found_tag in body_tag.find_all(tag_name):
            _add_span_to_save_ids_for_links(found_tag)
            found_tag.unwrap()

    # comments removal; searching from body_tag itself also catches comments that
    # are its direct children, which a per-descendant search never reaches
    for comment in body_tag.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    for div in body_tag.find_all("div"):
        if div.attrs.get('class'):
            div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
            if div_class in ['C409', 'C409a']:
                _add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9')
            elif div_class in ['C441', 'C816']:
                _add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8')

        if div.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
            if all(is_not_struct_tag):
                # a div holding only inline content behaves like a paragraph
                div.name = 'p'
                continue

        _add_span_to_save_ids_for_links(div)
        div.unwrap()

    for section in body_tag.find_all("section"):
        could_be_unwrapped = True
        if section.attrs.get('class'):
            could_be_unwrapped = _preserve_class_in_section_tag(section)
        _add_span_to_save_ids_for_links(section)
        if could_be_unwrapped:
            section.unwrap()

    _unwrap_all("article")

    for figure in body_tag.find_all("figure"):
        figure.name = 'p'
        figure.attrs['style'] = "text-align: center;"  # to center image inside this tag

    _unwrap_all("figcaption")

    for aside in body_tag.find_all("aside"):
        aside.name = 'blockquote'

    _unwrap_all("main")
    _unwrap_all("body")
    _unwrap_all("html")

    for header in body_tag.find_all("header"):
        header.name = 'span'

    # check marks for chapter starting are on the same 1 level and fix them if not;
    # identity is compared on purpose: '==' on tags compares their whole content
    marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
    for mark in marks:
        while mark.parent is not body_tag:
            mark.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases

    parents_marks_are_body = [mark.parent is body_tag for mark in marks]
    assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'

    _heading_tag2p_tag(body_tag)

    # wrap NavigableString with <p>; iterate over a snapshot because nodes are replaced
    for node in list(body_tag.contents):
        if isinstance(node, NavigableString):
            content = str(node)
            content = re.sub(r'([\n\t\xa0])', ' ', content)
            content = content.strip()
            if content:
                tag = body_tag.new_tag('p')
                tag.append(str(node))
                node.replace_with(tag)

    return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup):
    """
    Extract from ``html_soup`` all nodes that belong to the chapter starting at ``first_id``.

    The chapter consists of the siblings following the chapter mark with this id, up to
    (not including) the next chapter mark or the end of the sibling list. The nodes are
    removed from ``html_soup``, which is smoothed afterwards.

    :param first_id: id that points to where the chapter starts; the tag carrying it must
                     have the class 'converter-chapter-mark'
    :param href: name of the current chapter's file, used in the error message only
    :param html_soup: soup object of the current file
    :return: list [Tag, NavigableString]; the chapter's nodes
    :raises AssertionError: if no chapter mark with ``first_id`` exists
    """
    def _is_chapter_mark(node):
        if isinstance(node, NavigableString):
            return False
        node_class = node.attrs.get('class')
        # class is a plain string when set from code and a list when parsed from markup
        if isinstance(node_class, list):
            return 'converter-chapter-mark' in node_class
        return node_class == 'converter-chapter-mark'

    chapter_mark = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if chapter_mark is None:
        # explicit raise: a bare assert would vanish under -O and leave nothing to return
        raise AssertionError(f'Warning: no match for {first_id, href}')

    tags = []
    next_tag = chapter_mark.next_sibling
    # 'is not None' instead of truthiness: an empty text node is falsy but not the end
    while next_tag is not None and not _is_chapter_mark(next_tag):
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def wrap_preformatted_span_with_table(main_tag, old_tag):
    """
    Wrap ``old_tag`` into a one-cell table (table > tbody > tr > td) and return the table.

    The table spans the full width, the cell gets a light grey background.

    :param main_tag: object whose ``new_tag`` creates the table tags
    :param old_tag: tag to be placed inside the cell
    :return: the new <table> tag
    """
    table = main_tag.new_tag("table")
    table.attrs['border'] = '1px #ccc;'
    table.attrs['style'] = 'width:100%;'
    tbody = main_tag.new_tag("tbody")
    tr = main_tag.new_tag("tr")
    td = main_tag.new_tag("td")
    td.attrs['bgcolor'] = '#f5f5f5'

    # build the nesting from the inside out
    innermost = old_tag
    for wrapper in (td, tr, tbody, table):
        innermost.wrap(wrapper)
        innermost = wrapper
    return table
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """
    Wrap ``old_tag`` into a centered one-cell table followed by a <br>.

    :param main_tag: object whose ``new_tag`` creates the table tags
    :param old_tag: tag to be placed inside the cell
    :param width: table width in percent, inserted into the style as ``width:<width>%;``
    :param border: value of the table's ``border`` attribute
    :param bg_color: optional background colour of the cell
    :return: the new <table> tag
    """
    table = main_tag.new_tag("table")
    table.attrs['border'] = border
    table.attrs['align'] = 'center'
    table.attrs['style'] = f'width:{width}%;'
    tbody = main_tag.new_tag("tbody")
    tr = main_tag.new_tag("tr")
    td = main_tag.new_tag("td")
    if bg_color:
        td.attrs['bgcolor'] = bg_color

    # build the nesting from the inside out
    innermost = old_tag
    for wrapper in (td, tr, tbody, table):
        innermost.wrap(wrapper)
        innermost = wrapper

    line_break = BeautifulSoup(features='lxml').new_tag("br")
    table.insert_after(line_break)
    return table
def _clean_wiley_block(block):
    """
    Clean up a box block in place.

    Removes every <p> whose class matches ``.+ hr`` and turns the first heading
    found in the block into a <p> preceded by a <br>.
    """
    for rule_paragraph in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
        rule_paragraph.extract()

    heading = block.find(re.compile("h[1-9]"))
    if not heading:
        return
    heading.name = "p"
    heading.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def preprocess_block_tags(chapter_tag):
    """
    Wrap "feature" boxes into tables, in place.

    * A <blockquote> with class feature1..feature4 is cleaned, wrapped into a table,
      followed by a <br> and then unwrapped.
    * A <p> whose class matches ``feature[1234]`` is cleaned and wrapped into a table.
    The table cell is coloured for feature1 (#DDDDDD) and feature2 (#EEEEEE).

    :param chapter_tag: soup object; also used to create the table tags
    """
    feature_classes = ('feature1', 'feature2', 'feature3', 'feature4')
    bg_colors = {'feature1': '#DDDDDD', 'feature2': '#EEEEEE'}

    def _feature_class(tag_):
        # class is a plain string when set from code, but a list of classes when it
        # was parsed from markup - a list never compares equal to a class name
        tag_class = tag_.attrs.get('class')
        candidates = tag_class if isinstance(tag_class, list) else [tag_class]
        for candidate in candidates:
            if candidate in feature_classes:
                return candidate
        return None

    for block in chapter_tag.find_all("blockquote"):
        block_class = _feature_class(block)
        if block_class is None:
            continue
        _clean_wiley_block(block)
        wrap_block_tag_with_table(chapter_tag, block, bg_color=bg_colors.get(block_class))
        block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
        block.unwrap()

    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
        _clean_wiley_block(future_block)
        color = bg_colors.get(_feature_class(future_block))
        wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
def _prepare_formatted(text):
# replace <,> to save them as is in html code
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
text = text.replace(' ', "\xa0")
text = text.replace('𝑓', "\xf0\x9d\x91\x93")
return text
def preprocess_pre_tags(chapter_tag):
    """
    Replace every <pre> with a monospace <span> wrapped into a one-cell table, in place.

    * Text directly inside <pre> is passed through _prepare_formatted and split into
      lines; a <br> is appended after each line.
    * Child tags are moved into the span after their own text has been prepared;
      a <br> follows each of them when the <pre> contains more than one <span>.
    * The span inherits the attributes of the <pre>, except for ``style``, which is
      replaced. A <p> holding a non-breaking space is inserted after the table.

    :param chapter_tag: soup object; also used to create the table and the trailing <p>
    """
    def _new_br():
        return BeautifulSoup(features='lxml').new_tag('br')

    for pre in chapter_tag.find_all("pre"):
        new_tag = BeautifulSoup(features='lxml').new_tag("span")
        new_tag.attrs = pre.attrs.copy()
        # if in <pre> there are multiple <span>, we need to add <br> after each content
        to_add_br = len(pre.find_all("span")) > 1

        # iterate over a snapshot: child tags are extracted from <pre> below, and
        # removing items from the live child list makes the iteration skip siblings
        for child in list(pre.children):
            if isinstance(child, NavigableString):
                cleaned_text = _prepare_formatted(str(child))
                for line in re.split('\r\n|\n|\r', cleaned_text):
                    new_tag.append(NavigableString(line))
                    new_tag.append(_new_br())
            else:
                for sub_child in list(child.children):
                    if isinstance(sub_child, NavigableString):
                        cleaned_sub_text = _prepare_formatted(str(sub_child))
                        sub_child.replace_with(NavigableString(cleaned_sub_text))
                    else:
                        sub_child.string = _prepare_formatted(sub_child.text)
                new_tag.append(child.extract())
                if to_add_br:
                    new_tag.append(_new_br())

        new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
                                 "font-size: 14px; white-space: nowrap;"
        pre.replace_with(new_tag)
        table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
        p_for_br = chapter_tag.new_tag("p")
        p_for_br.string = "\xa0"
        table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag):
    """
    Emulate the style of <code>, <kbd> and <var> with styled <span> tags, in place.

    Every such tag is renamed to <span>. Unless its parent is a <pre>, it also gets
    an inline monospace style (which replaces any existing ``style`` attribute).

    :param chapter_tag: soup object to process
    """
    # exact names instead of an unanchored regex, which matched any tag name merely
    # containing one of them; 'kbd' is the HTML element, the former spelling 'kdb'
    # is still accepted so that previously handled input keeps working
    code_like_names = ['code', 'kbd', 'kdb', 'var']
    for code in chapter_tag.find_all(code_like_names):
        code.name = 'span'
        if code.parent.name == "pre":
            # inside <pre> the styling is applied to the whole block elsewhere
            continue
        code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    """
    Final processing/cleaning function. ``chapter_tag`` is modified in place.

    Steps:
      0. top-level text nodes consisting only of newlines/tabs are removed;
      1. the title is removed from the content (optional), then lists, tables,
         code tags, <pre> tags and box blocks are preprocessed;
      2. ``class`` attributes are deleted, except the values 'link-anchor'
         and 'footnote-element';
      3. numbering is stripped from the title.

    :param title: title of the chapter; may contain markup, only its text is used
    :param chapter_tag: soup object
    :param remove_title_from_chapter: bool
    :return: tuple[str, str] - the cleaned title and the chapter content as html
    """
    # get_text() instead of .string: .string is None for an empty title and for a
    # title with several child nodes, which made the substitutions below fail
    title_str = BeautifulSoup(title, features='lxml').get_text()
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    # strip both sides: a leading space would break the comparison with heading text
    title_str = re.sub(r' +', ' ', title_str).strip()

    # 0. cleaning \n
    blank_nodes = [child for child in chapter_tag.contents
                   if isinstance(child, NavigableString)
                   and re.sub(r'([\n\t])', '', child.string) == '']
    for blank_node in blank_nodes:
        blank_node.extract()

    # 1. heading removal
    if remove_title_from_chapter:
        clean_headings_content(chapter_tag, title_str)
    process_lists(chapter_tag)
    preprocess_table(chapter_tag)
    preprocess_code_tags(chapter_tag)
    preprocess_pre_tags(chapter_tag)
    preprocess_block_tags(chapter_tag)

    # 2. class removal
    kept_classes = ['link-anchor', 'footnote-element']
    for tag in chapter_tag.find_all(recursive=True):
        tag_class = tag.attrs.get('class')
        if tag_class and tag_class not in kept_classes:
            del tag.attrs['class']

    title_str = clean_title_from_numbering(title_str)
    return title_str, str(chapter_tag)