upgrade inline processor & make repetition check

This commit is contained in:
Kiryl
2021-09-29 18:13:09 +03:00
parent ebb5f0802e
commit 552741fbb5
3 changed files with 202 additions and 183 deletions

View File

@@ -58,8 +58,8 @@ def convert_font_size(value):
def convert_indents(value): def convert_indents(value):
# 30px = 3.2% = 1.25em = 23pt # 30px = 3.2% = 1.25em = 23pt
positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)') text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
has_style_attrs = re.search(positive_text_indent_regexp, value) has_style_attrs = re.search(text_indent_regexp, value)
if has_style_attrs: if has_style_attrs:
if has_style_attrs.group(1): if has_style_attrs.group(1):
value = value.replace(has_style_attrs.group(1), value = value.replace(has_style_attrs.group(1),
@@ -89,8 +89,8 @@ LIVECARTA_STYLE_ATTRS = {
'text-indent': [], 'text-indent': [],
'font-variant': ['small-caps'], 'font-variant': ['small-caps'],
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
'align': [], # ??? 'align': [],
'font': [], # ??? 'font': [],
'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys() 'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
if x != LiveCartaConfig.DEFAULT_FONT_NAME], if x != LiveCartaConfig.DEFAULT_FONT_NAME],
'font-size': [], 'font-size': [],
@@ -182,41 +182,40 @@ def check_style_to_be_tag(style) -> List[tuple]:
to_remove.append(k) to_remove.append(k)
return to_remove return to_remove
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
def update_property_to_livecarta_convention(rule, property_): if style_type.name not in LIVECARTA_STYLE_ATTRS:
if property_.name not in LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file # property not in LIVECARTA_STYLE_ATTRS, remove from css file
rule.style[property_.name] = '' css_rule.style[style_type.name] = ''
return return
cleaned_value = property_.value.replace('\"', '') cleaned_value = style_type.value.replace('\"', '') # value of style
there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(property_.name) there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(style_type.name)
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[property_.name] value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[style_type.name]
if there_are_constraints_on_value and value_not_in_possible_values_list: if there_are_constraints_on_value and value_not_in_possible_values_list:
# property + value not in LIVECARTA_STYLE_ATTRS, remove from css file # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file
rule.style[property_.name] = '' css_rule.style[style_type.name] = ''
else: else:
if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING: if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING:
func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name] func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] # function that converts our data
rule.style[property_.name] = func(cleaned_value) css_rule.style[style_type.name] = func(cleaned_value)
def build_css_content(css_content):
sheet = cssutils.parseString(css_content, validate=False)
def clean_css(css): for css_rule in sheet:
sheet = cssutils.parseString(css, validate=False) if css_rule.type == css_rule.STYLE_RULE:
for rule in sheet: for style_type in css_rule.style:
if rule.type == rule.STYLE_RULE: update_css_style_types_to_livecarta_convention(css_rule, style_type)
for property_ in rule.style:
update_property_to_livecarta_convention(rule, property_)
css_text = sheet._getCssText().decode() css_text = sheet._getCssText().decode()
return css_text return css_text
class TagStyleConverter: class TagStyleConverter:
def __init__(self, tag, tag_with_style): def __init__(self, tag_with_initial_style, tag_with_ultimate_style):
self.tag = tag # tag to be updated with style attribute self.tag_with_initial_style = tag_with_initial_style # tag with inline style to be updated with style attribute
self.tag_initial_name = tag.name self.tag_initial_name = tag_with_initial_style.name
self.tag_with_style = tag_with_style # tag with inline style parsed from css file self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file
self.style = self.preprocess_style() self.style = self.preprocess_style()
@staticmethod @staticmethod
@@ -248,76 +247,83 @@ class TagStyleConverter:
return style_ return style_
@staticmethod @staticmethod
def convert_indentions_to_px(style): def process_indents_in_px(split_style: list) -> str:
margin_left_regexp = re.compile( # clean with convert_indents() style string and make new clean_style
r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)') clean_style = ''
text_indent_regexp = re.compile( for item in split_style:
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)') item = item.split(':')
item[1] = convert_indents(item[1])
clean_style += item[0] + ': ' + item[1] + '; '
has_margin_left = re.search(margin_left_regexp, style) margin_left_regexp = re.compile(
has_text_indent = re.search(text_indent_regexp, style) r'(margin-left:( *-*\w+);*)')
# consider that 5% = 30px text_indent_regexp = re.compile(
r'(text-indent:( *-*\w+);*)')
has_margin_left = re.search(margin_left_regexp, clean_style)
has_text_indent = re.search(text_indent_regexp, clean_style)
#formula_of_indent: indent = abs(margin_left - text_indent)
if has_margin_left: if has_margin_left:
hml_group = 0
num_ml = 0 num_ml = 0
if has_margin_left.group(1): if has_margin_left.group(1):
hml_group = has_margin_left.group(1)
num_ml = abs(int("".join( num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))) * 6) filter(str.isdigit, str(has_margin_left.group(2))))))
elif has_margin_left.group(3):
hml_group = has_margin_left.group(3)
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(4))))))
if has_text_indent: if has_text_indent:
if has_text_indent.group(1): if has_text_indent.group(1):
num_ti = abs(int("".join( num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))) * 6) filter(str.isdigit, str(has_text_indent.group(2))))))
style = style.replace(has_text_indent.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ') str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(hml_group, '') clean_style = clean_style.replace(has_margin_left.group(1), '')
return style return clean_style
elif has_text_indent.group(3): clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' +
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(4))))))
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(hml_group, '')
return style
style = style.replace(hml_group, 'text-indent: ' +
str(abs(num_ml)) + 'px; ') str(abs(num_ml)) + 'px; ')
return style return clean_style
elif has_text_indent: elif has_text_indent:
if has_text_indent.group(1): if has_text_indent.group(1):
style = style.replace(has_text_indent.group(1), 'text-indent: ' + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(int("".join( str(abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)) + 'px; ') filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ')
return style return clean_style
elif has_text_indent.group(3): return clean_style
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
str("".join(
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
return style
return style
def preprocess_style(self): def preprocess_style(self):
style = self.tag_with_style.attrs.get('style') + ';' ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';'
style = self.remove_white_if_no_bgcolor(style, self.tag_with_style) ultimate_style = self.remove_white_if_no_bgcolor(ultimate_style, self.tag_with_ultimate_style)
style = style.replace('background:', 'background-color:') ultimate_style = ultimate_style.replace('background:', 'background-color:')
style = style.replace('list-style-image', 'list-style-type') ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
# todo: make hmtl_reader + do a repetition check with inline_style split_ultimate_style = ultimate_style.replace(' ', '').split(';') # make for repetition check and convert to px
style = self.convert_indentions_to_px(style)
# if tag had already had inline style, add this to style parsed from css
if self.tag.attrs.get('style'):
inline_style = self.convert_indentions_to_px(self.tag.attrs['style'])
style += inline_style
return style # check for another ; in style string in preprocess_style()
while '' in split_ultimate_style:
split_ultimate_style.remove('')
ultimate_style: str = self.process_indents_in_px(split_ultimate_style)
if self.tag_with_initial_style.attrs.get('style'):
initial_style = self.tag_with_initial_style.attrs['style']
split_initial_style = initial_style.replace(' ', '').split(';')
# check for another ; in style string in preprocess_style()
while '' in split_initial_style:
split_initial_style.remove('')
# repetition check - if tag had already had inline style, add this to style parsed from css
repeat_styles = list(set(split_ultimate_style) & set(split_initial_style))
for item in repeat_styles:
split_initial_style.remove(item)
if split_initial_style:
# if initial style is not empty - start convert and add to ultimate style
print('we enter repetition check', '\n')
initial_style: str = self.process_indents_in_px(split_initial_style)
ultimate_style += initial_style
return ultimate_style
def change_attrs_with_corresponding_tags(self): def change_attrs_with_corresponding_tags(self):
# adds <b>, <u>, <sup>, etc # adds <b>, <u>, <sup>, etc
@@ -328,15 +334,15 @@ class TagStyleConverter:
self.style = self.style.replace(s, '') self.style = self.style.replace(s, '')
self.style = self.style.strip() self.style = self.style.strip()
if i == 0: if i == 0:
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tags.append(self.tag) new_tags.append(self.tag_with_initial_style)
else: else:
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tag = BeautifulSoup(features='lxml').new_tag(name) new_tag = BeautifulSoup(features='lxml').new_tag(name)
new_tags[-1].wrap(new_tag) new_tags[-1].wrap(new_tag)
new_tags.append(new_tag) new_tags.append(new_tag)
top_tag = self.tag top_tag = self.tag_with_initial_style
if new_tags: if new_tags:
tmp_attrs = top_tag.attrs.copy() tmp_attrs = top_tag.attrs.copy()
@@ -363,7 +369,8 @@ class TagStyleConverter:
p_tag = BeautifulSoup(features='lxml').new_tag('p') p_tag = BeautifulSoup(features='lxml').new_tag('p')
span_style = tag.attrs['style'] span_style = tag.attrs['style']
p_style = '' p_style = ''
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)') for i in range(span_style.count(';')):
possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)')
has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style) has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
if has_p_style_attrs: if has_p_style_attrs:
if has_p_style_attrs.group(1): if has_p_style_attrs.group(1):
@@ -439,49 +446,53 @@ class TagStyleConverter:
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
def convert_initial_tag(self): def convert_initial_tag(self):
self.tag = self.change_attrs_with_corresponding_tags() self.tag_with_initial_style = self.change_attrs_with_corresponding_tags()
self.wrap_span_in_p_to_save_style_attrs(self.tag) self.wrap_span_in_p_to_save_style_attrs(self.tag_with_initial_style)
self.add_span_to_save_style_attrs_in_li(self.tag) self.add_span_to_save_style_attrs_in_li(self.tag_with_initial_style)
self.add_span_to_save_style_attrs_in_ul_ol(self.tag) self.add_span_to_save_style_attrs_in_ul_ol(self.tag_with_initial_style)
self.add_span_to_save_style_attrs(self.tag) self.add_span_to_save_style_attrs(self.tag_with_initial_style)
return self.tag return self.tag_with_initial_style
def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str): def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []
h_regex = f'(^h[1-9]$)' h_regex = f'(^h[1-9]$)'
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp) tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp)
for i, x in enumerate(tags_with_possible_style_attr): for i, x in enumerate(tags_with_possible_style_attr):
x.attrs['livecarta_id'] = i x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i) livecarta_tmp_ids.append(i)
html_with_inline_style: str = transform(str(soup1), css_text=css_text,
# here we add css styles to inline style
# sometimes in html_with_css_styles
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
remove_classes=False, remove_classes=False,
external_styles=False, external_styles=False,
allow_network=False, allow_network=False,
disable_validation=True, disable_validation=True,
) )
soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
for i in livecarta_tmp_ids: for i in livecarta_tmp_ids:
tag = soup1.find(attrs={'livecarta_id': i}) tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i})
tag_with_style = soup2.find(attrs={'livecarta_id': i}) tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})
del tag.attrs['livecarta_id'] del tag_with_initial_style.attrs['livecarta_id']
if tag_with_style.attrs.get('style'): if tag_with_ultimate_style.attrs.get('style'):
style_converter = TagStyleConverter(tag, tag_with_style) style_converter = TagStyleConverter(tag_with_initial_style, tag_with_ultimate_style)
style_converter.convert_initial_tag() style_converter.convert_initial_tag()
return soup1 return html_soup
if __name__ == '__main__': if __name__ == '__main__':
file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub' file = '../epub/9781627222174.epub'
ebooklib_book = epub.read_epub(file) ebooklib_book = epub.read_epub(file)
css_ = ebooklib_book.get_item_with_href('css/epub.css') css_ = ebooklib_book.get_item_with_href('css/epub.css')
css_ = css_.get_content().decode() css_ = css_.get_content().decode()
css_cleaned = clean_css(css_) css_cleaned = build_css_content(css_)
html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode() html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
html_soup = BeautifulSoup(html_, features='lxml') html_soup = BeautifulSoup(html_, features='lxml')
print(add_inline_style_to_html_soup(html_soup, css_cleaned)) print(convert_html_soup_with_css_style(html_soup, css_cleaned))

View File

@@ -17,7 +17,7 @@ from data_objects import ChapterItem, NavPoint
from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
update_src_links_in_images, preprocess_footnotes update_src_links_in_images, preprocess_footnotes
from css_reader import clean_css, add_inline_style_to_html_soup from css_reader import build_css_content, convert_html_soup_with_css_style
from livecarta_config import LiveCartaConfig from livecarta_config import LiveCartaConfig
from util.helpers import BookLogger from util.helpers import BookLogger
@@ -29,9 +29,9 @@ class EpubConverter:
self.logger: BookLogger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file) self.ebooklib_book = epub.read_epub(file)
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
self.added_to_toc_hrefs = set() # enumerate all file paths that where added to TOC self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints) # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints # key = -1 for top level NavPoints
@@ -44,7 +44,7 @@ class EpubConverter:
self.internal_anchors = set() self.internal_anchors = set()
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.href2img_bytes = {} # file path to bytes self.href2img_bytes = {} # file path to bytes
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path self.old_image_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote self.footnotes: List[Tag] = [] # end of the footnote
@@ -57,17 +57,18 @@ class EpubConverter:
self.href2img_bytes[file_name] = content self.href2img_bytes[file_name] = content
self.logger.log('HTML files reading.') self.logger.log('HTML files reading.')
self.href2soup_html = self.build_href2soup_content() self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.logger.log('CSS files processing.') self.logger.log('CSS files processing.')
self.css_href2content, self.html_href2css_href = self.build_css_content() self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.') self.logger.log('CSS styles adding.')
self.add_css_styles2soup() self.add_css_styles_to_html_soup()
self.logger.log('Footnotes processing.') self.logger.log('Footnotes processing.')
for href in self.href2soup_html: for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href], content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.href2soup_html) self.html_href2html_body_soup)
self.footnotes_contents.extend(content) self.footnotes_contents.extend(content)
self.noterefs.extend(noterefs) self.noterefs.extend(noterefs)
self.footnotes.extend(footnotes_tags) self.footnotes.extend(footnotes_tags)
@@ -83,7 +84,7 @@ class EpubConverter:
# build simple toc from spine if needed # build simple toc from spine if needed
if self.is_toc_empty(): if self.is_toc_empty():
self.build_adjacency_list_from_spine() self.build_adjacency_list_from_spine()
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.') self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.add_not_added_files_to_adjacency_list(not_added) self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.') self.logger.log(f'Html internal links and structure processing.')
@@ -96,62 +97,69 @@ class EpubConverter:
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements # using EpubElements
# for now just for HTML objects, as it is simplest chapter # for now just for HTML objects, as it is simplest chapter
# todo: check if other chapters exist
nodes = dict() nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content() html_body_text = item.get_body_content()
# html.parser closes tags if needed # html.parser closes tags if needed
soup = BeautifulSoup(html_body_text, features='html.parser') soup = BeautifulSoup(html_body_text, features='html.parser')
nodes[item.file_name] = soup nodes[item.file_name] = soup
return nodes return nodes
def _read_css(self, css_href, html_path): def get_css_content(self, css_href, html_href):
'''
'''
path_to_css_from_html = css_href path_to_css_from_html = css_href
html_folder = dirname(html_path) html_folder = dirname(html_href)
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/') path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.' assert css_obj, f'Css style {css_href} was not in manifest.'
css_content: str = css_obj.get_content().decode() css_content: str = css_obj.get_content().decode()
return css_content return css_content
def build_css_content(self): def build_html_and_css_relations(self):
css_href2content, html_href2css_href = {}, {} '''
html_href2css_href = defaultdict(list) This function is designed to get 2 dictionaries:
# html_href2css_href 1-to-many The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value
'''
html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files
css_href2css_content: dict = {}
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_text = item.content html_content = item.content
html_path = item.file_name html_href = item.file_name
soup = BeautifulSoup(html_text, features='lxml') soup_html_content = BeautifulSoup(html_content, features='lxml')
for tag in soup.find_all('link', attrs={"type": "text/css"}): for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
continue continue
css_href = tag.attrs.get('href') css_href = tag.attrs.get('href')
html_href2css_href[html_path].append(css_href) html_href2css_href[html_href].append(css_href)
if css_href not in css_href2content: if css_href not in css_href2css_content:
css_href2content[css_href] = clean_css(self._read_css(css_href, html_path)) # css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup.find_all('style')): for i, tag in enumerate(soup_html_content.find_all('style')):
css_content = tag.string css_content = tag.string
html_href2css_href[html_path].append(f'href{i}') html_href2css_href[html_href].append(f'href{i}')
css_href2content[f'href{i}'] = clean_css(css_content) css_href2css_content[f'href{i}'] = build_css_content(css_content)
return css_href2content, html_href2css_href return html_href2css_href, css_href2css_content,
def add_css_styles2soup(self): def add_css_styles_to_html_soup(self):
for href in self.href2soup_html: for href in self.html_href2html_body_soup:
if self.html_href2css_href.get(href): if self.html_href2css_href.get(href):
css ='' css =''
for key in self.html_href2css_href[href]: for key in self.html_href2css_href[href]:
css += self.css_href2content[key] css += self.css_href2css_content[key]
content: BeautifulSoup = self.href2soup_html[href] content: BeautifulSoup = self.html_href2html_body_soup[href]
content = add_inline_style_to_html_soup(content, css) # todo func here to make content
self.href2soup_html[href] = content content = convert_html_soup_with_css_style(content, css)
self.html_href2html_body_soup[href] = content
def build_manifest_id2href(self): def build_manifest_id2html_href(self):
links = dict() links = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
links[item.id] = item.file_name links[item.id] = item.file_name
@@ -160,7 +168,7 @@ class EpubConverter:
def build_adjacency_list_from_toc(self, element, lvl=0): def build_adjacency_list_from_toc(self, element, lvl=0):
""" """
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib_book.toc self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
key = -1 if root, value = None if leaf key = -1 if root, value = None if leaf
@@ -175,7 +183,7 @@ class EpubConverter:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[nav_point.href].append(nav_point.id) self.href2subchapter_ids[nav_point.href].append(nav_point.id)
self.adjacency_list[nav_point] = None self.adjacency_list[nav_point] = None
self.added_to_toc_hrefs.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
return nav_point return nav_point
elif isinstance(element, tuple): elif isinstance(element, tuple):
@@ -191,7 +199,7 @@ class EpubConverter:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[nav_point] = sub_nodes self.adjacency_list[nav_point] = sub_nodes
self.added_to_toc_hrefs.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
return nav_point return nav_point
elif isinstance(element, list) and (lvl == 0): elif isinstance(element, list) and (lvl == 0):
@@ -210,26 +218,26 @@ class EpubConverter:
return False return False
def build_adjacency_list_from_spine(self): def build_adjacency_list_from_spine(self):
manifest_id2href = self.build_manifest_id2href() manifest_id2href = self.build_manifest_id2html_href()
self.adjacency_list = { self.adjacency_list = {
-1: [] -1: []
} }
for id_, _ in self.ebooklib_book.spine: for id_, _ in self.ebooklib_book.spine:
nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
self.adjacency_list[-1].append(nav_point) self.adjacency_list[-1].append(nav_point)
self.added_to_toc_hrefs.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added): def add_not_added_files_to_adjacency_list(self, not_added):
for i, file in enumerate(not_added): for i, file in enumerate(not_added):
nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file)) nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
self.adjacency_list[-1].append(nav_point) self.adjacency_list[-1].append(nav_point)
self.added_to_toc_hrefs.add(file) self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_tmp_id(self): def label_chapters_ids_with_tmp_id(self):
for href in self.href2soup_html: for href in self.html_href2html_body_soup:
ids = self.href2subchapter_ids[href] ids = self.href2subchapter_ids[href]
for i in ids: for i in ids:
soup = self.href2soup_html[href] soup = self.html_href2html_body_soup[href]
tag = soup.find(id=i) tag = soup.find(id=i)
new_h = soup.new_tag('tmp') new_h = soup.new_tag('tmp')
new_h.attrs['class'] = 'converter-chapter-mark' new_h.attrs['class'] = 'converter-chapter-mark'
@@ -238,9 +246,9 @@ class EpubConverter:
def process_html_soup_structure_to_line(self): def process_html_soup_structure_to_line(self):
# go to line structure # go to line structure
for href in self.href2soup_html: for href in self.html_href2html_body_soup:
soup = self.href2soup_html[href] soup = self.html_href2html_body_soup[href]
self.href2soup_html[href] = unwrap_structural_tags(soup) self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
@staticmethod @staticmethod
def _create_unique_id(href, id_): def _create_unique_id(href, id_):
@@ -270,7 +278,7 @@ class EpubConverter:
""" """
dir_name = os.path.dirname(cur_file_path) dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/') normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
full_path = [path for path in self.added_to_toc_hrefs if normed_path in path] full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path: if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.') f'While processing href in {internal_link_tag}.')
@@ -285,8 +293,8 @@ class EpubConverter:
def process_internal_links(self): def process_internal_links(self):
# 1. rebuild ids to be unique in all documents # 1. rebuild ids to be unique in all documents
for toc_href in self.added_to_toc_hrefs: for toc_href in self.hrefs_added_to_toc:
for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}): for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
if tag.attrs.get('class') == 'converter-chapter-mark': if tag.attrs.get('class') == 'converter-chapter-mark':
continue continue
@@ -298,8 +306,8 @@ class EpubConverter:
# 2.a) process anchor which is a whole xhtml file # 2.a) process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)') internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
for toc_href in self.added_to_toc_hrefs: for toc_href in self.hrefs_added_to_toc:
soup = self.href2soup_html[toc_href] soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href'] a_tag_href = internal_link_tag.attrs['href']
# find full path # find full path
@@ -309,7 +317,7 @@ class EpubConverter:
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '') new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
if new_id not in self.internal_anchors: if new_id not in self.internal_anchors:
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self._create_new_anchor_span(soup, new_id) new_anchor_span = self._create_new_anchor_span(soup, new_id)
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
self.internal_anchors.add(new_id) self.internal_anchors.add(new_id)
@@ -318,8 +326,8 @@ class EpubConverter:
# 2.b) process anchor which is a an element in xhtml file # 2.b) process anchor which is a an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
for toc_href in self.added_to_toc_hrefs: for toc_href in self.hrefs_added_to_toc:
soup = self.href2soup_html[toc_href] soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#') a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
# find full path # find full path
@@ -332,7 +340,7 @@ class EpubConverter:
continue continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id) new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote
@@ -374,7 +382,7 @@ class EpubConverter:
""" """
if nav_point.id: if nav_point.id:
soup = self.href2soup_html[nav_point.href] soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup) chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser') new_tree = BeautifulSoup('', 'html.parser')
for tag in chapter_tags: for tag in chapter_tags:
@@ -396,13 +404,13 @@ class EpubConverter:
if nav_point.id: if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
else: else:
content: BeautifulSoup = self.href2soup_html[nav_point.href] content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.old_image_path2_aws_path = update_src_links_in_images(content, self.old_image_path2aws_path = update_src_links_in_images(content,
self.href2img_bytes, self.href2img_bytes,
path_to_html=nav_point.href, path_to_html=nav_point.href,
access=self.access, access=self.access,
path2aws_path=self.old_image_path2_aws_path) path2aws_path=self.old_image_path2aws_path)
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content, title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
@@ -447,7 +455,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../epub/index_with_html.epub', json_converter = EpubConverter('../epub/9781641050692.epub',
logger=logger_object) logger=logger_object)
tmp = json_converter.convert_to_dict() tmp = json_converter.convert_to_dict()

View File

@@ -1,7 +1,7 @@
import os import os
import pathlib import pathlib
import re import re
from typing import List, Tuple from typing import Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment from bs4 import BeautifulSoup, NavigableString, Tag, Comment