forked from LiveCarta/BookConverter
upgrade inline processor & make repetition check
This commit is contained in:
@@ -58,8 +58,8 @@ def convert_font_size(value):
|
|||||||
|
|
||||||
def convert_indents(value):
|
def convert_indents(value):
|
||||||
# 30px = 3.2% = 1.25em = 23pt
|
# 30px = 3.2% = 1.25em = 23pt
|
||||||
positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
|
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
|
||||||
has_style_attrs = re.search(positive_text_indent_regexp, value)
|
has_style_attrs = re.search(text_indent_regexp, value)
|
||||||
if has_style_attrs:
|
if has_style_attrs:
|
||||||
if has_style_attrs.group(1):
|
if has_style_attrs.group(1):
|
||||||
value = value.replace(has_style_attrs.group(1),
|
value = value.replace(has_style_attrs.group(1),
|
||||||
@@ -89,8 +89,8 @@ LIVECARTA_STYLE_ATTRS = {
|
|||||||
'text-indent': [],
|
'text-indent': [],
|
||||||
'font-variant': ['small-caps'],
|
'font-variant': ['small-caps'],
|
||||||
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
|
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
|
||||||
'align': [], # ???
|
'align': [],
|
||||||
'font': [], # ???
|
'font': [],
|
||||||
'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
|
'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
|
||||||
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
|
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
|
||||||
'font-size': [],
|
'font-size': [],
|
||||||
@@ -182,41 +182,40 @@ def check_style_to_be_tag(style) -> List[tuple]:
|
|||||||
to_remove.append(k)
|
to_remove.append(k)
|
||||||
return to_remove
|
return to_remove
|
||||||
|
|
||||||
|
def update_css_style_types_to_livecarta_convention(css_rule, style_type):
|
||||||
def update_property_to_livecarta_convention(rule, property_):
|
if style_type.name not in LIVECARTA_STYLE_ATTRS:
|
||||||
if property_.name not in LIVECARTA_STYLE_ATTRS:
|
|
||||||
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
|
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
|
||||||
rule.style[property_.name] = ''
|
css_rule.style[style_type.name] = ''
|
||||||
return
|
return
|
||||||
|
|
||||||
cleaned_value = property_.value.replace('\"', '')
|
cleaned_value = style_type.value.replace('\"', '') # value of style
|
||||||
there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(property_.name)
|
there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(style_type.name)
|
||||||
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[property_.name]
|
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[style_type.name]
|
||||||
if there_are_constraints_on_value and value_not_in_possible_values_list:
|
if there_are_constraints_on_value and value_not_in_possible_values_list:
|
||||||
# property + value not in LIVECARTA_STYLE_ATTRS, remove from css file
|
# style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file
|
||||||
rule.style[property_.name] = ''
|
css_rule.style[style_type.name] = ''
|
||||||
else:
|
else:
|
||||||
if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING:
|
if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING:
|
||||||
func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name]
|
func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] # function that converts our data
|
||||||
rule.style[property_.name] = func(cleaned_value)
|
css_rule.style[style_type.name] = func(cleaned_value)
|
||||||
|
|
||||||
|
def build_css_content(css_content):
|
||||||
|
sheet = cssutils.parseString(css_content, validate=False)
|
||||||
|
|
||||||
def clean_css(css):
|
for css_rule in sheet:
|
||||||
sheet = cssutils.parseString(css, validate=False)
|
if css_rule.type == css_rule.STYLE_RULE:
|
||||||
for rule in sheet:
|
for style_type in css_rule.style:
|
||||||
if rule.type == rule.STYLE_RULE:
|
update_css_style_types_to_livecarta_convention(css_rule, style_type)
|
||||||
for property_ in rule.style:
|
|
||||||
update_property_to_livecarta_convention(rule, property_)
|
|
||||||
|
|
||||||
css_text = sheet._getCssText().decode()
|
css_text = sheet._getCssText().decode()
|
||||||
return css_text
|
return css_text
|
||||||
|
|
||||||
|
|
||||||
class TagStyleConverter:
|
class TagStyleConverter:
|
||||||
def __init__(self, tag, tag_with_style):
|
def __init__(self, tag_with_initial_style, tag_with_ultimate_style):
|
||||||
self.tag = tag # tag to be updated with style attribute
|
self.tag_with_initial_style = tag_with_initial_style # tag with inline style to be updated with style attribute
|
||||||
self.tag_initial_name = tag.name
|
self.tag_initial_name = tag_with_initial_style.name
|
||||||
self.tag_with_style = tag_with_style # tag with inline style parsed from css file
|
self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file
|
||||||
self.style = self.preprocess_style()
|
self.style = self.preprocess_style()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -248,76 +247,83 @@ class TagStyleConverter:
|
|||||||
return style_
|
return style_
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def convert_indentions_to_px(style):
|
def process_indents_in_px(split_style: list) -> str:
|
||||||
margin_left_regexp = re.compile(
|
# clean with convert_indents() style string and make new clean_style
|
||||||
r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)')
|
clean_style = ''
|
||||||
text_indent_regexp = re.compile(
|
for item in split_style:
|
||||||
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
|
item = item.split(':')
|
||||||
|
item[1] = convert_indents(item[1])
|
||||||
|
clean_style += item[0] + ': ' + item[1] + '; '
|
||||||
|
|
||||||
has_margin_left = re.search(margin_left_regexp, style)
|
margin_left_regexp = re.compile(
|
||||||
has_text_indent = re.search(text_indent_regexp, style)
|
r'(margin-left:( *-*\w+);*)')
|
||||||
# consider that 5% = 30px
|
text_indent_regexp = re.compile(
|
||||||
|
r'(text-indent:( *-*\w+);*)')
|
||||||
|
|
||||||
|
has_margin_left = re.search(margin_left_regexp, clean_style)
|
||||||
|
has_text_indent = re.search(text_indent_regexp, clean_style)
|
||||||
|
#formula_of_indent: indent = abs(margin_left - text_indent)
|
||||||
if has_margin_left:
|
if has_margin_left:
|
||||||
hml_group = 0
|
|
||||||
num_ml = 0
|
num_ml = 0
|
||||||
if has_margin_left.group(1):
|
if has_margin_left.group(1):
|
||||||
hml_group = has_margin_left.group(1)
|
|
||||||
num_ml = abs(int("".join(
|
num_ml = abs(int("".join(
|
||||||
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
filter(str.isdigit, str(has_margin_left.group(2))))))
|
||||||
|
|
||||||
elif has_margin_left.group(3):
|
|
||||||
hml_group = has_margin_left.group(3)
|
|
||||||
num_ml = abs(int("".join(
|
|
||||||
filter(str.isdigit, str(has_margin_left.group(4))))))
|
|
||||||
|
|
||||||
if has_text_indent:
|
if has_text_indent:
|
||||||
if has_text_indent.group(1):
|
if has_text_indent.group(1):
|
||||||
num_ti = abs(int("".join(
|
num_ti = abs(int("".join(
|
||||||
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
|
filter(str.isdigit, str(has_text_indent.group(2))))))
|
||||||
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
|
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||||
str(abs(num_ml - num_ti)) + 'px; ')
|
str(abs(num_ml - num_ti)) + 'px; ')
|
||||||
style = style.replace(hml_group, '')
|
clean_style = clean_style.replace(has_margin_left.group(1), '')
|
||||||
return style
|
return clean_style
|
||||||
|
|
||||||
elif has_text_indent.group(3):
|
clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' +
|
||||||
num_ti = abs(int("".join(
|
|
||||||
filter(str.isdigit, str(has_text_indent.group(4))))))
|
|
||||||
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
|
|
||||||
str(abs(num_ml - num_ti)) + 'px; ')
|
|
||||||
style = style.replace(hml_group, '')
|
|
||||||
return style
|
|
||||||
|
|
||||||
style = style.replace(hml_group, 'text-indent: ' +
|
|
||||||
str(abs(num_ml)) + 'px; ')
|
str(abs(num_ml)) + 'px; ')
|
||||||
return style
|
return clean_style
|
||||||
|
|
||||||
elif has_text_indent:
|
elif has_text_indent:
|
||||||
if has_text_indent.group(1):
|
if has_text_indent.group(1):
|
||||||
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
|
clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||||
str(abs(int("".join(
|
str(abs(int("".join(
|
||||||
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)) + 'px; ')
|
filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ')
|
||||||
return style
|
return clean_style
|
||||||
elif has_text_indent.group(3):
|
return clean_style
|
||||||
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
|
|
||||||
str("".join(
|
|
||||||
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
|
|
||||||
return style
|
|
||||||
return style
|
|
||||||
|
|
||||||
def preprocess_style(self):
|
def preprocess_style(self):
|
||||||
style = self.tag_with_style.attrs.get('style') + ';'
|
ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';'
|
||||||
style = self.remove_white_if_no_bgcolor(style, self.tag_with_style)
|
ultimate_style = self.remove_white_if_no_bgcolor(ultimate_style, self.tag_with_ultimate_style)
|
||||||
style = style.replace('background:', 'background-color:')
|
ultimate_style = ultimate_style.replace('background:', 'background-color:')
|
||||||
style = style.replace('list-style-image', 'list-style-type')
|
ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
|
||||||
|
|
||||||
# todo: make hmtl_reader + do a repetition check with inline_style
|
split_ultimate_style = ultimate_style.replace(' ', '').split(';') # make for repetition check and convert to px
|
||||||
style = self.convert_indentions_to_px(style)
|
|
||||||
# if tag had already had inline style, add this to style parsed from css
|
|
||||||
if self.tag.attrs.get('style'):
|
|
||||||
inline_style = self.convert_indentions_to_px(self.tag.attrs['style'])
|
|
||||||
style += inline_style
|
|
||||||
|
|
||||||
return style
|
# check for another ; in style string in preprocess_style()
|
||||||
|
while '' in split_ultimate_style:
|
||||||
|
split_ultimate_style.remove('')
|
||||||
|
ultimate_style: str = self.process_indents_in_px(split_ultimate_style)
|
||||||
|
|
||||||
|
if self.tag_with_initial_style.attrs.get('style'):
|
||||||
|
|
||||||
|
initial_style = self.tag_with_initial_style.attrs['style']
|
||||||
|
split_initial_style = initial_style.replace(' ', '').split(';')
|
||||||
|
|
||||||
|
# check for another ; in style string in preprocess_style()
|
||||||
|
while '' in split_initial_style:
|
||||||
|
split_initial_style.remove('')
|
||||||
|
|
||||||
|
# repetition check - if tag had already had inline style, add this to style parsed from css
|
||||||
|
repeat_styles = list(set(split_ultimate_style) & set(split_initial_style))
|
||||||
|
for item in repeat_styles:
|
||||||
|
split_initial_style.remove(item)
|
||||||
|
|
||||||
|
if split_initial_style:
|
||||||
|
# if initial style is not empty - start convert and add to ultimate style
|
||||||
|
print('we enter repetition check', '\n')
|
||||||
|
initial_style: str = self.process_indents_in_px(split_initial_style)
|
||||||
|
ultimate_style += initial_style
|
||||||
|
|
||||||
|
return ultimate_style
|
||||||
|
|
||||||
def change_attrs_with_corresponding_tags(self):
|
def change_attrs_with_corresponding_tags(self):
|
||||||
# adds <b>, <u>, <sup>, etc
|
# adds <b>, <u>, <sup>, etc
|
||||||
@@ -328,15 +334,15 @@ class TagStyleConverter:
|
|||||||
self.style = self.style.replace(s, '')
|
self.style = self.style.replace(s, '')
|
||||||
self.style = self.style.strip()
|
self.style = self.style.strip()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||||
new_tags.append(self.tag)
|
new_tags.append(self.tag_with_initial_style)
|
||||||
else:
|
else:
|
||||||
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag(name)
|
new_tag = BeautifulSoup(features='lxml').new_tag(name)
|
||||||
new_tags[-1].wrap(new_tag)
|
new_tags[-1].wrap(new_tag)
|
||||||
new_tags.append(new_tag)
|
new_tags.append(new_tag)
|
||||||
|
|
||||||
top_tag = self.tag
|
top_tag = self.tag_with_initial_style
|
||||||
|
|
||||||
if new_tags:
|
if new_tags:
|
||||||
tmp_attrs = top_tag.attrs.copy()
|
tmp_attrs = top_tag.attrs.copy()
|
||||||
@@ -363,21 +369,22 @@ class TagStyleConverter:
|
|||||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
||||||
span_style = tag.attrs['style']
|
span_style = tag.attrs['style']
|
||||||
p_style = ''
|
p_style = ''
|
||||||
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
|
for i in range(span_style.count(';')):
|
||||||
has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
|
possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)')
|
||||||
if has_p_style_attrs:
|
has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
|
||||||
if has_p_style_attrs.group(1):
|
if has_p_style_attrs:
|
||||||
p_style += has_p_style_attrs.group(1)
|
if has_p_style_attrs.group(1):
|
||||||
span_style = span_style.replace(has_p_style_attrs.group(1), '')
|
p_style += has_p_style_attrs.group(1)
|
||||||
if has_p_style_attrs.group(3):
|
span_style = span_style.replace(has_p_style_attrs.group(1), '')
|
||||||
p_style += has_p_style_attrs.group(3)
|
if has_p_style_attrs.group(3):
|
||||||
span_style = span_style.replace(has_p_style_attrs.group(3), '')
|
p_style += has_p_style_attrs.group(3)
|
||||||
|
span_style = span_style.replace(has_p_style_attrs.group(3), '')
|
||||||
|
|
||||||
p_tag.attrs['style'] = p_style
|
p_tag.attrs['style'] = p_style
|
||||||
|
|
||||||
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||||
has_li_style_attr = re.search(li_attrs_regexp, span_style)
|
has_li_style_attr = re.search(li_attrs_regexp, span_style)
|
||||||
span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '')
|
span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '')
|
||||||
tag.attrs['style'] = span_style
|
tag.attrs['style'] = span_style
|
||||||
tag.wrap(p_tag)
|
tag.wrap(p_tag)
|
||||||
|
|
||||||
@@ -439,49 +446,53 @@ class TagStyleConverter:
|
|||||||
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
|
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
|
||||||
|
|
||||||
def convert_initial_tag(self):
|
def convert_initial_tag(self):
|
||||||
self.tag = self.change_attrs_with_corresponding_tags()
|
self.tag_with_initial_style = self.change_attrs_with_corresponding_tags()
|
||||||
self.wrap_span_in_p_to_save_style_attrs(self.tag)
|
self.wrap_span_in_p_to_save_style_attrs(self.tag_with_initial_style)
|
||||||
self.add_span_to_save_style_attrs_in_li(self.tag)
|
self.add_span_to_save_style_attrs_in_li(self.tag_with_initial_style)
|
||||||
self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
|
self.add_span_to_save_style_attrs_in_ul_ol(self.tag_with_initial_style)
|
||||||
self.add_span_to_save_style_attrs(self.tag)
|
self.add_span_to_save_style_attrs(self.tag_with_initial_style)
|
||||||
return self.tag
|
return self.tag_with_initial_style
|
||||||
|
|
||||||
|
|
||||||
def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str):
|
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
||||||
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
|
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||||
livecarta_tmp_ids = []
|
livecarta_tmp_ids = []
|
||||||
h_regex = f'(^h[1-9]$)'
|
h_regex = f'(^h[1-9]$)'
|
||||||
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
|
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
|
||||||
tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
|
tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp)
|
||||||
for i, x in enumerate(tags_with_possible_style_attr):
|
for i, x in enumerate(tags_with_possible_style_attr):
|
||||||
x.attrs['livecarta_id'] = i
|
x.attrs['livecarta_id'] = i
|
||||||
livecarta_tmp_ids.append(i)
|
livecarta_tmp_ids.append(i)
|
||||||
html_with_inline_style: str = transform(str(soup1), css_text=css_text,
|
|
||||||
remove_classes=False,
|
# here we add css styles to inline style
|
||||||
external_styles=False,
|
# sometimes in html_with_css_styles
|
||||||
allow_network=False,
|
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
||||||
disable_validation=True,
|
remove_classes=False,
|
||||||
)
|
external_styles=False,
|
||||||
soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
|
allow_network=False,
|
||||||
|
disable_validation=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
|
||||||
|
|
||||||
for i in livecarta_tmp_ids:
|
for i in livecarta_tmp_ids:
|
||||||
tag = soup1.find(attrs={'livecarta_id': i})
|
tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i})
|
||||||
tag_with_style = soup2.find(attrs={'livecarta_id': i})
|
tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})
|
||||||
del tag.attrs['livecarta_id']
|
del tag_with_initial_style.attrs['livecarta_id']
|
||||||
if tag_with_style.attrs.get('style'):
|
if tag_with_ultimate_style.attrs.get('style'):
|
||||||
style_converter = TagStyleConverter(tag, tag_with_style)
|
style_converter = TagStyleConverter(tag_with_initial_style, tag_with_ultimate_style)
|
||||||
style_converter.convert_initial_tag()
|
style_converter.convert_initial_tag()
|
||||||
|
|
||||||
return soup1
|
return html_soup
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
|
file = '../epub/9781627222174.epub'
|
||||||
ebooklib_book = epub.read_epub(file)
|
ebooklib_book = epub.read_epub(file)
|
||||||
css_ = ebooklib_book.get_item_with_href('css/epub.css')
|
css_ = ebooklib_book.get_item_with_href('css/epub.css')
|
||||||
css_ = css_.get_content().decode()
|
css_ = css_.get_content().decode()
|
||||||
css_cleaned = clean_css(css_)
|
css_cleaned = build_css_content(css_)
|
||||||
html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
|
html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
|
||||||
html_soup = BeautifulSoup(html_, features='lxml')
|
html_soup = BeautifulSoup(html_, features='lxml')
|
||||||
|
|
||||||
print(add_inline_style_to_html_soup(html_soup, css_cleaned))
|
print(convert_html_soup_with_css_style(html_soup, css_cleaned))
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from data_objects import ChapterItem, NavPoint
|
|||||||
from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
|
from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
|
||||||
update_src_links_in_images, preprocess_footnotes
|
update_src_links_in_images, preprocess_footnotes
|
||||||
|
|
||||||
from css_reader import clean_css, add_inline_style_to_html_soup
|
from css_reader import build_css_content, convert_html_soup_with_css_style
|
||||||
from livecarta_config import LiveCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
from util.helpers import BookLogger
|
from util.helpers import BookLogger
|
||||||
|
|
||||||
@@ -29,11 +29,11 @@ class EpubConverter:
|
|||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(file)
|
self.ebooklib_book = epub.read_epub(file)
|
||||||
|
|
||||||
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
|
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
|
||||||
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
|
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
|
||||||
self.added_to_toc_hrefs = set() # enumerate all file paths that where added to TOC
|
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
|
||||||
|
|
||||||
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||||
# key = -1 for top level NavPoints
|
# key = -1 for top level NavPoints
|
||||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
||||||
|
|
||||||
@@ -44,7 +44,7 @@ class EpubConverter:
|
|||||||
self.internal_anchors = set()
|
self.internal_anchors = set()
|
||||||
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
|
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
|
||||||
self.href2img_bytes = {} # file path to bytes
|
self.href2img_bytes = {} # file path to bytes
|
||||||
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path
|
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
|
||||||
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||||
self.noterefs: List[Tag] = [] # start of the footnote
|
self.noterefs: List[Tag] = [] # start of the footnote
|
||||||
self.footnotes: List[Tag] = [] # end of the footnote
|
self.footnotes: List[Tag] = [] # end of the footnote
|
||||||
@@ -57,17 +57,18 @@ class EpubConverter:
|
|||||||
self.href2img_bytes[file_name] = content
|
self.href2img_bytes[file_name] = content
|
||||||
|
|
||||||
self.logger.log('HTML files reading.')
|
self.logger.log('HTML files reading.')
|
||||||
self.href2soup_html = self.build_href2soup_content()
|
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||||
|
|
||||||
|
|
||||||
self.logger.log('CSS files processing.')
|
self.logger.log('CSS files processing.')
|
||||||
self.css_href2content, self.html_href2css_href = self.build_css_content()
|
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||||
self.logger.log('CSS styles adding.')
|
self.logger.log('CSS styles adding.')
|
||||||
self.add_css_styles2soup()
|
self.add_css_styles_to_html_soup()
|
||||||
|
|
||||||
self.logger.log('Footnotes processing.')
|
self.logger.log('Footnotes processing.')
|
||||||
for href in self.href2soup_html:
|
for href in self.html_href2html_body_soup:
|
||||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
|
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
|
||||||
self.href2soup_html)
|
self.html_href2html_body_soup)
|
||||||
self.footnotes_contents.extend(content)
|
self.footnotes_contents.extend(content)
|
||||||
self.noterefs.extend(noterefs)
|
self.noterefs.extend(noterefs)
|
||||||
self.footnotes.extend(footnotes_tags)
|
self.footnotes.extend(footnotes_tags)
|
||||||
@@ -83,7 +84,7 @@ class EpubConverter:
|
|||||||
# build simple toc from spine if needed
|
# build simple toc from spine if needed
|
||||||
if self.is_toc_empty():
|
if self.is_toc_empty():
|
||||||
self.build_adjacency_list_from_spine()
|
self.build_adjacency_list_from_spine()
|
||||||
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
|
not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
|
||||||
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
||||||
self.add_not_added_files_to_adjacency_list(not_added)
|
self.add_not_added_files_to_adjacency_list(not_added)
|
||||||
self.logger.log(f'Html internal links and structure processing.')
|
self.logger.log(f'Html internal links and structure processing.')
|
||||||
@@ -96,62 +97,69 @@ class EpubConverter:
|
|||||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||||
# using EpubElements
|
# using EpubElements
|
||||||
# for now just for HTML objects, as it is simplest chapter
|
# for now just for HTML objects, as it is simplest chapter
|
||||||
# todo: check if other chapters exist
|
|
||||||
nodes = dict()
|
nodes = dict()
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
html_body_text = item.get_body_content()
|
html_body_text = item.get_body_content()
|
||||||
# html.parser closes tags if needed
|
# html.parser closes tags if needed
|
||||||
soup = BeautifulSoup(html_body_text, features='html.parser')
|
soup = BeautifulSoup(html_body_text, features='html.parser')
|
||||||
nodes[item.file_name] = soup
|
nodes[item.file_name] = soup
|
||||||
|
|
||||||
return nodes
|
return nodes
|
||||||
|
|
||||||
def _read_css(self, css_href, html_path):
|
def get_css_content(self, css_href, html_href):
|
||||||
'''
|
|
||||||
|
|
||||||
'''
|
|
||||||
path_to_css_from_html = css_href
|
path_to_css_from_html = css_href
|
||||||
html_folder = dirname(html_path)
|
html_folder = dirname(html_href)
|
||||||
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
||||||
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
||||||
assert css_obj, f'Css style {css_href} was not in manifest.'
|
assert css_obj, f'Css style {css_href} was not in manifest.'
|
||||||
css_content: str = css_obj.get_content().decode()
|
css_content: str = css_obj.get_content().decode()
|
||||||
return css_content
|
return css_content
|
||||||
|
|
||||||
def build_css_content(self):
|
def build_html_and_css_relations(self):
|
||||||
css_href2content, html_href2css_href = {}, {}
|
'''
|
||||||
html_href2css_href = defaultdict(list)
|
This function is designed to get 2 dictionaries:
|
||||||
# html_href2css_href 1-to-many
|
The first is css_href2css_content. It is created to connect href of css to content of css
|
||||||
|
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
|
||||||
|
...2... = key2value
|
||||||
|
'''
|
||||||
|
|
||||||
|
html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files
|
||||||
|
css_href2css_content: dict = {}
|
||||||
|
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
html_text = item.content
|
html_content = item.content
|
||||||
html_path = item.file_name
|
html_href = item.file_name
|
||||||
soup = BeautifulSoup(html_text, features='lxml')
|
soup_html_content = BeautifulSoup(html_content, features='lxml')
|
||||||
for tag in soup.find_all('link', attrs={"type": "text/css"}):
|
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file
|
||||||
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
||||||
continue
|
continue
|
||||||
css_href = tag.attrs.get('href')
|
css_href = tag.attrs.get('href')
|
||||||
html_href2css_href[html_path].append(css_href)
|
html_href2css_href[html_href].append(css_href)
|
||||||
if css_href not in css_href2content:
|
if css_href not in css_href2css_content:
|
||||||
css_href2content[css_href] = clean_css(self._read_css(css_href, html_path))
|
# css_href not in css_href2css_content, add to this dict
|
||||||
|
css_href2css_content[css_href] = build_css_content(
|
||||||
|
self.get_css_content(css_href, html_href))
|
||||||
|
|
||||||
for i, tag in enumerate(soup.find_all('style')):
|
for i, tag in enumerate(soup_html_content.find_all('style')):
|
||||||
css_content = tag.string
|
css_content = tag.string
|
||||||
html_href2css_href[html_path].append(f'href{i}')
|
html_href2css_href[html_href].append(f'href{i}')
|
||||||
css_href2content[f'href{i}'] = clean_css(css_content)
|
css_href2css_content[f'href{i}'] = build_css_content(css_content)
|
||||||
|
|
||||||
return css_href2content, html_href2css_href
|
return html_href2css_href, css_href2css_content,
|
||||||
|
|
||||||
def add_css_styles2soup(self):
|
def add_css_styles_to_html_soup(self):
|
||||||
for href in self.href2soup_html:
|
for href in self.html_href2html_body_soup:
|
||||||
if self.html_href2css_href.get(href):
|
if self.html_href2css_href.get(href):
|
||||||
css =''
|
css =''
|
||||||
for key in self.html_href2css_href[href]:
|
for key in self.html_href2css_href[href]:
|
||||||
css += self.css_href2content[key]
|
css += self.css_href2css_content[key]
|
||||||
content: BeautifulSoup = self.href2soup_html[href]
|
content: BeautifulSoup = self.html_href2html_body_soup[href]
|
||||||
content = add_inline_style_to_html_soup(content, css)
|
# todo func here to make content
|
||||||
self.href2soup_html[href] = content
|
content = convert_html_soup_with_css_style(content, css)
|
||||||
|
self.html_href2html_body_soup[href] = content
|
||||||
|
|
||||||
def build_manifest_id2href(self):
|
def build_manifest_id2html_href(self):
|
||||||
links = dict()
|
links = dict()
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
links[item.id] = item.file_name
|
links[item.id] = item.file_name
|
||||||
@@ -160,7 +168,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
def build_adjacency_list_from_toc(self, element, lvl=0):
|
||||||
"""
|
"""
|
||||||
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib_book.toc
|
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
|
||||||
|
|
||||||
key = -1 if root, value = None if leaf
|
key = -1 if root, value = None if leaf
|
||||||
|
|
||||||
@@ -175,7 +183,7 @@ class EpubConverter:
|
|||||||
self.id_anchor_exist_in_nav_points = True
|
self.id_anchor_exist_in_nav_points = True
|
||||||
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
|
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
|
||||||
self.adjacency_list[nav_point] = None
|
self.adjacency_list[nav_point] = None
|
||||||
self.added_to_toc_hrefs.add(nav_point.href)
|
self.hrefs_added_to_toc.add(nav_point.href)
|
||||||
return nav_point
|
return nav_point
|
||||||
|
|
||||||
elif isinstance(element, tuple):
|
elif isinstance(element, tuple):
|
||||||
@@ -191,7 +199,7 @@ class EpubConverter:
|
|||||||
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
|
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
|
||||||
|
|
||||||
self.adjacency_list[nav_point] = sub_nodes
|
self.adjacency_list[nav_point] = sub_nodes
|
||||||
self.added_to_toc_hrefs.add(nav_point.href)
|
self.hrefs_added_to_toc.add(nav_point.href)
|
||||||
return nav_point
|
return nav_point
|
||||||
|
|
||||||
elif isinstance(element, list) and (lvl == 0):
|
elif isinstance(element, list) and (lvl == 0):
|
||||||
@@ -210,26 +218,26 @@ class EpubConverter:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def build_adjacency_list_from_spine(self):
|
def build_adjacency_list_from_spine(self):
|
||||||
manifest_id2href = self.build_manifest_id2href()
|
manifest_id2href = self.build_manifest_id2html_href()
|
||||||
self.adjacency_list = {
|
self.adjacency_list = {
|
||||||
-1: []
|
-1: []
|
||||||
}
|
}
|
||||||
for id_, _ in self.ebooklib_book.spine:
|
for id_, _ in self.ebooklib_book.spine:
|
||||||
nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
|
nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
|
||||||
self.adjacency_list[-1].append(nav_point)
|
self.adjacency_list[-1].append(nav_point)
|
||||||
self.added_to_toc_hrefs.add(nav_point.href)
|
self.hrefs_added_to_toc.add(nav_point.href)
|
||||||
|
|
||||||
def add_not_added_files_to_adjacency_list(self, not_added):
|
def add_not_added_files_to_adjacency_list(self, not_added):
|
||||||
for i, file in enumerate(not_added):
|
for i, file in enumerate(not_added):
|
||||||
nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
|
nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
|
||||||
self.adjacency_list[-1].append(nav_point)
|
self.adjacency_list[-1].append(nav_point)
|
||||||
self.added_to_toc_hrefs.add(file)
|
self.hrefs_added_to_toc.add(file)
|
||||||
|
|
||||||
def label_chapters_ids_with_tmp_id(self):
|
def label_chapters_ids_with_tmp_id(self):
|
||||||
for href in self.href2soup_html:
|
for href in self.html_href2html_body_soup:
|
||||||
ids = self.href2subchapter_ids[href]
|
ids = self.href2subchapter_ids[href]
|
||||||
for i in ids:
|
for i in ids:
|
||||||
soup = self.href2soup_html[href]
|
soup = self.html_href2html_body_soup[href]
|
||||||
tag = soup.find(id=i)
|
tag = soup.find(id=i)
|
||||||
new_h = soup.new_tag('tmp')
|
new_h = soup.new_tag('tmp')
|
||||||
new_h.attrs['class'] = 'converter-chapter-mark'
|
new_h.attrs['class'] = 'converter-chapter-mark'
|
||||||
@@ -238,9 +246,9 @@ class EpubConverter:
|
|||||||
|
|
||||||
def process_html_soup_structure_to_line(self):
|
def process_html_soup_structure_to_line(self):
|
||||||
# go to line structure
|
# go to line structure
|
||||||
for href in self.href2soup_html:
|
for href in self.html_href2html_body_soup:
|
||||||
soup = self.href2soup_html[href]
|
soup = self.html_href2html_body_soup[href]
|
||||||
self.href2soup_html[href] = unwrap_structural_tags(soup)
|
self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_unique_id(href, id_):
|
def _create_unique_id(href, id_):
|
||||||
@@ -270,7 +278,7 @@ class EpubConverter:
|
|||||||
"""
|
"""
|
||||||
dir_name = os.path.dirname(cur_file_path)
|
dir_name = os.path.dirname(cur_file_path)
|
||||||
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
|
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
|
||||||
full_path = [path for path in self.added_to_toc_hrefs if normed_path in path]
|
full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
|
||||||
if not full_path:
|
if not full_path:
|
||||||
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
|
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
|
||||||
f'While processing href in {internal_link_tag}.')
|
f'While processing href in {internal_link_tag}.')
|
||||||
@@ -285,8 +293,8 @@ class EpubConverter:
|
|||||||
|
|
||||||
def process_internal_links(self):
|
def process_internal_links(self):
|
||||||
# 1. rebuild ids to be unique in all documents
|
# 1. rebuild ids to be unique in all documents
|
||||||
for toc_href in self.added_to_toc_hrefs:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
|
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
|
||||||
if tag.attrs.get('class') == 'converter-chapter-mark':
|
if tag.attrs.get('class') == 'converter-chapter-mark':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -298,8 +306,8 @@ class EpubConverter:
|
|||||||
|
|
||||||
# 2.a) process anchor which is a whole xhtml file
|
# 2.a) process anchor which is a whole xhtml file
|
||||||
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
|
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
|
||||||
for toc_href in self.added_to_toc_hrefs:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
soup = self.href2soup_html[toc_href]
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||||
a_tag_href = internal_link_tag.attrs['href']
|
a_tag_href = internal_link_tag.attrs['href']
|
||||||
# find full path
|
# find full path
|
||||||
@@ -309,7 +317,7 @@ class EpubConverter:
|
|||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
||||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||||
if new_id not in self.internal_anchors:
|
if new_id not in self.internal_anchors:
|
||||||
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
new_anchor_span = self._create_new_anchor_span(soup, new_id)
|
new_anchor_span = self._create_new_anchor_span(soup, new_id)
|
||||||
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
|
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
|
||||||
self.internal_anchors.add(new_id)
|
self.internal_anchors.add(new_id)
|
||||||
@@ -318,8 +326,8 @@ class EpubConverter:
|
|||||||
|
|
||||||
# 2.b) process anchor which is a an element in xhtml file
|
# 2.b) process anchor which is a an element in xhtml file
|
||||||
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
|
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
|
||||||
for toc_href in self.added_to_toc_hrefs:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
soup = self.href2soup_html[toc_href]
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
||||||
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
||||||
# find full path
|
# find full path
|
||||||
@@ -332,7 +340,7 @@ class EpubConverter:
|
|||||||
continue
|
continue
|
||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
||||||
|
|
||||||
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
||||||
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote
|
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote
|
||||||
|
|
||||||
@@ -374,7 +382,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
soup = self.href2soup_html[nav_point.href]
|
soup = self.html_href2html_body_soup[nav_point.href]
|
||||||
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
||||||
new_tree = BeautifulSoup('', 'html.parser')
|
new_tree = BeautifulSoup('', 'html.parser')
|
||||||
for tag in chapter_tags:
|
for tag in chapter_tags:
|
||||||
@@ -396,13 +404,13 @@ class EpubConverter:
|
|||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
|
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
|
||||||
else:
|
else:
|
||||||
content: BeautifulSoup = self.href2soup_html[nav_point.href]
|
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
|
||||||
|
|
||||||
self.old_image_path2_aws_path = update_src_links_in_images(content,
|
self.old_image_path2aws_path = update_src_links_in_images(content,
|
||||||
self.href2img_bytes,
|
self.href2img_bytes,
|
||||||
path_to_html=nav_point.href,
|
path_to_html=nav_point.href,
|
||||||
access=self.access,
|
access=self.access,
|
||||||
path2aws_path=self.old_image_path2_aws_path)
|
path2aws_path=self.old_image_path2aws_path)
|
||||||
|
|
||||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
|
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
|
||||||
@@ -447,7 +455,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||||
|
|
||||||
json_converter = EpubConverter('../epub/index_with_html.epub',
|
json_converter = EpubConverter('../epub/9781641050692.epub',
|
||||||
logger=logger_object)
|
logger=logger_object)
|
||||||
tmp = json_converter.convert_to_dict()
|
tmp = json_converter.convert_to_dict()
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user