forked from LiveCarta/BookConverter
Add htm support in processing anchors
This commit is contained in:
@@ -100,11 +100,12 @@ LIVECARTA_STYLE_ATTRS = {
|
|||||||
'background-color': [],
|
'background-color': [],
|
||||||
'background': [],
|
'background': [],
|
||||||
'width': [],
|
'width': [],
|
||||||
|
'border': [],
|
||||||
'border-top-width': [],
|
'border-top-width': [],
|
||||||
'border-right-width': [],
|
'border-right-width': [],
|
||||||
'border-left-width': [],
|
'border-left-width': [],
|
||||||
'border-bottom-width': [],
|
'border-bottom-width': [],
|
||||||
'border': [],
|
'border-bottom': [],
|
||||||
'list-style-type': [],
|
'list-style-type': [],
|
||||||
'list-style-image': [],
|
'list-style-image': [],
|
||||||
'margin-left': []
|
'margin-left': []
|
||||||
@@ -145,6 +146,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
|||||||
'border-right-width': lambda x: x if x != '0' else '',
|
'border-right-width': lambda x: x if x != '0' else '',
|
||||||
'border-left-width': lambda x: x if x != '0' else '',
|
'border-left-width': lambda x: x if x != '0' else '',
|
||||||
'border-bottom-width': lambda x: x if x != '0' else '',
|
'border-bottom-width': lambda x: x if x != '0' else '',
|
||||||
|
'border-bottom': lambda x: x if x != '0' else '',
|
||||||
'list-style-type': lambda x: x if x in list_types else 'disc',
|
'list-style-type': lambda x: x if x in list_types else 'disc',
|
||||||
'list-style-image': lambda x: 'disc',
|
'list-style-image': lambda x: 'disc',
|
||||||
'margin-left': convert_indents
|
'margin-left': convert_indents
|
||||||
@@ -409,9 +411,9 @@ class TagStyleConverter:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def add_span_to_save_style_attrs_in_ul_ol(t):
|
def add_span_to_save_style_attrs_in_ul_ol(t):
|
||||||
if t.name in ['ul', 'ol'] and t.attrs.get('style'):
|
if t.name in ['ul', 'ol'] and t.attrs.get('style'):
|
||||||
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
||||||
|
|
||||||
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li]
|
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_ul_ol]
|
||||||
if any(check):
|
if any(check):
|
||||||
t.name = 'span'
|
t.name = 'span'
|
||||||
li_tag = BeautifulSoup(features='lxml').new_tag('ul')
|
li_tag = BeautifulSoup(features='lxml').new_tag('ul')
|
||||||
|
|||||||
@@ -254,18 +254,18 @@ class EpubConverter:
|
|||||||
self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
|
self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_unique_id(href, id_):
|
def create_unique_id(href, id_):
|
||||||
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
|
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_new_anchor_span(soup, id_):
|
def create_new_anchor_span(soup, id_):
|
||||||
new_anchor_span = soup.new_tag("span")
|
new_anchor_span = soup.new_tag("span")
|
||||||
new_anchor_span.attrs['id'] = id_
|
new_anchor_span.attrs['id'] = id_
|
||||||
new_anchor_span.attrs['class'] = 'link-anchor'
|
new_anchor_span.attrs['class'] = 'link-anchor'
|
||||||
new_anchor_span.string = "\xa0"
|
new_anchor_span.string = "\xa0"
|
||||||
return new_anchor_span
|
return new_anchor_span
|
||||||
|
|
||||||
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||||
"""
|
"""
|
||||||
TOC: a/b/c.xhtml
|
TOC: a/b/c.xhtml
|
||||||
|
|
||||||
@@ -304,44 +304,44 @@ class EpubConverter:
|
|||||||
if tag.attrs.get('class') == 'footnote-element':
|
if tag.attrs.get('class') == 'footnote-element':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_id = self._create_unique_id(toc_href, tag.attrs['id'])
|
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
|
||||||
tag.attrs['id'] = new_id
|
tag.attrs['id'] = new_id
|
||||||
|
|
||||||
# 2.a) process anchor which is a whole xhtml file
|
# 2.a) process anchor which is a whole xhtml file
|
||||||
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)')
|
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)')
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||||
a_tag_href = internal_link_tag.attrs['href']
|
a_tag_href = internal_link_tag.attrs['href']
|
||||||
# find full path
|
# find full path
|
||||||
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||||
if not a_tag_href_matched_to_toc:
|
if not a_tag_href_matched_to_toc:
|
||||||
continue
|
continue
|
||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
|
||||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||||
if new_id not in self.internal_anchors:
|
if new_id not in self.internal_anchors:
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
new_anchor_span = self._create_new_anchor_span(soup, new_id)
|
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||||
anchor_soup.insert(0, new_anchor_span) # insert a new span at the beginning of the file
|
anchor_soup.insert(0, new_anchor_span) # insert a new span at the beginning of the file
|
||||||
self.internal_anchors.add(new_id)
|
self.internal_anchors.add(new_id)
|
||||||
|
|
||||||
del internal_link_tag.attrs['href']
|
del internal_link_tag.attrs['href']
|
||||||
|
|
||||||
# 2.b) process anchor which is a an element in xhtml file
|
# 2.b) process anchor which is an element in xhtml file
|
||||||
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
|
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
||||||
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
||||||
# find full path
|
# find full path
|
||||||
if a_tag_href:
|
if a_tag_href:
|
||||||
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
|
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
|
||||||
internal_link_tag)
|
internal_link_tag)
|
||||||
else:
|
else:
|
||||||
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
||||||
if not a_tag_href_matched_to_toc:
|
if not a_tag_href_matched_to_toc:
|
||||||
continue
|
continue
|
||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
||||||
|
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
|
||||||
@@ -359,7 +359,7 @@ class EpubConverter:
|
|||||||
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
|
||||||
# create span to have cyclic links, link has 1 type of class, anchor another
|
# create span to have cyclic links, link has 1 type of class, anchor another
|
||||||
if anchor_tag.attrs['id'] not in self.internal_anchors:
|
if anchor_tag.attrs['id'] not in self.internal_anchors:
|
||||||
new_anchor_span = self._create_new_anchor_span(soup, new_id)
|
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||||
anchor_tag.insert_before(new_anchor_span)
|
anchor_tag.insert_before(new_anchor_span)
|
||||||
self.internal_anchors.add(new_id)
|
self.internal_anchors.add(new_id)
|
||||||
del anchor_tag.attrs['id']
|
del anchor_tag.attrs['id']
|
||||||
@@ -402,7 +402,7 @@ class EpubConverter:
|
|||||||
for point in top_level_nav_points:
|
for point in top_level_nav_points:
|
||||||
self.build_one_chapter(point)
|
self.build_one_chapter(point)
|
||||||
|
|
||||||
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||||
title = nav_point.title
|
title = nav_point.title
|
||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
|
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
|
||||||
@@ -423,7 +423,7 @@ class EpubConverter:
|
|||||||
# warning! items that are not EpubHtmlItems won't be added to chapter
|
# warning! items that are not EpubHtmlItems won't be added to chapter
|
||||||
if self.adjacency_list.get(nav_point):
|
if self.adjacency_list.get(nav_point):
|
||||||
for sub_node in self.adjacency_list[nav_point]:
|
for sub_node in self.adjacency_list[nav_point]:
|
||||||
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1)
|
sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
|
||||||
sub_nodes.append(sub_chapter_item)
|
sub_nodes.append(sub_chapter_item)
|
||||||
|
|
||||||
if self.logger:
|
if self.logger:
|
||||||
@@ -436,7 +436,7 @@ class EpubConverter:
|
|||||||
top_level_chapters = []
|
top_level_chapters = []
|
||||||
|
|
||||||
for nav_point in top_level_nav_points:
|
for nav_point in top_level_nav_points:
|
||||||
chapter = self.node2livecarta_chapter_item(nav_point)
|
chapter = self.node_to_livecarta_chapter_item(nav_point)
|
||||||
top_level_chapters.append(chapter)
|
top_level_chapters.append(chapter)
|
||||||
|
|
||||||
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
|
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
|
||||||
@@ -458,7 +458,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||||
|
|
||||||
json_converter = EpubConverter('../../epub/9781634252221.epub',
|
json_converter = EpubConverter('../../epub/Cook.epub',
|
||||||
logger=logger_object)
|
logger=logger_object)
|
||||||
tmp = json_converter.convert_to_dict()
|
tmp = json_converter.convert_to_dict()
|
||||||
|
|
||||||
|
|||||||
@@ -468,10 +468,11 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co
|
|||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
def _clean_wiley_block(block):
|
def clean_wiley_block(block):
|
||||||
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
|
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
|
||||||
for hr in hrs:
|
for hr in hrs:
|
||||||
hr.extract()
|
hr.extract()
|
||||||
|
print(hr)
|
||||||
h = block.find(re.compile("h[1-9]"))
|
h = block.find(re.compile("h[1-9]"))
|
||||||
if h:
|
if h:
|
||||||
h.name = "p"
|
h.name = "p"
|
||||||
@@ -481,7 +482,7 @@ def _clean_wiley_block(block):
|
|||||||
def preprocess_block_tags(chapter_tag):
|
def preprocess_block_tags(chapter_tag):
|
||||||
for block in chapter_tag.find_all("blockquote"):
|
for block in chapter_tag.find_all("blockquote"):
|
||||||
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
||||||
_clean_wiley_block(block)
|
clean_wiley_block(block)
|
||||||
|
|
||||||
color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
|
color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
|
||||||
color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
|
color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
|
||||||
@@ -490,13 +491,13 @@ def preprocess_block_tags(chapter_tag):
|
|||||||
block.unwrap()
|
block.unwrap()
|
||||||
|
|
||||||
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
|
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
|
||||||
_clean_wiley_block(future_block)
|
clean_wiley_block(future_block)
|
||||||
color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
|
color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
|
||||||
color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
|
color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
|
||||||
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
|
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
|
||||||
|
|
||||||
|
|
||||||
def _prepare_formatted(text):
|
def prepare_formatted(text):
|
||||||
# replace <,> to save them as is in html code
|
# replace <,> to save them as is in html code
|
||||||
text = text.replace("<", "\x3C")
|
text = text.replace("<", "\x3C")
|
||||||
text = text.replace(">", "\x3E")
|
text = text.replace(">", "\x3E")
|
||||||
@@ -515,7 +516,7 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
|
|
||||||
for child in pre.children:
|
for child in pre.children:
|
||||||
if isinstance(child, NavigableString):
|
if isinstance(child, NavigableString):
|
||||||
cleaned_text = _prepare_formatted(str(child))
|
cleaned_text = prepare_formatted(str(child))
|
||||||
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
|
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
|
||||||
for string in sub_strings:
|
for string in sub_strings:
|
||||||
new_tag.append(NavigableString(string))
|
new_tag.append(NavigableString(string))
|
||||||
@@ -523,10 +524,10 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
else:
|
else:
|
||||||
for sub_child in child.children:
|
for sub_child in child.children:
|
||||||
if isinstance(sub_child, NavigableString):
|
if isinstance(sub_child, NavigableString):
|
||||||
cleaned_text2 = _prepare_formatted(str(sub_child))
|
cleaned_text2 = prepare_formatted(str(sub_child))
|
||||||
sub_child.replace_with(NavigableString(cleaned_text2))
|
sub_child.replace_with(NavigableString(cleaned_text2))
|
||||||
else:
|
else:
|
||||||
sub_child.string = _prepare_formatted(sub_child.text)
|
sub_child.string = prepare_formatted(sub_child.text)
|
||||||
cleaned_tag = child.extract()
|
cleaned_tag = child.extract()
|
||||||
new_tag.append(cleaned_tag)
|
new_tag.append(cleaned_tag)
|
||||||
if to_add_br:
|
if to_add_br:
|
||||||
|
|||||||
Reference in New Issue
Block a user