epub converter: prettify

This commit is contained in:
shirshasa
2021-07-30 17:34:42 +03:00
parent 240e1e30ff
commit fae85d5280

View File

@@ -225,6 +225,14 @@ class EpubPostprocessor:
def _create_unique_id(href, id_): def _create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
@staticmethod
def _create_new_anchor_span(soup, id_):
    """Build a ``<span>`` tag that serves as a link anchor.

    The tag is created through ``soup.new_tag`` and returned detached;
    inserting it into a document is left to the caller.

    :param soup: object exposing ``new_tag`` (presumably a BeautifulSoup
        document -- confirm against callers)
    :param id_: value stored in the span's ``id`` attribute
    :return: the new span, carrying class ``link-anchor`` and a single
        non-breaking space as its content
    """
    anchor = soup.new_tag("span")
    # 'id' goes in before 'class' so the attribute order stays unchanged.
    anchor.attrs.update({'id': id_, 'class': 'link-anchor'})
    anchor.string = "\xa0"
    return anchor
def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag): def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag):
dir_name = os.path.dirname(href) dir_name = os.path.dirname(href)
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)) normed_path = os.path.normpath(os.path.join(dir_name, href_in_link))
@@ -268,32 +276,29 @@ class EpubPostprocessor:
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
if new_id not in self.internal_anchors: if new_id not in self.internal_anchors:
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
new_anchor_span = soup.new_tag("span") new_anchor_span = self._create_new_anchor_span(soup, new_id)
new_anchor_span.attrs['id'] = new_id
new_anchor_span.attrs['class'] = 'link-anchor'
new_anchor_span.string = "\xa0"
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
self.internal_anchors.add(new_id) self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href'] del internal_link_tag.attrs['href']
# ------------------------------------------------------------------------ # ------------------------------------------------------------------------
# write placeholder to all internal links # add placeholder to all internal links
internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)')
for toc_href in self.added_to_toc_hrefs: for toc_href in self.added_to_toc_hrefs:
soup = self.href2soup_html[toc_href] soup = self.href2soup_html[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, id_in_link = internal_link_tag.attrs['href'].split('#') a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
a_tag_href = a_tag_href or toc_href a_tag_href = a_tag_href or toc_href
# find full path # find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc: if not a_tag_href_matched_to_toc:
continue continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, id_in_link) new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': id_in_link}) anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote
if anchor_tags: if anchor_tags:
if len(anchor_tags) > 1: if len(anchor_tags) > 1:
@@ -301,14 +306,12 @@ class EpubPostprocessor:
f' While processing {internal_link_tag}') f' While processing {internal_link_tag}')
anchor_tag = anchor_tags[0] anchor_tag = anchor_tags[0]
assert anchor_tag.attrs['id'] == new_id
# if anchor is found we could add placeholder for link creation on server side. # if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
# create span to have cyclic links, link has 1 type of class, anchor another # create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors: if anchor_tag.attrs['id'] not in self.internal_anchors:
new_anchor_span = soup.new_tag("span") new_anchor_span = self._create_new_anchor_span(soup, new_id)
new_anchor_span.attrs['id'] = anchor_tag.attrs['id']
new_anchor_span.attrs['class'] = 'link-anchor'
new_anchor_span.string = "\xa0"
anchor_tag.insert_before(new_anchor_span) anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(anchor_tag.attrs['id']) self.internal_anchors.add(anchor_tag.attrs['id'])
del anchor_tag.attrs['id'] del anchor_tag.attrs['id']
@@ -318,7 +321,7 @@ class EpubPostprocessor:
internal_link_tag.attrs['converter-mark'] = 'bad-link' internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.' self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={id_in_link}') f' Old id={a_tag_id}')
def build_one_anchored_section(self, node): def build_one_anchored_section(self, node):
""" """