Small changes in process_internal_links

This commit is contained in:
Kiryl
2022-10-26 14:21:45 +03:00
parent bb3aa701b2
commit 9704fabff0

View File

@@ -339,7 +339,7 @@ class EpubConverter:
normed_path = path.normpath(path.join(
dir_name, href_in_link)).replace("\\", "/")
full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path]
href_from_toc for href_from_toc in self.hrefs_added_to_toc if normed_path in href_from_toc]
if not full_path:
self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
f"While processing href in {internal_link_tag}.")
@@ -378,78 +378,79 @@ class EpubConverter:
"""
# Rewrites the id of every id-bearing tag in each document listed in
# self.hrefs_added_to_toc to the value returned by
# self.create_unique_id(<document href>, <old id>).
# NOTE(review): this span is a rendered diff with the +/- markers and the
# indentation stripped. The `toc_href` / `new_id` lines appear to be the
# pre-commit version and the `href_from_toc` / `new_unique_id` lines their
# replacement -- confirm against the repository before reusing this as code.
def make_ids_unique():
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
for href_from_toc in self.hrefs_added_to_toc:
# the r".+" pattern only selects tags whose id attribute is non-empty
for tag in self.html_href2html_body_soup[href_from_toc].find_all(attrs={"id": re.compile(r".+")}):
# a tag keeps its id when its raw class attribute equals one of these two marker strings
# NOTE(review): BeautifulSoup usually exposes `class` as a list; if so this
# comparison is true for every tag -- confirm how the soups are parsed.
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
new_id = self.create_unique_id(
toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
new_unique_id = self.create_unique_id(
href_from_toc, tag.attrs["id"])
tag.attrs["id"] = new_unique_id
# Handles <a> tags that link to a whole document: the href must not start with
# http:// or https:// and must end in .htm, .html or .xhtml.
# For each such link the href is resolved with self.match_href_to_path_from_toc;
# when a path is returned the link gets a "placeholder" attribute of the form
# {{tempStyleToAnchor-<id>}} and an anchor span for that id is inserted at
# position 0 of the target document, once per id (tracked in self.internal_anchors).
# NOTE(review): this span is a rendered diff with the +/- markers and the
# indentation stripped. The `toc_href` / `soup` / `new_id` loop appears to be the
# pre-commit version; the `span_creation` helper and the `html_href_from_toc` loop
# appear to be its replacement -- confirm against the repository.
def process_file_anchor():
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all("a",
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
a_tag_href = internal_link_tag.attrs["href"]
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if a_tag_href_matched_to_toc:
# the id is built from the target document path and an empty element id
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
# Takes no parameters: new_unique_id, html_href_of_anchor and html_content are not
# defined here and are only assigned by the loop below, so this must run after them.
# NOTE(review): presumably nested inside process_file_anchor so the names resolve
# through the enclosing scope -- nesting cannot be seen with indentation stripped.
def span_creation():
if new_unique_id not in self.internal_anchors:
anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
# the span is created from the linking document's soup (html_content),
# then inserted into the target document's soup (anchor_html_content)
new_anchor_span = self.create_new_anchor_span(html_content, new_unique_id)
# insert a new span to the beginning of the file
anchor_html_content.insert(0, new_anchor_span)
self.internal_anchors.add(new_unique_id)
for html_href_from_toc in self.hrefs_added_to_toc:
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
for internal_link_tag in html_content.find_all("a",
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
href_ = internal_link_tag.attrs["href"]
html_href_of_anchor = self.match_href_to_path_from_toc(
html_href_from_toc, href_, internal_link_tag)
if html_href_of_anchor:
# the id is built from the target document path and an empty element id
new_unique_id = self.create_unique_id(html_href_of_anchor, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
span_creation()
# NOTE(review): with indentation stripped it is not visible whether the href is
# removed only for matched links or for every link found -- confirm.
del internal_link_tag.attrs["href"]
# Handles <a> tags that link to an element: hrefs of the form
# "<file>.(htm|html|xhtml)#<id>" or "#<id>".
# The target document is resolved with self.match_href_to_path_from_toc, or taken
# as the current document when the part before "#" is empty. When an anchor tag is
# found the link gets a "placeholder" attribute of the form
# {{tempStyleToAnchor-<id>}} and loses its href; otherwise the link is marked
# with converter-mark="bad-link" and an error is logged.
# NOTE(review): this span is a rendered diff with the +/- markers and the
# indentation stripped. Lines using `toc_href` / `soup` / `a_tag_*` / `new_id`
# appear to be the pre-commit version; lines using `html_href_from_toc` /
# `html_content` / `href_` / `id_` / `new_unique_id` appear to be the
# replacement -- confirm against the repository before reusing this as code.
def process_file_element_anchor():
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
# process_file_element_anchor
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
# unpacking into two names needs exactly one "#" in the href; more raises ValueError
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
"#")
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
else path.normpath(toc_href).replace("\\", "/")
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, a_tag_id)
# Takes no parameters: anchor_tag, html_content and new_unique_id are not defined
# here and are only assigned by the loop below, so this must run after them.
# NOTE(review): presumably nested inside process_file_element_anchor so the names
# resolve through the enclosing scope -- nesting cannot be seen here.
def span_creation():
# membership is tested with the anchor tag's current id, while the value
# recorded in self.internal_anchors is new_unique_id
if anchor_tag.attrs["id"] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(
html_content, new_unique_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_unique_id)
# NOTE(review): not visible whether this deletion sits inside the `if` above -- confirm
del anchor_tag.attrs["id"]
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
anchor_soup.find_all(
attrs={"id": a_tag_id}) # if link is a footnote
for html_href_from_toc in self.hrefs_added_to_toc:
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
# process_file_element_anchor
for internal_link_tag in html_content.find_all("a",
{"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
# unpacking into two names needs exactly one "#" in the href; more raises ValueError
href_, id_ = internal_link_tag.attrs["href"].split("#")
html_href_of_anchor = self.match_href_to_path_from_toc(
html_href_from_toc, href_, internal_link_tag) if href_ \
else path.normpath(html_href_from_toc).replace("\\", "/") # the same page
if html_href_of_anchor:
new_unique_id = self.create_unique_id(html_href_of_anchor, id_)
anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
# look for the rewritten id first, then fall back to the original id
anchor_tags = anchor_html_content.find_all(attrs={"id": new_unique_id}) or \
anchor_html_content.find_all(attrs={"id": id_}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
self.logger.log(f"Warning in {html_href_from_toc}: multiple anchors:"
f"{len(anchor_tags)} found.\n"
f"{anchor_tags}\n"
f"While processing {internal_link_tag}")
# only the first match is used, even when several were found
anchor_tag = anchor_tags[0]
# sanity check only: assert statements are skipped when Python runs with -O
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
assert anchor_tag.attrs["id"] in [new_unique_id, id_]
# if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs["id"] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs["id"]
span_creation()
del internal_link_tag.attrs["href"]
else:
# no anchor found: flag the link and log both the rewritten and the original id
internal_link_tag.attrs["converter-mark"] = "bad-link"
self.logger.log(f"Error in {toc_href}."
self.logger.log(f"Error in {html_href_from_toc}."
f" While processing {internal_link_tag} no anchor found."
f" Should be anchor with new id={new_id} in"
f" {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
f" Should be anchor with new id={new_unique_id} in"
f" {html_href_of_anchor} file."
f" Old id={id_}")
# 1. make ids to be unique in all documents
make_ids_unique()
# 2a. process anchor which is a whole htm|html|xhtml file