forked from LiveCarta/BookConverter
rewrite process internal links
This commit is contained in:
@@ -94,13 +94,15 @@ class EpubConverter:
|
|||||||
self.logger.log(f"Html documents not added to TOC: {not_added}.")
|
self.logger.log(f"Html documents not added to TOC: {not_added}.")
|
||||||
self.logger.log(f"Add documents not added to TOC.")
|
self.logger.log(f"Add documents not added to TOC.")
|
||||||
self.add_not_added_files_to_adjacency_list(not_added)
|
self.add_not_added_files_to_adjacency_list(not_added)
|
||||||
self.logger.log(f"Html internal links and structure processing.")
|
self.logger.log(f"Label subchapters with converter tag.")
|
||||||
self.label_chapters_ids_with_lc_id()
|
self.label_subchapters_with_lc_tag()
|
||||||
self.chapter_marks_are_same_level()
|
self.logger.log(f"Process html internal links.")
|
||||||
# used only after parsed toc, ids from toc needed
|
|
||||||
self.process_internal_links()
|
self.process_internal_links()
|
||||||
|
self.logger.log(
|
||||||
|
f"Check if converter-chapter-marks are on the same level.")
|
||||||
|
self.chapter_marks_are_same_level()
|
||||||
self.logger.log(f"Define chapters content.")
|
self.logger.log(f"Define chapters content.")
|
||||||
self.define_chapters_content()
|
self.define_chapters_with_content()
|
||||||
self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
|
self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
|
||||||
|
|
||||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||||
@@ -286,14 +288,14 @@ class EpubConverter:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def build_manifest_id2html_href(self) -> dict:
|
|
||||||
links = dict()
|
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
|
||||||
links[item.id] = item.file_name
|
|
||||||
return links
|
|
||||||
|
|
||||||
def build_adjacency_list_from_spine(self):
|
def build_adjacency_list_from_spine(self):
|
||||||
manifest_id2html_href = self.build_manifest_id2html_href()
|
def build_manifest_id2html_href() -> dict:
|
||||||
|
links = dict()
|
||||||
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
|
links[item.id] = item.file_name
|
||||||
|
return links
|
||||||
|
|
||||||
|
manifest_id2html_href = build_manifest_id2html_href()
|
||||||
self.adjacency_list = {
|
self.adjacency_list = {
|
||||||
-1: []
|
-1: []
|
||||||
}
|
}
|
||||||
@@ -311,16 +313,16 @@ class EpubConverter:
|
|||||||
self.adjacency_list[-1].append(nav_point)
|
self.adjacency_list[-1].append(nav_point)
|
||||||
self.hrefs_added_to_toc.add(file)
|
self.hrefs_added_to_toc.add(file)
|
||||||
|
|
||||||
def label_chapters_ids_with_lc_id(self):
|
def label_subchapters_with_lc_tag(self):
|
||||||
for html_href in self.html_href2html_body_soup:
|
for html_href in self.html_href2html_body_soup:
|
||||||
ids = self.html_href2subchapter_ids[html_href]
|
ids, soup = self.html_href2subchapters_ids[html_href], \
|
||||||
|
self.html_href2html_body_soup[html_href]
|
||||||
for i in ids:
|
for i in ids:
|
||||||
soup = self.html_href2html_body_soup[html_href]
|
|
||||||
tag = soup.find(id=i)
|
tag = soup.find(id=i)
|
||||||
new_h = soup.new_tag("tmp")
|
tmp_tag = soup.new_tag("lc_tmp")
|
||||||
new_h.attrs["class"] = "converter-chapter-mark"
|
tmp_tag.attrs["class"] = "converter-chapter-mark"
|
||||||
new_h.attrs["id"] = i
|
tmp_tag.attrs["id"] = i
|
||||||
tag.insert_before(new_h)
|
tag.insert_before(tmp_tag)
|
||||||
|
|
||||||
def chapter_marks_are_same_level(self):
|
def chapter_marks_are_same_level(self):
|
||||||
"""
|
"""
|
||||||
@@ -401,8 +403,8 @@ class EpubConverter:
|
|||||||
Steps
|
Steps
|
||||||
----------
|
----------
|
||||||
1. rebuild ids to be unique in all documents
|
1. rebuild ids to be unique in all documents
|
||||||
2a. process anchor which is a whole xhtml file
|
2a. process anchor which is a whole htm|html|xhtml file
|
||||||
2b. process anchor which is an element in xhtml file
|
2b. process anchor which is an element in htm|html|xhtml file
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -410,91 +412,80 @@ class EpubConverter:
|
|||||||
process links in html
|
process links in html
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# 1. rebuild ids to be unique in all documents
|
def make_ids_unique():
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
|
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
|
||||||
if tag.attrs.get("class") == "converter-chapter-mark":
|
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
|
||||||
continue
|
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
|
||||||
|
tag.attrs["id"] = new_id
|
||||||
|
|
||||||
if tag.attrs.get("class") == "footnote-element":
|
def process_file_anchor():
|
||||||
continue
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
|
for internal_link_tag in soup.find_all("a",
|
||||||
|
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
|
||||||
|
a_tag_href = internal_link_tag.attrs["href"]
|
||||||
|
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
||||||
|
toc_href, a_tag_href, internal_link_tag)
|
||||||
|
if a_tag_href_matched_to_toc:
|
||||||
|
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
|
||||||
|
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
||||||
|
if new_id not in self.internal_anchors:
|
||||||
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
|
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
||||||
|
# insert a new span to the beginning of the file
|
||||||
|
anchor_soup.insert(0, new_anchor_span)
|
||||||
|
self.internal_anchors.add(new_id)
|
||||||
|
del internal_link_tag.attrs["href"]
|
||||||
|
|
||||||
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
|
def process_file_element_anchor():
|
||||||
tag.attrs["id"] = new_id
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
|
# process_file_element_anchor
|
||||||
|
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
|
||||||
|
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
|
||||||
|
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
||||||
|
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
|
||||||
|
else os.path.normpath(toc_href).replace("\\", "/")
|
||||||
|
if a_tag_href_matched_to_toc:
|
||||||
|
new_id = self.create_unique_id(
|
||||||
|
a_tag_href_matched_to_toc, a_tag_id)
|
||||||
|
|
||||||
# 2a. process anchor which is a whole xhtml file
|
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
||||||
internal_link_reg1 = re.compile(
|
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
|
||||||
r"(^(?!https?://).+\.(htm|html|xhtml)$)")
|
anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
if anchor_tags:
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
if len(anchor_tags) > 1:
|
||||||
for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}):
|
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
|
||||||
a_tag_href = internal_link_tag.attrs["href"]
|
f"{len(anchor_tags)} found.\n"
|
||||||
# find full path
|
f"{anchor_tags}\n"
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
f"While processing {internal_link_tag}")
|
||||||
toc_href, a_tag_href, internal_link_tag)
|
|
||||||
if not a_tag_href_matched_to_toc:
|
|
||||||
continue
|
|
||||||
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
|
|
||||||
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
|
||||||
if new_id not in self.internal_anchors:
|
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
|
||||||
new_anchor_span = self.create_new_anchor_span(soup, new_id)
|
|
||||||
# insert a new span to the beginning of the file
|
|
||||||
anchor_soup.insert(0, new_anchor_span)
|
|
||||||
self.internal_anchors.add(new_id)
|
|
||||||
|
|
||||||
del internal_link_tag.attrs["href"]
|
anchor_tag = anchor_tags[0]
|
||||||
|
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
|
||||||
# 2b. process anchor which is an element in xhtml file
|
# if anchor is found we could add placeholder for link creation on server side.
|
||||||
internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")
|
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
# create span to have cyclic links, link has 1 type of class, anchor another
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
if anchor_tag.attrs["id"] not in self.internal_anchors:
|
||||||
for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}):
|
new_anchor_span = self.create_new_anchor_span(
|
||||||
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
|
soup, new_id)
|
||||||
"#")
|
anchor_tag.insert_before(new_anchor_span)
|
||||||
# find full path
|
self.internal_anchors.add(new_id)
|
||||||
if a_tag_href:
|
del anchor_tag.attrs["id"]
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
|
del internal_link_tag.attrs["href"]
|
||||||
internal_link_tag)
|
else:
|
||||||
else:
|
internal_link_tag.attrs["converter-mark"] = "bad-link"
|
||||||
a_tag_href_matched_to_toc = os.path.normpath(
|
self.logger.log(f"Error in {toc_href}."
|
||||||
toc_href).replace("\\", "/")
|
f" While processing {internal_link_tag} no anchor found."
|
||||||
|
f" Should be anchor with new id={new_id} in"
|
||||||
if not a_tag_href_matched_to_toc:
|
f" {a_tag_href_matched_to_toc} file."
|
||||||
continue
|
f" Old id={a_tag_id}")
|
||||||
|
# 1. make ids to be unique in all documents
|
||||||
new_id = self.create_unique_id(
|
make_ids_unique()
|
||||||
a_tag_href_matched_to_toc, a_tag_id)
|
# 2a. process anchor which is a whole htm|html|xhtml file
|
||||||
|
process_file_anchor()
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
# 2b. process anchor which is an element in htm|html|xhtml file
|
||||||
anchor_tags = anchor_soup.find_all(attrs={"id": new_id, })
|
process_file_element_anchor()
|
||||||
anchor_tags = anchor_tags or anchor_soup.find_all(
|
|
||||||
attrs={"id": a_tag_id}) # if link is a footnote
|
|
||||||
|
|
||||||
if anchor_tags:
|
|
||||||
if len(anchor_tags) > 1:
|
|
||||||
self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n"
|
|
||||||
f"{anchor_tags}\n"
|
|
||||||
f" While processing {internal_link_tag}")
|
|
||||||
|
|
||||||
anchor_tag = anchor_tags[0]
|
|
||||||
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
|
|
||||||
# if anchor is found we could add placeholder for link creation on server side.
|
|
||||||
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
|
||||||
# create span to have cyclic links, link has 1 type of class, anchor another
|
|
||||||
if anchor_tag.attrs["id"] not in self.internal_anchors:
|
|
||||||
new_anchor_span = self.create_new_anchor_span(
|
|
||||||
soup, new_id)
|
|
||||||
anchor_tag.insert_before(new_anchor_span)
|
|
||||||
self.internal_anchors.add(new_id)
|
|
||||||
del anchor_tag.attrs["id"]
|
|
||||||
del internal_link_tag.attrs["href"]
|
|
||||||
|
|
||||||
else:
|
|
||||||
internal_link_tag.attrs["converter-mark"] = "bad-link"
|
|
||||||
self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found."
|
|
||||||
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
|
|
||||||
f" Old id={a_tag_id}")
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||||
@@ -606,10 +597,14 @@ class EpubConverter:
|
|||||||
book_id=self.file_path.stem
|
book_id=self.file_path.stem
|
||||||
if hasattr(self.file_path, "stem") else "book_id")
|
if hasattr(self.file_path, "stem") else "book_id")
|
||||||
|
|
||||||
|
indent = " " * lvl
|
||||||
|
self.logger.log(indent + f"Chapter: {title} is processing.")
|
||||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
title_preprocessed = self.html_preprocessor.prepare_title(title)
|
self.logger.log(indent + "Process title.")
|
||||||
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
|
title_preprocessed = self.html_processor.prepare_title(title)
|
||||||
remove_title_from_chapter=is_chapter)
|
self.logger.log(indent + "Process content.")
|
||||||
|
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
|
||||||
|
remove_title_from_chapter=is_chapter)
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
# warning! not EpubHtmlItems won't be added to chapter
|
# warning! not EpubHtmlItems won't be added to chapter
|
||||||
# if it doesn't have subchapters
|
# if it doesn't have subchapters
|
||||||
@@ -618,10 +613,6 @@ class EpubConverter:
|
|||||||
sub_chapter_item = self.html_node_to_livecarta_chapter_item(
|
sub_chapter_item = self.html_node_to_livecarta_chapter_item(
|
||||||
sub_node, lvl + 1)
|
sub_node, lvl + 1)
|
||||||
sub_nodes.append(sub_chapter_item)
|
sub_nodes.append(sub_chapter_item)
|
||||||
|
|
||||||
if self.logger:
|
|
||||||
indent = " " * lvl
|
|
||||||
self.logger.log(f"{indent}Chapter: {title} is prepared.")
|
|
||||||
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
||||||
|
|
||||||
def convert_to_dict(self) -> dict:
|
def convert_to_dict(self) -> dict:
|
||||||
@@ -644,17 +635,18 @@ class EpubConverter:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
|
epub_file_path = "../../epub/9781641050234.epub"
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||||
|
|
||||||
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
|
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
|
||||||
.get_preset_json()
|
.get_preset_json()
|
||||||
css_preprocessor = CSSPreprocessor(logger=logger_object)
|
css_processor = CSSPreprocessor()
|
||||||
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
|
html_processor = HtmlEpubPreprocessor(
|
||||||
|
preset=preset, logger=logger_object)
|
||||||
|
|
||||||
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||||
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
css_processor=css_processor, html_processor=html_processor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
||||||
|
|||||||
Reference in New Issue
Block a user