forked from LiveCarta/BookConverter
add processing of JSON presets
This commit is contained in:
@@ -4,33 +4,34 @@ import codecs
|
||||
import os
|
||||
from os.path import dirname, normpath, join
|
||||
from itertools import chain
|
||||
from premailer import transform
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Union, List
|
||||
|
||||
|
||||
import ebooklib
|
||||
from ebooklib import epub
|
||||
from ebooklib.epub import Link, Section
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.preset_processor import PresetProcessor
|
||||
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.epub_converter.image_processing import update_images_src_links
|
||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
|
||||
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
|
||||
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
|
||||
prepare_title, prepare_content
|
||||
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
|
||||
|
||||
|
||||
class EpubConverter:
|
||||
def __init__(self, file_path, access=None, logger=None):
|
||||
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
|
||||
self.file_path = file_path
|
||||
self.access = access
|
||||
self.logger: BookLogger = logger
|
||||
self.ebooklib_book = epub.read_epub(file_path)
|
||||
self.css_processor = css_preprocessor
|
||||
self.html_preprocessor = html_processor
|
||||
|
||||
# main container for all epub .xhtml files
|
||||
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
||||
@@ -74,25 +75,15 @@ class EpubConverter:
|
||||
self.process_inline_styles_in_html_soup()
|
||||
self.logger.log("CSS files processing.")
|
||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||
self.logger.log("CSS styles adding.")
|
||||
self.logger.log("CSS styles adding.")
|
||||
self.add_css_styles_to_html_soup()
|
||||
|
||||
# todo presets
|
||||
|
||||
self.logger.log("Footnotes processing.")
|
||||
for href in self.html_href2html_body_soup:
|
||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
|
||||
self.html_href2html_body_soup)
|
||||
self.footnotes_contents.extend(content)
|
||||
self.noterefs.extend(noterefs)
|
||||
self.footnotes.extend(footnotes_tags)
|
||||
|
||||
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
|
||||
noteref.attrs["data-id"] = i + 1
|
||||
noteref.attrs["id"] = f"footnote-{i + 1}"
|
||||
footnote.attrs["href"] = f"#footnote-{i + 1}"
|
||||
|
||||
self.footnotes_contents, self.noterefs, self.footnotes =\
|
||||
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
|
||||
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
|
||||
|
||||
self.logger.log("TOC processing.")
|
||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||
# build simple toc from spine if needed
|
||||
@@ -101,6 +92,7 @@ class EpubConverter:
|
||||
not_added = [
|
||||
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
|
||||
self.logger.log(f"Html documents not added to TOC: {not_added}.")
|
||||
self.logger.log(f"Add documents not added to TOC.")
|
||||
self.add_not_added_files_to_adjacency_list(not_added)
|
||||
self.logger.log(f"Html internal links and structure processing.")
|
||||
self.label_chapters_ids_with_lc_id()
|
||||
@@ -149,7 +141,7 @@ class EpubConverter:
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
inline_style = tag_initial_inline_style.attrs["style"]
|
||||
tag_initial_inline_style.attrs["style"] = \
|
||||
build_inline_style_content(inline_style)
|
||||
self.css_processor.build_inline_style_content(inline_style)
|
||||
|
||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||
"""
|
||||
@@ -181,16 +173,53 @@ class EpubConverter:
|
||||
html_href2css_href[html_href].append(css_href)
|
||||
if css_href not in css_href2css_content:
|
||||
# css_href not in css_href2css_content, add to this dict
|
||||
css_href2css_content[css_href] = build_css_file_content(
|
||||
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
|
||||
self.get_css_content(css_href, html_href))
|
||||
|
||||
for i, tag in enumerate(soup_html_content.find_all("style")):
|
||||
css_content = tag.string
|
||||
html_href2css_href[html_href].append(f"href{i}")
|
||||
css_href2css_content[f"href{i}"] = build_css_file_content(
|
||||
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
|
||||
css_content)
|
||||
return html_href2css_href, css_href2css_content
|
||||
|
||||
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Merge stylesheet rules into the inline ``style`` attributes of a page.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page, possibly already carrying inline styles
    css_text: str
        css content collected for this page

    Returns
    -------
    inline_soup: BeautifulSoup
        freshly parsed soup whose tags carry the styles from css inline

    """
    # the epub namespace declaration causes problems, so it is dropped
    # before the css is handed over for inlining
    epub_namespace_rule = '@namespace epub "http://www.idpf.org/2007/ops";'
    cleaned_css = css_text.replace(epub_namespace_rule, '')

    # inline the css rules into the markup
    inlining_options = dict(
        css_text=cleaned_css,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    inlined_markup: str = transform(str(html_soup), **inlining_options)

    # re-parse the markup that now carries the converted styles
    inline_soup = BeautifulSoup(inlined_markup, features="lxml")

    # every supported tag that ended up with a style attribute
    # (initial inline style + style parsed from css) gets converted
    any_style_value = re.compile(".*")
    styled_tags = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": any_style_value})
    for styled_tag in styled_tags:
        TagInlineStyleProcessor(styled_tag).convert_initial_tag()

    return inline_soup
|
||||
|
||||
def add_css_styles_to_html_soup(self):
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
@@ -203,7 +232,7 @@ class EpubConverter:
|
||||
for css_href in self.html_href2css_href[html_href]:
|
||||
css += self.css_href2css_content[css_href]
|
||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||
html_content = convert_html_soup_with_css_style(html_content, css)
|
||||
html_content = self.convert_html_soup_with_css_style(html_content, css)
|
||||
self.html_href2html_body_soup[html_href] = html_content
|
||||
|
||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||
@@ -488,6 +517,48 @@ class EpubConverter:
|
||||
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
|
||||
f" Old id={a_tag_id}")
|
||||
|
||||
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    Extract from html_soup every node that belongs to the chapter
    starting at the chapter mark with id first_id.

    The chapter consists of all siblings that follow the mark, up to
    (not including) the next sibling marked as a chapter start. The
    collected nodes are removed from html_soup.

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags, detached from html_soup

    Raises
    ------
    AssertionError
        when html_soup holds no chapter mark with id first_id

    """
    chapter_mark_class = "converter-chapter-mark"

    def is_chapter_mark(node) -> bool:
        """Tell whether node is a tag carrying the chapter mark class."""
        # text nodes have no attributes, so they can never be a mark
        if isinstance(node, NavigableString):
            return False
        class_value = node.attrs.get("class")
        # "class" is a plain string when it was assigned through attrs,
        # but a list of class names when it comes from parsed markup;
        # accept both, the same way the find() lookup below does
        if isinstance(class_value, (list, tuple)):
            return chapter_mark_class in class_value
        return class_value == chapter_mark_class

    start_mark = html_soup.find(
        attrs={"id": first_id, "class": chapter_mark_class})
    if start_mark is None:
        # raised explicitly instead of "assert 0": an assert statement is
        # stripped under "python -O", which would let execution fall
        # through to the return with nothing collected
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    next_tag = start_mark.next_sibling
    # compare against None: an empty text node is falsy and must not
    # end the chapter early
    while next_tag is not None and not is_chapter_mark(next_tag):
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()

    return tags
|
||||
|
||||
def detect_one_chapter(self, nav_point: NavPoint):
|
||||
"""
|
||||
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||
@@ -511,11 +582,11 @@ class EpubConverter:
|
||||
"""
|
||||
if nav_point.id:
|
||||
soup = self.html_href2html_body_soup[nav_point.href]
|
||||
chapter_tags = get_tags_between_chapter_marks(
|
||||
subchapter_tags = self.get_tags_between_chapter_marks(
|
||||
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
||||
new_tree = BeautifulSoup("", "html.parser")
|
||||
for tag in chapter_tags:
|
||||
new_tree.append(tag)
|
||||
for subchapter_tag in subchapter_tags:
|
||||
new_tree.append(subchapter_tag)
|
||||
self.href_chapter_id2soup_html[(
|
||||
nav_point.href, nav_point.id)] = new_tree
|
||||
|
||||
@@ -527,8 +598,8 @@ class EpubConverter:
|
||||
"""Function build chapters content, starts from top level chapters"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
for point in top_level_nav_points:
|
||||
self.detect_one_chapter(point)
|
||||
for tl_nav_point in top_level_nav_points:
|
||||
self.detect_one_chapter(tl_nav_point)
|
||||
|
||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
"""
|
||||
@@ -561,9 +632,9 @@ class EpubConverter:
|
||||
if hasattr(self.file_path, "stem") else "book_id")
|
||||
|
||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
title_preprocessed = prepare_title(title)
|
||||
content_preprocessed = prepare_content(title_preprocessed, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
title_preprocessed = self.html_preprocessor.prepare_title(title)
|
||||
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
sub_nodes = []
|
||||
# warning! not EpubHtmlItems won't be added to chapter
|
||||
# if it doesn't have subchapters
|
||||
@@ -598,11 +669,17 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../epub/9781641050234.epub"
|
||||
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
json_converter = EpubConverter(epub_file_path, logger=logger_object)
|
||||
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
|
||||
.get_preset_json()
|
||||
css_preprocessor = CSSPreprocessor(logger=logger_object)
|
||||
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
|
||||
|
||||
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
||||
|
||||
Reference in New Issue
Block a user