add processing of JSON presets

This commit is contained in:
Kiryl
2022-07-07 19:32:24 +03:00
parent 687c09417a
commit c4752a19db
5 changed files with 497 additions and 417 deletions

View File

@@ -4,33 +4,34 @@ import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
prepare_title, prepare_content
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter:
def __init__(self, file_path, access=None, logger=None):
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
self.file_path = file_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor
self.html_preprocessor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -74,25 +75,15 @@ class EpubConverter:
self.process_inline_styles_in_html_soup()
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles adding.")
self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
# todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup)
self.footnotes_contents.extend(content)
self.noterefs.extend(noterefs)
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.footnotes_contents, self.noterefs, self.footnotes =\
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style)
self.css_processor.build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
@@ -181,16 +173,53 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content(
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content(
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Merge stylesheet rules into the inline ``style`` attributes of a page.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css
    """
    # The epub @namespace declaration causes problems downstream, so strip it.
    cleaned_css = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    # premailer folds the stylesheet rules into per-tag inline styles.
    inlined_html: str = transform(
        str(html_soup),
        css_text=cleaned_css,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    # Re-parse the transformed markup so the converted styles are queryable.
    inline_soup = BeautifulSoup(inlined_html, features="lxml")
    styled_tags = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": re.compile(".*")})
    # Normalize each tag's inline style (original + css-derived) into the
    # representation the platform expects.
    for styled_tag in styled_tags:
        TagInlineStyleProcessor(styled_tag).convert_initial_tag()
    return inline_soup
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css)
html_content = self.convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    Extract all tags belonging to the chapter that starts at ``first_id``.

    Starting from the tag whose id is ``first_id`` (a chapter-mark tag),
    collects every following sibling up to, but not including, the next
    chapter mark, removes those siblings from ``html_soup`` and returns them.

    Parameters
    ----------
    first_id: str
        Id that points to where a chapter starts. A tag with class:
        "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: BeautifulSoup
        Soup object of current file. Mutated: the collected tags are
        extracted from it and ``smooth()`` is applied afterwards.

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags, in document order

    Raises
    ------
    AssertionError
        If no tag with the given id and chapter-mark class exists.
    """
    chapter_mark = html_soup.find(
        attrs={"id": first_id, "class": "converter-chapter-mark"})
    if not chapter_mark:
        # Explicit raise instead of `assert 0` so the check survives `python -O`
        # while keeping the same exception type and message for callers.
        raise AssertionError(f"Warning: no match for {first_id, href}")
    tags = []
    next_tag = chapter_mark.next_sibling
    while next_tag:
        # bs4 exposes `class` as a multi-valued attribute (a list), so the
        # previous `== "converter-chapter-mark"` comparison never matched and
        # the loop ran past the next chapter mark; membership handles both the
        # list case and a plain-string class value.
        if not isinstance(next_tag, NavigableString) and \
                "converter-chapter-mark" in (next_tag.attrs.get("class") or []):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(
subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags:
new_tree.append(tag)
for subchapter_tag in subchapter_tags:
new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
self.detect_one_chapter(point)
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
@@ -561,9 +632,9 @@ class EpubConverter:
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
title_preprocessed = self.html_preprocessor.prepare_title(title)
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
@@ -598,11 +669,17 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../epub/9781641050234.epub"
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object)
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: