add processing of JSON presets

This commit is contained in:
Kiryl
2022-07-07 19:32:24 +03:00
parent 687c09417a
commit c4752a19db
5 changed files with 497 additions and 417 deletions

View File

@@ -4,33 +4,34 @@ import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
prepare_title, prepare_content
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter:
def __init__(self, file_path, access=None, logger=None):
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
self.file_path = file_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor
self.html_preprocessor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -74,25 +75,15 @@ class EpubConverter:
self.process_inline_styles_in_html_soup()
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles adding.")
self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
# todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup)
self.footnotes_contents.extend(content)
self.noterefs.extend(noterefs)
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.footnotes_contents, self.noterefs, self.footnotes =\
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style)
self.css_processor.build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
@@ -181,16 +173,53 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content(
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content(
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Merge stylesheet rules into the inline ``style`` attributes of a page.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css
    """
    # The epub @namespace declaration causes problems downstream, so strip it.
    cleaned_css = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    # premailer folds the stylesheet rules into per-tag inline styles.
    inlined_html: str = transform(
        str(html_soup),
        css_text=cleaned_css,
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    # Re-parse the transformed markup so the converted styles are queryable.
    inline_soup = BeautifulSoup(inlined_html, features="lxml")
    styled_tags = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": re.compile(".*")})
    # Normalize each tag's inline style (original + css-derived) into the
    # representation the platform expects.
    for styled_tag in styled_tags:
        TagInlineStyleProcessor(styled_tag).convert_initial_tag()
    return inline_soup
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css)
html_content = self.convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    Extract all tags belonging to the chapter that starts at ``first_id``.

    Starting from the tag whose id is ``first_id`` (a chapter-mark tag),
    collects every following sibling up to, but not including, the next
    chapter mark, removes those siblings from ``html_soup`` and returns them.

    Parameters
    ----------
    first_id: str
        Id that points to where a chapter starts. A tag with class:
        "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: BeautifulSoup
        Soup object of current file. Mutated: the collected tags are
        extracted from it and ``smooth()`` is applied afterwards.

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags, in document order

    Raises
    ------
    AssertionError
        If no tag with the given id and chapter-mark class exists.
    """
    chapter_mark = html_soup.find(
        attrs={"id": first_id, "class": "converter-chapter-mark"})
    if not chapter_mark:
        # Explicit raise instead of `assert 0` so the check survives `python -O`
        # while keeping the same exception type and message for callers.
        raise AssertionError(f"Warning: no match for {first_id, href}")
    tags = []
    next_tag = chapter_mark.next_sibling
    while next_tag:
        # bs4 exposes `class` as a multi-valued attribute (a list), so the
        # previous `== "converter-chapter-mark"` comparison never matched and
        # the loop ran past the next chapter mark; membership handles both the
        # list case and a plain-string class value.
        if not isinstance(next_tag, NavigableString) and \
                "converter-chapter-mark" in (next_tag.attrs.get("class") or []):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(
subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags:
new_tree.append(tag)
for subchapter_tag in subchapter_tags:
new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
self.detect_one_chapter(point)
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
@@ -561,9 +632,9 @@ class EpubConverter:
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
title_preprocessed = self.html_preprocessor.prepare_title(title)
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
@@ -598,11 +669,17 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../epub/9781641050234.epub"
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object)
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: