rewrite css_processor.py

2022-07-08 18:35:34 +03:00
parent c4752a19db
commit 1926377a34
3 changed files with 78 additions and 80 deletions
--- a/src/epub_converter/css_preprocessor.py
+++ b/src/epub_converter/css_preprocessor.py
@@ -1,14 +1,14 @@
 import re
 import cssutils
+from bs4 import BeautifulSoup
+from os.path import dirname, normpath, join

-from src.util.helpers import BookLogger
 from src.util.color_reader import str2hex
 from src.livecarta_config import LiveCartaConfig


 class CSSPreprocessor:
-    def __init__(self, logger=None):
-        self.logger: BookLogger = logger
+    def __init__(self):
        """
        Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }

@@ -99,12 +99,8 @@ class CSSPreprocessor:
        size_value: str

        """
-        if len(size_value.split(" ")) == 3:
-            size_value = self.convert_tag_style_values(size_value.split(
-                " ")[-2], True)  # returns middle value
-        else:
-            size_value = self.convert_tag_style_values(size_value.split(
-                " ")[-1], True)  # returns last value
+        size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
+            else self.convert_tag_style_values(size_value.split(" ")[-1], True)
        return size_value

    @staticmethod
@@ -152,10 +148,37 @@ class CSSPreprocessor:
        style = "; ".join(split_style)
        return style

+    def process_inline_styles_in_html_soup(self, html_href2html_body_soup):
+        """This function is designed to convert inline html styles"""
+        for html_href in html_href2html_body_soup:
+            html_content: BeautifulSoup = html_href2html_body_soup[html_href]
+            tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
+                                                           attrs={"style": re.compile(".*")})
+
+            for tag_initial_inline_style in tags_with_inline_style:
+                inline_style = tag_initial_inline_style.attrs["style"]
+                tag_initial_inline_style.attrs["style"] = \
+                    self.build_inline_style_content(inline_style)
+
+    @staticmethod
+    def get_css_content(css_href, html_href, ebooklib_book):
+        path_to_css_from_html = css_href
+        html_folder = dirname(html_href)
+        path_to_css_from_root = normpath(
+            join(html_folder, path_to_css_from_html)).replace("\\", "/")
+        css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
+        # if the css file @imports another css file, load the imported file instead
+        if "@import" in str(css_obj.content):
+            path_to_css_from_root = "css/" + \
+                re.search("'(.*)'", str(css_obj.content)).group(1)
+            css_obj = ebooklib_book.get_item_with_href(
+                path_to_css_from_root)
+        assert css_obj, f"Css style {css_href} was not in manifest."
+        css_content: str = css_obj.get_content().decode()
+        return css_content
+
    def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
                                                  style_type: cssutils.css.property.Property):
-        if style_type.name == "font-family":
-            pass
        if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove from css file
            css_rule.style[style_type.name] = ""
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -2,7 +2,6 @@ import re
 import json
 import codecs
 import os
-from os.path import dirname, normpath, join
 from itertools import chain
 from premailer import transform
 from collections import defaultdict
@@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag

 from src.util.helpers import BookLogger
 from src.preset_processor import PresetProcessor
-from src.epub_converter.css_preprocessor import CSSPreprocessor
-from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
+from src.epub_converter.css_processor import CSSPreprocessor
+from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.epub_converter.image_processing import update_images_src_links
@@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso


 class EpubConverter:
-    def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
+    def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None):
        self.file_path = file_path
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file_path)
-        self.css_processor = css_preprocessor
-        self.html_preprocessor = html_processor
+        self.css_processor = css_processor
+        self.html_processor = html_processor

        # main container for all epub .xhtml files
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
        # enumerate all subchapter id for each file
-        self.html_href2subchapter_ids = defaultdict(list)
+        self.html_href2subchapters_ids = defaultdict(list)
        self.hrefs_added_to_toc = set()  # enumerate all file paths that where added to TOC

        # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
@@ -71,17 +70,18 @@ class EpubConverter:
        self.html_href2html_body_soup: Dict[str,
                                            BeautifulSoup] = self.build_href2soup_content()

-        self.logger.log("Process CSS inline styles.")
-        self.process_inline_styles_in_html_soup()
+        self.logger.log("CSS inline style processing.")
+        self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
        self.logger.log("CSS files processing.")
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
-        self.logger.log("CSS  styles adding.")
+        self.logger.log("CSS styles fusion(inline+file).")
        self.add_css_styles_to_html_soup()

        self.logger.log("Footnotes processing.")
        for href in self.html_href2html_body_soup:
            self.footnotes_contents, self.noterefs, self.footnotes =\
-                preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
+                preprocess_footnotes(
+                    self.html_href2html_body_soup[href], self.html_href2html_body_soup)
        self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")

        self.logger.log("TOC processing.")
@@ -115,34 +115,6 @@ class EpubConverter:
            nodes[item.file_name] = soup
        return nodes

-    def get_css_content(self, css_href, html_href):
-        path_to_css_from_html = css_href
-        html_folder = dirname(html_href)
-        path_to_css_from_root = normpath(
-            join(html_folder, path_to_css_from_html)).replace("\\", "/")
-        css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
-        # if in css file we import another css
-        if "@import" in str(css_obj.content):
-            path_to_css_from_root = "css/" + \
-                re.search("'(.*)'", str(css_obj.content)).group(1)
-            css_obj = self.ebooklib_book.get_item_with_href(
-                path_to_css_from_root)
-        assert css_obj, f"Css style {css_href} was not in manifest."
-        css_content: str = css_obj.get_content().decode()
-        return css_content
-
-    def process_inline_styles_in_html_soup(self):
-        """This function is designed to convert inline html styles"""
-        for html_href in self.html_href2html_body_soup:
-            html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
-            tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
-                                                           attrs={"style": re.compile(".*")})
-
-            for tag_initial_inline_style in tags_with_inline_style:
-                inline_style = tag_initial_inline_style.attrs["style"]
-                tag_initial_inline_style.attrs["style"] = \
-                    self.css_processor.build_inline_style_content(inline_style)
-
    def build_html_and_css_relations(self) -> tuple[dict, dict]:
        """
        Function is designed to get 2 dictionaries:
@@ -174,7 +146,7 @@ class EpubConverter:
                if css_href not in css_href2css_content:
                    # css_href not in css_href2css_content, add to this dict
                    css_href2css_content[css_href] = self.css_processor.build_css_file_content(
-                        self.get_css_content(css_href, html_href))
+                        self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))

            for i, tag in enumerate(soup_html_content.find_all("style")):
                css_content = tag.string
@@ -183,7 +155,8 @@ class EpubConverter:
                    css_content)
        return html_href2css_href, css_href2css_content

-    def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
+    @staticmethod
+    def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
        """
        Function adds styles from .css to inline style.
        Parameters
@@ -224,7 +197,10 @@ class EpubConverter:
        """
        This function is designed to update html_href2html_body_soup
        - add to html_inline_style css_style_content
-
+        Returns
+        -------
+        None
+            updated soups with styles from css
        """
        for html_href in self.html_href2html_body_soup:
            if self.html_href2css_href.get(html_href):
@@ -232,7 +208,8 @@ class EpubConverter:
                for css_href in self.html_href2css_href[html_href]:
                    css += self.css_href2css_content[css_href]
                html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
-                html_content = self.convert_html_soup_with_css_style(html_content, css)
+                html_content = self.modify_html_soup_with_css_styles(
+                    html_content, css)
                self.html_href2html_body_soup[html_href] = html_content

    def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -259,7 +236,7 @@ class EpubConverter:
            nav_point = NavPoint(element)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
-                self.html_href2subchapter_ids[nav_point.href].append(
+                self.html_href2subchapters_ids[nav_point.href].append(
                    nav_point.id)
            self.adjacency_list[nav_point] = None
            self.hrefs_added_to_toc.add(nav_point.href)
@@ -271,7 +248,7 @@ class EpubConverter:
            nav_point = NavPoint(first)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
-                self.html_href2subchapter_ids[nav_point.href].append(
+                self.html_href2subchapters_ids[nav_point.href].append(
                    nav_point.id)

            sub_nodes = []
@@ -357,25 +334,19 @@ class EpubConverter:
        for html_href in self.html_href2html_body_soup:
            chapter_tag = self.html_href2html_body_soup[html_href]
            # check marks for chapter starting are on the same level - 1st
-            marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})
+            marks = chapter_tag.find_all(
+                attrs={"class": "converter-chapter-mark"})

            # fix marks to be on 1 level
            for mark in marks:
                while mark.parent != chapter_tag:
-                    mark.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases
+                    # TODO: warning! unwrapping the parent could affect formatting/internal links in some cases
+                    mark.parent.unwrap()

    @staticmethod
    def create_unique_id(href, id_):
        return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)

-    @staticmethod
-    def create_new_anchor_span(soup, id_):
-        new_anchor_span = soup.new_tag("span")
-        new_anchor_span.attrs["id"] = id_
-        new_anchor_span.attrs["class"] = "link-anchor"
-        new_anchor_span.string = "\xa0"
-        return new_anchor_span
-
    def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
        """
        Function used to find full path to file that is parsed from tag link
@@ -414,6 +385,14 @@ class EpubConverter:

        return full_path[0]

+    @staticmethod
+    def create_new_anchor_span(soup, id_):
+        new_anchor_span = soup.new_tag("span")
+        new_anchor_span.attrs["id"] = id_
+        new_anchor_span.attrs["class"] = "link-anchor"
+        new_anchor_span.string = "\xa0"
+        return new_anchor_span
+
    def process_internal_links(self):
        """
        Function
@@ -520,8 +499,7 @@ class EpubConverter:
    @staticmethod
    def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
        """
-        After processing on a first_id that corresponds to current chapter,
-        from initial html_soup all tags from current chapter are extracted
+        Get tags between LiveCarta chapter marks
        Parameters
        ----------
        first_id: str
@@ -553,7 +531,6 @@ class EpubConverter:
            # save them in list for next steps
            tags = [tag.extract() for tag in tags]
            html_soup.smooth()
-
        else:
            assert 0, f"Warning: no match for {first_id, href}"

@@ -594,7 +571,7 @@ class EpubConverter:
            for sub_node in self.adjacency_list[nav_point]:
                self.detect_one_chapter(sub_node)

-    def define_chapters_content(self):
+    def define_chapters_with_content(self):
        """Function build chapters content, starts from top level chapters"""
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
@@ -618,11 +595,9 @@ class EpubConverter:

        """
        title = nav_point.title
-        if nav_point.id:
-            content: BeautifulSoup = self.href_chapter_id2soup_html[(
-                nav_point.href, nav_point.id)]
-        else:
-            content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
+        content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
+            if nav_point.id else self.html_href2html_body_soup[nav_point.href]
+
        self.book_image_src_path2aws_path = update_images_src_links(content,
                                                                    self.img_href2img_bytes,
                                                                    path_to_html=nav_point.href,
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -1,7 +1,7 @@
 from src.book_solver import BookSolver
 from src.preset_processor import PresetProcessor
-from src.epub_converter.css_preprocessor import CSSPreprocessor
-from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
+from src.epub_converter.css_processor import CSSPreprocessor
+from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
 from src.epub_converter.epub_converter import EpubConverter


@@ -30,10 +30,10 @@ class EpubBook(BookSolver):
        """
        preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
            .get_preset_json()
-        css_preprocessor = CSSPreprocessor(logger=self.logger_object)
-        html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
+        css_processor = CSSPreprocessor()
+        html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
        json_converter = EpubConverter(
            self.file_path, access=self.access, logger=self.logger_object,
-            css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
+            css_processor=css_processor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
        return content_dict