rewrite css_processor.py

2022-07-08 18:35:34 +03:00
parent c4752a19db
commit 1926377a34
3 changed files with 78 additions and 80 deletions
--- a/src/epub_converter/css_preprocessor.py
+++ b/src/epub_converter/css_preprocessor.py
@@ -1,14 +1,14 @@
 import re
 import cssutils
 from bs4 import BeautifulSoup
 from os.path import dirname, normpath, join
 from src.util.helpers import BookLogger
 from src.util.color_reader import str2hex
 from src.livecarta_config import LiveCartaConfig
 class CSSPreprocessor:
-    def __init__(self, logger=None):
+    def __init__(self):
        self.logger: BookLogger = logger
        """
        Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
@@ -99,12 +99,8 @@ class CSSPreprocessor:
        size_value: str
        """
-        if len(size_value.split(" ")) == 3:
+        size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
-            size_value = self.convert_tag_style_values(size_value.split(
+            else self.convert_tag_style_values(size_value.split(" ")[-1], True)
                " ")[-2], True)  # returns middle value
        else:
            size_value = self.convert_tag_style_values(size_value.split(
                " ")[-1], True)  # returns last value
        return size_value
    @staticmethod
@@ -152,10 +148,37 @@ class CSSPreprocessor:
        style = "; ".join(split_style)
        return style
    def process_inline_styles_in_html_soup(self, html_href2html_body_soup):
        """
        Function converts the inline "style" attribute of tags in every html soup
        Parameters
        ----------
        html_href2html_body_soup: dict
            mapping html_href -> soup of that html file

        Returns
        -------
        None
            "style" attribute of every matched tag is replaced with
            the result of build_inline_style_content

        """
        for soup in html_href2html_body_soup.values():
            # only tags matched by the LiveCarta regexp that carry a "style" attribute
            styled_tags = soup.find_all(
                LiveCartaConfig.could_have_style_in_livecarta_regexp,
                attrs={"style": re.compile(".*")})
            for styled_tag in styled_tags:
                styled_tag.attrs["style"] = self.build_inline_style_content(
                    styled_tag.attrs["style"])
    @staticmethod
    def get_css_content(css_href, html_href, ebooklib_book):
        path_to_css_from_html = css_href
        html_folder = dirname(html_href)
        path_to_css_from_root = normpath(
            join(html_folder, path_to_css_from_html)).replace("\\", "/")
        css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
        # if in css file we import another css
        if "@import" in str(css_obj.content):
            path_to_css_from_root = "css/" + \
                re.search("'(.*)'", str(css_obj.content)).group(1)
            css_obj = ebooklib_book.get_item_with_href(
                path_to_css_from_root)
        assert css_obj, f"Css style {css_href} was not in manifest."
        css_content: str = css_obj.get_content().decode()
        return css_content
    def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
                                                  style_type: cssutils.css.property.Property):
        if style_type.name == "font-family":
            pass
        if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove from css file
            css_rule.style[style_type.name] = ""
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -2,7 +2,6 @@ import re
 import json
 import codecs
 import os
 from os.path import dirname, normpath, join
 from itertools import chain
 from premailer import transform
 from collections import defaultdict
@@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag
 from src.util.helpers import BookLogger
 from src.preset_processor import PresetProcessor
-from src.epub_converter.css_preprocessor import CSSPreprocessor
+from src.epub_converter.css_processor import CSSPreprocessor
-from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
+from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.epub_converter.image_processing import update_images_src_links
@@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso
 class EpubConverter:
-    def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
+    def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None):
        self.file_path = file_path
        self.access = access
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file_path)
-        self.css_processor = css_preprocessor
+        self.css_processor = css_processor
-        self.html_preprocessor = html_processor
+        self.html_processor = html_processor
        # main container for all epub .xhtml files
        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
        # enumerate all subchapter id for each file
-        self.html_href2subchapter_ids = defaultdict(list)
+        self.html_href2subchapters_ids = defaultdict(list)
        self.hrefs_added_to_toc = set()  # enumerate all file paths that where added to TOC
        # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
@@ -71,17 +70,18 @@ class EpubConverter:
        self.html_href2html_body_soup: Dict[str,
                                            BeautifulSoup] = self.build_href2soup_content()
-        self.logger.log("Process CSS inline styles.")
+        self.logger.log("CSS inline style processing.")
-        self.process_inline_styles_in_html_soup()
+        self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
        self.logger.log("CSS files processing.")
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
-        self.logger.log("CSS  styles adding.")
+        self.logger.log("CSS styles fusion(inline+file).")
        self.add_css_styles_to_html_soup()
        self.logger.log("Footnotes processing.")
        for href in self.html_href2html_body_soup:
            self.footnotes_contents, self.noterefs, self.footnotes =\
-                preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
+                preprocess_footnotes(
                    self.html_href2html_body_soup[href], self.html_href2html_body_soup)
        self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
        self.logger.log("TOC processing.")
@@ -115,34 +115,6 @@ class EpubConverter:
            nodes[item.file_name] = soup
        return nodes
    def get_css_content(self, css_href, html_href):
        """
        Function reads the text of a css file that is referenced from an html file
        Parameters
        ----------
        css_href: str
            path to the css file as it is written in the html file
        html_href: str
            path to the html file that references the css file

        Returns
        -------
        css_content: str
            decoded content of the css item taken from self.ebooklib_book

        Raises
        ------
        AssertionError
            if the css item (or the css item it imports) was not found in the book

        """
        path_to_css_from_html = css_href
        html_folder = dirname(html_href)
        # css_href is joined with the folder of the html file, backslashes are normalized to "/"
        path_to_css_from_root = normpath(
            join(html_folder, path_to_css_from_html)).replace("\\", "/")
        css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
        # check the lookup result BEFORE css_obj.content is touched, otherwise
        # a missing item fails with AttributeError instead of the message below
        if not css_obj:
            raise AssertionError(f"Css style {css_href} was not in manifest.")
        # if in css file we import another css
        # NOTE(review): the pattern runs on str() of the raw content and the
        # imported file is looked up under "css/" - confirm both against real books
        if "@import" in str(css_obj.content):
            path_to_css_from_root = "css/" + \
                re.search("'(.*)'", str(css_obj.content)).group(1)
            css_obj = self.ebooklib_book.get_item_with_href(
                path_to_css_from_root)
            if not css_obj:
                raise AssertionError(
                    f"Css style {css_href} was not in manifest.")
        css_content: str = css_obj.get_content().decode()
        return css_content
    def process_inline_styles_in_html_soup(self):
        """
        Function converts the inline "style" attribute of tags in every html soup
        stored in self.html_href2html_body_soup

        Returns
        -------
        None
            "style" attribute of every matched tag is replaced with
            the result of self.css_processor.build_inline_style_content

        """
        for soup in self.html_href2html_body_soup.values():
            # only tags matched by the LiveCarta regexp that carry a "style" attribute
            styled_tags = soup.find_all(
                LiveCartaConfig.could_have_style_in_livecarta_regexp,
                attrs={"style": re.compile(".*")})
            for styled_tag in styled_tags:
                styled_tag.attrs["style"] = self.css_processor.build_inline_style_content(
                    styled_tag.attrs["style"])
    def build_html_and_css_relations(self) -> tuple[dict, dict]:
        """
        Function is designed to get 2 dictionaries:
@@ -174,7 +146,7 @@ class EpubConverter:
                if css_href not in css_href2css_content:
                    # css_href not in css_href2css_content, add to this dict
                    css_href2css_content[css_href] = self.css_processor.build_css_file_content(
-                        self.get_css_content(css_href, html_href))
+                        self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
            for i, tag in enumerate(soup_html_content.find_all("style")):
                css_content = tag.string
@@ -183,7 +155,8 @@ class EpubConverter:
                    css_content)
        return html_href2css_href, css_href2css_content
-    def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
+    @staticmethod
    def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
        """
        Function adds styles from .css to inline style.
        Parameters
@@ -224,7 +197,10 @@ class EpubConverter:
        """
        This function is designed to update html_href2html_body_soup
        - add to html_inline_style css_style_content
-
+        Returns
        -------
        None
            updated soups with styles from css
        """
        for html_href in self.html_href2html_body_soup:
            if self.html_href2css_href.get(html_href):
@@ -232,7 +208,8 @@ class EpubConverter:
                for css_href in self.html_href2css_href[html_href]:
                    css += self.css_href2css_content[css_href]
                html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
-                html_content = self.convert_html_soup_with_css_style(html_content, css)
+                html_content = self.modify_html_soup_with_css_styles(
                    html_content, css)
                self.html_href2html_body_soup[html_href] = html_content
    def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -259,7 +236,7 @@ class EpubConverter:
            nav_point = NavPoint(element)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
-                self.html_href2subchapter_ids[nav_point.href].append(
+                self.html_href2subchapters_ids[nav_point.href].append(
                    nav_point.id)
            self.adjacency_list[nav_point] = None
            self.hrefs_added_to_toc.add(nav_point.href)
@@ -271,7 +248,7 @@ class EpubConverter:
            nav_point = NavPoint(first)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
-                self.html_href2subchapter_ids[nav_point.href].append(
+                self.html_href2subchapters_ids[nav_point.href].append(
                    nav_point.id)
            sub_nodes = []
@@ -357,25 +334,19 @@ class EpubConverter:
        for html_href in self.html_href2html_body_soup:
            chapter_tag = self.html_href2html_body_soup[html_href]
            # check marks for chapter starting are on the same level - 1st
-            marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})
+            marks = chapter_tag.find_all(
                attrs={"class": "converter-chapter-mark"})
            # fix marks to be on 1 level
            for mark in marks:
                while mark.parent != chapter_tag:
-                    mark.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases
+                    # todo warning! could reflect on formatting/internal links in some cases
                    mark.parent.unwrap()
    @staticmethod
    def create_unique_id(href, id_):
        return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
    @staticmethod
    def create_new_anchor_span(soup, id_):
        new_anchor_span = soup.new_tag("span")
        new_anchor_span.attrs["id"] = id_
        new_anchor_span.attrs["class"] = "link-anchor"
        new_anchor_span.string = "\xa0"
        return new_anchor_span
    def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
        """
        Function used to find full path to file that is parsed from tag link
@@ -414,6 +385,14 @@ class EpubConverter:
        return full_path[0]
    @staticmethod
    def create_new_anchor_span(soup, id_):
        new_anchor_span = soup.new_tag("span")
        new_anchor_span.attrs["id"] = id_
        new_anchor_span.attrs["class"] = "link-anchor"
        new_anchor_span.string = "\xa0"
        return new_anchor_span
    def process_internal_links(self):
        """
        Function
@@ -520,8 +499,7 @@ class EpubConverter:
    @staticmethod
    def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
        """
-        After processing on a first_id that corresponds to current chapter,
+        Get tags between LiveCarta chapter marks
        from initial html_soup all tags from current chapter are extracted
        Parameters
        ----------
        first_id: str
@@ -553,7 +531,6 @@ class EpubConverter:
            # save them in list for next steps
            tags = [tag.extract() for tag in tags]
            html_soup.smooth()
        else:
            assert 0, f"Warning: no match for {first_id, href}"
@@ -594,7 +571,7 @@ class EpubConverter:
            for sub_node in self.adjacency_list[nav_point]:
                self.detect_one_chapter(sub_node)
-    def define_chapters_content(self):
+    def define_chapters_with_content(self):
        """Function build chapters content, starts from top level chapters"""
        top_level_nav_points = self.adjacency_list[-1]
        if self.id_anchor_exist_in_nav_points:
@@ -618,11 +595,9 @@ class EpubConverter:
        """
        title = nav_point.title
-        if nav_point.id:
+        content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
-            content: BeautifulSoup = self.href_chapter_id2soup_html[(
+            if nav_point.id else self.html_href2html_body_soup[nav_point.href]
-                nav_point.href, nav_point.id)]
+
        else:
            content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
        self.book_image_src_path2aws_path = update_images_src_links(content,
                                                                    self.img_href2img_bytes,
                                                                    path_to_html=nav_point.href,
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -1,7 +1,7 @@
 from src.book_solver import BookSolver
 from src.preset_processor import PresetProcessor
-from src.epub_converter.css_preprocessor import CSSPreprocessor
+from src.epub_converter.css_processor import CSSPreprocessor
-from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
+from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
 from src.epub_converter.epub_converter import EpubConverter
@@ -30,10 +30,10 @@ class EpubBook(BookSolver):
        """
        preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
            .get_preset_json()
-        css_preprocessor = CSSPreprocessor(logger=self.logger_object)
+        css_processor = CSSPreprocessor()
-        html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
+        html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
        json_converter = EpubConverter(
            self.file_path, access=self.access, logger=self.logger_object,
-            css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
+            css_processor=css_processor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
        return content_dict