diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 29959c0..061eedb 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -7,7 +7,6 @@ from pathlib import Path from ebooklib import epub from ebooklib.epub import Link, Section from itertools import chain -from premailer import transform from collections import defaultdict from typing import List, Tuple, Dict, Union from bs4 import BeautifulSoup, Tag, NavigableString @@ -15,20 +14,21 @@ from bs4 import BeautifulSoup, Tag, NavigableString from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint -from src.style_preprocessor import CSSPreprocessor -from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor +from src.style_preprocessor import StylePreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubProcessor from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.footnotes_processing import preprocess_footnotes -from src.tag_inline_style_processor import TagInlineStyleProcessor +from src.tag_inline_style_processor import modify_html_soup_with_css_styles class EpubConverter: - def __init__(self, book_path, access=None, logger: BookLogger = None, css_processor: CSSPreprocessor = None, html_processor: HtmlEpubPreprocessor = None): + def __init__(self, book_path, access=None, logger: BookLogger = None, + style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None): self.book_path = book_path self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(book_path) - self.css_processor = css_processor + self.style_processor = style_processor self.html_processor = html_processor # main container for all epub .xhtml files @@ -71,8 +71,8 @@ class EpubConverter: BeautifulSoup] = self.build_href2soup_content() self.logger.log("CSS inline style processing.") - self.css_processor.process_inline_styles_in_html_soup( - self.html_href2html_body_soup) + [self.style_processor.process_inline_styles_in_html_soup( + self.html_href2html_body_soup[html_href]) for html_href in self.html_href2html_body_soup] self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() self.logger.log("CSS styles fusion(inline+file).") @@ -147,54 +147,16 @@ class EpubConverter: html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict - css_href2css_content[css_href] = self.css_processor.build_css_file_content( - self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book)) + css_href2css_content[css_href] = self.style_processor.build_css_file_content( + self.style_processor.get_css_content(css_href, html_href, self.ebooklib_book)) for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string html_href2css_href[html_href].append(f"href{i}") - css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content( + css_href2css_content[f"href{i}"] = self.style_processor.build_css_file_content( css_content) return html_href2css_href, css_href2css_content - @staticmethod - def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: - """ - Function adds styles from .css to inline style. - Parameters - ---------- - html_soup: BeautifulSoup - html page with inline style - css_text: str - css content from css file - Returns - ------- - inline_soup: BeautifulSoup - soup with styles from css - - """ - # remove this specification because it causes problems - css_text = css_text.replace( - '@namespace epub "http://www.idpf.org/2007/ops";', '') - # here we add css styles to inline style - html_with_css_styles: str = transform(str(html_soup), css_text=css_text, - remove_classes=False, - external_styles=False, - allow_network=False, - disable_validation=True, - ) - # soup with converted styles from css - inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") - - tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={"style": re.compile(".*")}) - - # go through the tags with inline style + style parsed from css file - for tag_inline_style in tags_with_inline_style: - style_converter = TagInlineStyleProcessor(tag_inline_style) - style_converter.convert_initial_tag() - return inline_soup - def add_css_styles_to_html_soup(self): """ This function is designed to update html_href2html_body_soup @@ -210,7 +172,7 @@ class EpubConverter: for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - html_content = self.modify_html_soup_with_css_styles( + html_content = modify_html_soup_with_css_styles( html_content, css) self.html_href2html_body_soup[html_href] = html_content @@ -646,15 +608,16 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../books/epub/9781119646044.epub" + epub_file_path = "../../books/epub/9780763774134.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) - css_processor = CSSPreprocessor() - html_processor = HtmlEpubPreprocessor(logger=logger_object) + css_processor = StylePreprocessor() + html_processor = HtmlEpubProcessor( + "../../presets/presets.json", logger=logger_object) json_converter = EpubConverter(epub_file_path, logger=logger_object, - css_processor=css_processor, html_processor=html_processor) + style_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index ceae0fc..754b361 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,6 +1,6 @@ from src.book_solver import BookSolver -from src.style_preprocessor import CSSPreprocessor -from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor +from src.style_preprocessor import StylePreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubProcessor from src.epub_converter.epub_converter import EpubConverter @@ -25,11 +25,11 @@ class EpubBook(BookSolver): json for LiveCarta platform """ - css_processor = CSSPreprocessor() - html_processor = HtmlEpubPreprocessor( - self.preset_path, logger=self.logger_object) + style_processor = StylePreprocessor() + html_processor = HtmlEpubProcessor( + logger=self.logger_object) json_converter = EpubConverter( self.book_path, access=self.access, logger=self.logger_object, - css_processor=css_processor, html_processor=html_processor) + style_processor=style_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() return content_dict diff --git a/src/style_preprocessor.py b/src/style_preprocessor.py index 5335ecd..9bdbe2b 100644 --- a/src/style_preprocessor.py +++ b/src/style_preprocessor.py @@ -8,7 +8,7 @@ from src.util.color_reader import str2hex from src.livecarta_config import LiveCartaConfig -class CSSPreprocessor: +class StylePreprocessor: def __init__(self): """ Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } @@ -160,17 +160,15 @@ class CSSPreprocessor: style = "; ".join(split_style) return style - def process_inline_styles_in_html_soup(self, html_href2html_body_soup: Dict[str, BeautifulSoup]): + def process_inline_styles_in_html_soup(self, html_content): """This function is designed to convert inline html styles""" - for html_href in html_href2html_body_soup: - html_content: BeautifulSoup = html_href2html_body_soup[html_href] - tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={"style": re.compile(".*")}) + tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) - for tag_initial_inline_style in tags_with_inline_style: - inline_style = tag_initial_inline_style.attrs["style"] - tag_initial_inline_style.attrs["style"] = \ - self.build_inline_style_content(inline_style) + for tag_initial_inline_style in tags_with_inline_style: + inline_style = tag_initial_inline_style.attrs["style"] + tag_initial_inline_style.attrs["style"] = \ + self.build_inline_style_content(inline_style) @staticmethod def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str: diff --git a/src/tag_inline_style_processor.py b/src/tag_inline_style_processor.py index c2f94df..42ed0d4 100644 --- a/src/tag_inline_style_processor.py +++ b/src/tag_inline_style_processor.py @@ -2,6 +2,7 @@ import re import cssutils from typing import List from logging import CRITICAL +from premailer import transform from bs4 import BeautifulSoup, Tag from src.livecarta_config import LiveCartaConfig @@ -215,3 +216,41 @@ class TagInlineStyleProcessor: self.change_attrs_with_corresponding_tags() self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) return self.tag_inline_style + + +def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "") -> BeautifulSoup: + """ + Function adds styles from .css to inline style. + Parameters + ---------- + html_soup: BeautifulSoup + html page with inline style + css_text: str + css content from css file + Returns + ------- + inline_soup: BeautifulSoup + soup with styles from css + + """ + # remove this specification because it causes problems + css_text = css_text.replace( + '@namespace epub "http://www.idpf.org/2007/ops";', '') + # here we add css styles to inline style + html_with_css_styles: str = transform(str(html_soup), css_text=css_text, + remove_classes=False, + external_styles=False, + allow_network=False, + disable_validation=True, + ) + # soup with converted styles from css + inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") + + tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + # go through the tags with inline style + style parsed from css file + for tag_inline_style in tags_with_inline_style: + style_converter = TagInlineStyleProcessor(tag_inline_style) + style_converter.convert_initial_tag() + return inline_soup