diff --git a/src/epub_converter/css_preprocessor.py b/src/epub_converter/css_processor.py similarity index 79% rename from src/epub_converter/css_preprocessor.py rename to src/epub_converter/css_processor.py index 57c0388..0caad25 100644 --- a/src/epub_converter/css_preprocessor.py +++ b/src/epub_converter/css_processor.py @@ -1,14 +1,14 @@ import re import cssutils +from bs4 import BeautifulSoup +from os.path import dirname, normpath, join -from src.util.helpers import BookLogger from src.util.color_reader import str2hex from src.livecarta_config import LiveCartaConfig class CSSPreprocessor: - def __init__(self, logger=None): - self.logger: BookLogger = logger + def __init__(self): """ Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } @@ -99,12 +99,8 @@ class CSSPreprocessor: size_value: str """ - if len(size_value.split(" ")) == 3: - size_value = self.convert_tag_style_values(size_value.split( - " ")[-2], True) # returns middle value - else: - size_value = self.convert_tag_style_values(size_value.split( - " ")[-1], True) # returns last value + size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\ + else self.convert_tag_style_values(size_value.split(" ")[-1], True) return size_value @staticmethod @@ -152,10 +148,37 @@ class CSSPreprocessor: style = "; ".join(split_style) return style + def process_inline_styles_in_html_soup(self, html_href2html_body_soup): + """This function is designed to convert inline html styles""" + for html_href in html_href2html_body_soup: + html_content: BeautifulSoup = html_href2html_body_soup[html_href] + tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + for tag_initial_inline_style in tags_with_inline_style: + inline_style = tag_initial_inline_style.attrs["style"] + tag_initial_inline_style.attrs["style"] = \ + self.build_inline_style_content(inline_style) + + @staticmethod + def get_css_content(css_href, html_href, ebooklib_book): + path_to_css_from_html = css_href + html_folder = dirname(html_href) + path_to_css_from_root = normpath( + join(html_folder, path_to_css_from_html)).replace("\\", "/") + css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root) + # if in css file we import another css + if "@import" in str(css_obj.content): + path_to_css_from_root = "css/" + \ + re.search("'(.*)'", str(css_obj.content)).group(1) + css_obj = ebooklib_book.get_item_with_href( + path_to_css_from_root) + assert css_obj, f"Css style {css_href} was not in manifest." + css_content: str = css_obj.get_content().decode() + return css_content + def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, style_type: cssutils.css.property.Property): - if style_type.name == "font-family": - pass if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = "" diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 525fad3..a301b5b 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -2,7 +2,6 @@ import re import json import codecs import os -from os.path import dirname, normpath, join from itertools import chain from premailer import transform from collections import defaultdict @@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger from src.preset_processor import PresetProcessor -from src.epub_converter.css_preprocessor import CSSPreprocessor -from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor +from src.epub_converter.css_processor import CSSPreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.image_processing import update_images_src_links @@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso class EpubConverter: - def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None): + def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None): self.file_path = file_path self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file_path) - self.css_processor = css_preprocessor - self.html_preprocessor = html_processor + self.css_processor = css_processor + self.html_processor = html_processor # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # enumerate all subchapter id for each file - self.html_href2subchapter_ids = defaultdict(list) + self.html_href2subchapters_ids = defaultdict(list) self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC # toc tree structure stored as adj.list (NavPoint to list of NavPoints) @@ -71,17 +70,18 @@ class EpubConverter: self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() - self.logger.log("Process CSS inline styles.") - self.process_inline_styles_in_html_soup() + self.logger.log("CSS inline style processing.") + self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup) self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log("CSS styles adding.") + self.logger.log("CSS styles fusion(inline+file).") self.add_css_styles_to_html_soup() self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: self.footnotes_contents, self.noterefs, self.footnotes =\ - preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) + preprocess_footnotes( + self.html_href2html_body_soup[href], self.html_href2html_body_soup) self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") self.logger.log("TOC processing.") @@ -115,34 +115,6 @@ class EpubConverter: nodes[item.file_name] = soup return nodes - def get_css_content(self, css_href, html_href): - path_to_css_from_html = css_href - html_folder = dirname(html_href) - path_to_css_from_root = normpath( - join(html_folder, path_to_css_from_html)).replace("\\", "/") - css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) - # if in css file we import another css - if "@import" in str(css_obj.content): - path_to_css_from_root = "css/" + \ - re.search("'(.*)'", str(css_obj.content)).group(1) - css_obj = self.ebooklib_book.get_item_with_href( - path_to_css_from_root) - assert css_obj, f"Css style {css_href} was not in manifest." - css_content: str = css_obj.get_content().decode() - return css_content - - def process_inline_styles_in_html_soup(self): - """This function is designed to convert inline html styles""" - for html_href in self.html_href2html_body_soup: - html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={"style": re.compile(".*")}) - - for tag_initial_inline_style in tags_with_inline_style: - inline_style = tag_initial_inline_style.attrs["style"] - tag_initial_inline_style.attrs["style"] = \ - self.css_processor.build_inline_style_content(inline_style) - def build_html_and_css_relations(self) -> tuple[dict, dict]: """ Function is designed to get 2 dictionaries: @@ -174,7 +146,7 @@ class EpubConverter: if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict css_href2css_content[css_href] = self.css_processor.build_css_file_content( - self.get_css_content(css_href, html_href)) + self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book)) for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string @@ -183,7 +155,8 @@ class EpubConverter: css_content) return html_href2css_href, css_href2css_content - def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: + @staticmethod + def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: """ Function adds styles from .css to inline style. Parameters @@ -224,7 +197,10 @@ class EpubConverter: """ This function is designed to update html_href2html_body_soup - add to html_inline_style css_style_content - + Returns + ------- + None + updated soups with styles from css """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): @@ -232,7 +208,8 @@ class EpubConverter: for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - html_content = self.convert_html_soup_with_css_style(html_content, css) + html_content = self.modify_html_soup_with_css_styles( + html_content, css) self.html_href2html_body_soup[html_href] = html_content def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): @@ -259,7 +236,7 @@ class EpubConverter: nav_point = NavPoint(element) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.html_href2subchapter_ids[nav_point.href].append( + self.html_href2subchapters_ids[nav_point.href].append( nav_point.id) self.adjacency_list[nav_point] = None self.hrefs_added_to_toc.add(nav_point.href) @@ -271,7 +248,7 @@ class EpubConverter: nav_point = NavPoint(first) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.html_href2subchapter_ids[nav_point.href].append( + self.html_href2subchapters_ids[nav_point.href].append( nav_point.id) sub_nodes = [] @@ -357,25 +334,19 @@ class EpubConverter: for html_href in self.html_href2html_body_soup: chapter_tag = self.html_href2html_body_soup[html_href] # check marks for chapter starting are on the same level - 1st - marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) + marks = chapter_tag.find_all( + attrs={"class": "converter-chapter-mark"}) # fix marks to be on 1 level for mark in marks: while mark.parent != chapter_tag: - mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases + # todo warning! could reflect on formatting/internal links in some cases + mark.parent.unwrap() @staticmethod def create_unique_id(href, id_): return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_) - @staticmethod - def create_new_anchor_span(soup, id_): - new_anchor_span = soup.new_tag("span") - new_anchor_span.attrs["id"] = id_ - new_anchor_span.attrs["class"] = "link-anchor" - new_anchor_span.string = "\xa0" - return new_anchor_span - def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]: """ Function used to find full path to file that is parsed from tag link @@ -414,6 +385,14 @@ class EpubConverter: return full_path[0] + @staticmethod + def create_new_anchor_span(soup, id_): + new_anchor_span = soup.new_tag("span") + new_anchor_span.attrs["id"] = id_ + new_anchor_span.attrs["class"] = "link-anchor" + new_anchor_span.string = "\xa0" + return new_anchor_span + def process_internal_links(self): """ Function @@ -520,8 +499,7 @@ class EpubConverter: @staticmethod def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: """ - After processing on a first_id that corresponds to current chapter, - from initial html_soup all tags from current chapter are extracted + Get tags between LiveCarta chapter marks Parameters ---------- first_id: str @@ -553,7 +531,6 @@ class EpubConverter: # save them in list for next steps tags = [tag.extract() for tag in tags] html_soup.smooth() - else: assert 0, f"Warning: no match for {first_id, href}" @@ -594,7 +571,7 @@ class EpubConverter: for sub_node in self.adjacency_list[nav_point]: self.detect_one_chapter(sub_node) - def define_chapters_content(self): + def define_chapters_with_content(self): """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: @@ -618,11 +595,9 @@ class EpubConverter: """ title = nav_point.title - if nav_point.id: - content: BeautifulSoup = self.href_chapter_id2soup_html[( - nav_point.href, nav_point.id)] - else: - content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] + content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \ + if nav_point.id else self.html_href2html_body_soup[nav_point.href] + self.book_image_src_path2aws_path = update_images_src_links(content, self.img_href2img_bytes, path_to_html=nav_point.href, diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index c1bb800..e0cfef6 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,7 +1,7 @@ from src.book_solver import BookSolver from src.preset_processor import PresetProcessor -from src.epub_converter.css_preprocessor import CSSPreprocessor -from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor +from src.epub_converter.css_processor import CSSPreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.epub_converter.epub_converter import EpubConverter @@ -30,10 +30,10 @@ class EpubBook(BookSolver): """ preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\ .get_preset_json() - css_preprocessor = CSSPreprocessor(logger=self.logger_object) - html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) + css_processor = CSSPreprocessor() + html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) json_converter = EpubConverter( self.file_path, access=self.access, logger=self.logger_object, - css_preprocessor=css_preprocessor, html_processor=html_preprocessor) + css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() return content_dict