rewrite css_processor.py

This commit is contained in:
Kiryl
2022-07-08 18:35:34 +03:00
parent c4752a19db
commit 1926377a34
3 changed files with 78 additions and 80 deletions

View File

@@ -1,14 +1,14 @@
import re import re
import cssutils import cssutils
from bs4 import BeautifulSoup
from os.path import dirname, normpath, join
from src.util.helpers import BookLogger
from src.util.color_reader import str2hex from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
class CSSPreprocessor: class CSSPreprocessor:
def __init__(self, logger=None): def __init__(self):
self.logger: BookLogger = logger
""" """
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
@@ -99,12 +99,8 @@ class CSSPreprocessor:
size_value: str size_value: str
""" """
if len(size_value.split(" ")) == 3: size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
size_value = self.convert_tag_style_values(size_value.split( else self.convert_tag_style_values(size_value.split(" ")[-1], True)
" ")[-2], True) # returns middle value
else:
size_value = self.convert_tag_style_values(size_value.split(
" ")[-1], True) # returns last value
return size_value return size_value
@staticmethod @staticmethod
@@ -152,10 +148,37 @@ class CSSPreprocessor:
style = "; ".join(split_style) style = "; ".join(split_style)
return style return style
def process_inline_styles_in_html_soup(self, html_href2html_body_soup):
    """
    Function converts the inline style of every styled tag
    Parameters
    ----------
    html_href2html_body_soup: dict
        mapping { html_href: soup of this html }, soups are modified in place

    Returns
    -------
    None
        "style" attribute of every matched tag is replaced by the
        result of self.build_inline_style_content

    """
    # the pattern accepts any value, so it only selects tags that carry
    # a style attribute; compile it once instead of once per html file
    any_style_value = re.compile(".*")
    for html_content in html_href2html_body_soup.values():
        tags_with_inline_style = html_content.find_all(
            LiveCartaConfig.could_have_style_in_livecarta_regexp,
            attrs={"style": any_style_value})
        for tag in tags_with_inline_style:
            tag.attrs["style"] = self.build_inline_style_content(
                tag.attrs["style"])
@staticmethod
def get_css_content(css_href, html_href, ebooklib_book):
    """
    Function reads the text of a css file that is linked from an html file
    Parameters
    ----------
    css_href: str
        path to the css file as it is written in the html file
    html_href: str
        path to the html file that refers to the css file
    ebooklib_book
        book object, items are looked up with get_item_with_href

    Returns
    -------
    css_content: str
        decoded content of the css file; when that file has a parsable
        @import, content of the imported file ("css/" + imported name)

    Raises
    ------
    AssertionError
        css file (or the file it imports) is not in the manifest

    """
    html_folder = dirname(html_href)
    path_to_css_from_root = normpath(
        join(html_folder, css_href)).replace("\\", "/")
    css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
    if css_obj:
        # match on the text itself: str() of bytes is their repr, its
        # outer quotes and escapes would get mixed into the match
        raw_css = css_obj.content
        css_text = raw_css.decode(errors="replace") \
            if isinstance(raw_css, bytes) else str(raw_css)
        # if in css file we import another css
        imported = re.search(
            r"""@import\s+(?:url\(\s*)?["']?([^"')\s;]+)""", css_text)
        if imported:
            css_obj = ebooklib_book.get_item_with_href(
                "css/" + imported.group(1))
    if not css_obj:
        # raised explicitly so that the check also works under python -O
        raise AssertionError(f"Css style {css_href} was not in manifest.")
    css_content: str = css_obj.get_content().decode()
    return css_content
def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
style_type: cssutils.css.property.Property): style_type: cssutils.css.property.Property):
if style_type.name == "font-family":
pass
if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file # property not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = "" css_rule.style[style_type.name] = ""

View File

@@ -2,7 +2,6 @@ import re
import json import json
import codecs import codecs
import os import os
from os.path import dirname, normpath, join
from itertools import chain from itertools import chain
from premailer import transform from premailer import transform
from collections import defaultdict from collections import defaultdict
@@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.image_processing import update_images_src_links
@@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso
class EpubConverter: class EpubConverter:
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None): def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None):
self.file_path = file_path self.file_path = file_path
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path) self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor self.css_processor = css_processor
self.html_preprocessor = html_processor self.html_processor = html_processor
# main container for all epub .xhtml files # main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
# enumerate all subchapter id for each file # enumerate all subchapter id for each file
self.html_href2subchapter_ids = defaultdict(list) self.html_href2subchapters_ids = defaultdict(list)
self.hrefs_added_to_toc = set() # enumerate all file paths that were added to TOC self.hrefs_added_to_toc = set() # enumerate all file paths that were added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints) # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
@@ -71,17 +70,18 @@ class EpubConverter:
self.html_href2html_body_soup: Dict[str, self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content() BeautifulSoup] = self.build_href2soup_content()
self.logger.log("Process CSS inline styles.") self.logger.log("CSS inline style processing.")
self.process_inline_styles_in_html_soup() self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
self.logger.log("CSS files processing.") self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles adding.") self.logger.log("CSS styles fusion(inline+file).")
self.add_css_styles_to_html_soup() self.add_css_styles_to_html_soup()
self.logger.log("Footnotes processing.") self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup: for href in self.html_href2html_body_soup:
self.footnotes_contents, self.noterefs, self.footnotes =\ self.footnotes_contents, self.noterefs, self.footnotes =\
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) preprocess_footnotes(
self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.") self.logger.log("TOC processing.")
@@ -115,34 +115,6 @@ class EpubConverter:
nodes[item.file_name] = soup nodes[item.file_name] = soup
return nodes return nodes
def get_css_content(self, css_href, html_href):
    """
    Function reads the text of a css file that is linked from an html file
    Parameters
    ----------
    css_href: str
        path to the css file as it is written in the html file
    html_href: str
        path to the html file that refers to the css file

    Returns
    -------
    css_content: str
        decoded content of the css file; when that file has a parsable
        @import, content of the imported file ("css/" + imported name)

    Raises
    ------
    AssertionError
        css file (or the file it imports) is not in the manifest

    """
    html_folder = dirname(html_href)
    path_to_css_from_root = normpath(
        join(html_folder, css_href)).replace("\\", "/")
    css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
    if css_obj:
        # match on the text itself: str() of bytes is their repr, its
        # outer quotes and escapes would get mixed into the match
        raw_css = css_obj.content
        css_text = raw_css.decode(errors="replace") \
            if isinstance(raw_css, bytes) else str(raw_css)
        # if in css file we import another css
        imported = re.search(
            r"""@import\s+(?:url\(\s*)?["']?([^"')\s;]+)""", css_text)
        if imported:
            css_obj = self.ebooklib_book.get_item_with_href(
                "css/" + imported.group(1))
    if not css_obj:
        # raised explicitly so that the check also works under python -O
        raise AssertionError(f"Css style {css_href} was not in manifest.")
    css_content: str = css_obj.get_content().decode()
    return css_content
def process_inline_styles_in_html_soup(self):
    """
    Function converts the inline style of every styled tag
    Returns
    -------
    None
        soups in self.html_href2html_body_soup are modified in place:
        "style" attribute of every matched tag is replaced by the result
        of self.css_processor.build_inline_style_content

    """
    # the pattern accepts any value, so it only selects tags that carry
    # a style attribute; compile it once instead of once per html file
    any_style_value = re.compile(".*")
    for html_content in self.html_href2html_body_soup.values():
        tags_with_inline_style = html_content.find_all(
            LiveCartaConfig.could_have_style_in_livecarta_regexp,
            attrs={"style": any_style_value})
        for tag in tags_with_inline_style:
            tag.attrs["style"] = self.css_processor.build_inline_style_content(
                tag.attrs["style"])
def build_html_and_css_relations(self) -> tuple[dict, dict]: def build_html_and_css_relations(self) -> tuple[dict, dict]:
""" """
Function is designed to get 2 dictionaries: Function is designed to get 2 dictionaries:
@@ -174,7 +146,7 @@ class EpubConverter:
if css_href not in css_href2css_content: if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict # css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = self.css_processor.build_css_file_content( css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href)) self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
for i, tag in enumerate(soup_html_content.find_all("style")): for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string css_content = tag.string
@@ -183,7 +155,8 @@ class EpubConverter:
css_content) css_content)
return html_href2css_href, css_href2css_content return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: @staticmethod
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
""" """
Function adds styles from .css to inline style. Function adds styles from .css to inline style.
Parameters Parameters
@@ -224,7 +197,10 @@ class EpubConverter:
""" """
This function is designed to update html_href2html_body_soup This function is designed to update html_href2html_body_soup
- add to html_inline_style css_style_content - add to html_inline_style css_style_content
Returns
-------
None
updated soups with styles from css
""" """
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href): if self.html_href2css_href.get(html_href):
@@ -232,7 +208,8 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]: for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href] css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = self.convert_html_soup_with_css_style(html_content, css) html_content = self.modify_html_soup_with_css_styles(
html_content, css)
self.html_href2html_body_soup[html_href] = html_content self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -259,7 +236,7 @@ class EpubConverter:
nav_point = NavPoint(element) nav_point = NavPoint(element)
if nav_point.id: if nav_point.id:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append( self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id) nav_point.id)
self.adjacency_list[nav_point] = None self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
@@ -271,7 +248,7 @@ class EpubConverter:
nav_point = NavPoint(first) nav_point = NavPoint(first)
if nav_point.id: if nav_point.id:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append( self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id) nav_point.id)
sub_nodes = [] sub_nodes = []
@@ -357,25 +334,19 @@ class EpubConverter:
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
chapter_tag = self.html_href2html_body_soup[html_href] chapter_tag = self.html_href2html_body_soup[html_href]
# check marks for chapter starting are on the same level - 1st # check marks for chapter starting are on the same level - 1st
marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) marks = chapter_tag.find_all(
attrs={"class": "converter-chapter-mark"})
# fix marks to be on 1 level # fix marks to be on 1 level
for mark in marks: for mark in marks:
while mark.parent != chapter_tag: while mark.parent != chapter_tag:
mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases # todo warning! could reflect on formatting/internal links in some cases
mark.parent.unwrap()
@staticmethod
def create_unique_id(href, id_):
    """
    Function builds one identifier out of a file path and an id in this file
    Parameters
    ----------
    href: str
        path to the file, punctuation and underscores are dropped from it
    id_: str
        id inside the file, every "_" and "-" in it is replaced by "0"

    Returns
    -------
    str
        cleaned href followed by the rewritten id

    """
    # "-" is already matched by [^\w\s]; "_" is a word character,
    # so it is the only one that has to be listed separately
    href_part = re.sub(r"[^\w\s]|_", "", href)
    id_part = re.sub(r"[_-]", "0", id_)
    return href_part + id_part
@staticmethod
def create_new_anchor_span(soup, id_):
    """
    Function creates a span tag that is used as a target of a link
    Parameters
    ----------
    soup
        object that creates tags with new_tag, the span is not inserted into it
    id_: str
        value of the id attribute of the span

    Returns
    -------
    new_anchor_span
        span tag with the given id, class "link-anchor" and
        a single non-breaking space as its string

    """
    new_anchor_span = soup.new_tag("span")
    new_anchor_span.attrs["id"] = id_
    new_anchor_span.attrs["class"] = "link-anchor"
    new_anchor_span.string = "\xa0"
    return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]: def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
""" """
Function used to find full path to file that is parsed from tag link Function used to find full path to file that is parsed from tag link
@@ -414,6 +385,14 @@ class EpubConverter:
return full_path[0] return full_path[0]
@staticmethod
def create_new_anchor_span(soup, id_):
    """
    Function creates a span tag that is used as a target of a link
    Parameters
    ----------
    soup
        object that creates tags with new_tag, the span is not inserted into it
    id_: str
        value of the id attribute of the span

    Returns
    -------
    new_anchor_span
        span tag with the given id, class "link-anchor" and
        a single non-breaking space as its string

    """
    new_anchor_span = soup.new_tag("span")
    new_anchor_span.attrs["id"] = id_
    new_anchor_span.attrs["class"] = "link-anchor"
    new_anchor_span.string = "\xa0"
    return new_anchor_span
def process_internal_links(self): def process_internal_links(self):
""" """
Function Function
@@ -520,8 +499,7 @@ class EpubConverter:
@staticmethod @staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
""" """
After processing on a first_id that corresponds to current chapter, Get tags between LiveCarta chapter marks
from initial html_soup all tags from current chapter are extracted
Parameters Parameters
---------- ----------
first_id: str first_id: str
@@ -553,7 +531,6 @@ class EpubConverter:
# save them in list for next steps # save them in list for next steps
tags = [tag.extract() for tag in tags] tags = [tag.extract() for tag in tags]
html_soup.smooth() html_soup.smooth()
else: else:
assert 0, f"Warning: no match for {first_id, href}" assert 0, f"Warning: no match for {first_id, href}"
@@ -594,7 +571,7 @@ class EpubConverter:
for sub_node in self.adjacency_list[nav_point]: for sub_node in self.adjacency_list[nav_point]:
self.detect_one_chapter(sub_node) self.detect_one_chapter(sub_node)
def define_chapters_content(self): def define_chapters_with_content(self):
"""Function build chapters content, starts from top level chapters""" """Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
@@ -618,11 +595,9 @@ class EpubConverter:
""" """
title = nav_point.title title = nav_point.title
if nav_point.id: content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
content: BeautifulSoup = self.href_chapter_id2soup_html[( if nav_point.id else self.html_href2html_body_soup[nav_point.href]
nav_point.href, nav_point.id)]
else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.book_image_src_path2aws_path = update_images_src_links(content, self.book_image_src_path2aws_path = update_images_src_links(content,
self.img_href2img_bytes, self.img_href2img_bytes,
path_to_html=nav_point.href, path_to_html=nav_point.href,

View File

@@ -1,7 +1,7 @@
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.preset_processor import PresetProcessor from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
@@ -30,10 +30,10 @@ class EpubBook(BookSolver):
""" """
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\ preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
.get_preset_json() .get_preset_json()
css_preprocessor = CSSPreprocessor(logger=self.logger_object) css_processor = CSSPreprocessor()
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
json_converter = EpubConverter( json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object, self.file_path, access=self.access, logger=self.logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor) css_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
return content_dict return content_dict