rewrite css_processor.py

This commit is contained in:
Kiryl
2022-07-08 18:35:34 +03:00
parent c4752a19db
commit 1926377a34
3 changed files with 78 additions and 80 deletions

View File

@@ -1,14 +1,14 @@
import re
import cssutils
from bs4 import BeautifulSoup
from os.path import dirname, normpath, join
from src.util.helpers import BookLogger
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
class CSSPreprocessor:
def __init__(self, logger=None):
self.logger: BookLogger = logger
def __init__(self):
"""
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
@@ -99,12 +99,8 @@ class CSSPreprocessor:
size_value: str
"""
if len(size_value.split(" ")) == 3:
size_value = self.convert_tag_style_values(size_value.split(
" ")[-2], True) # returns middle value
else:
size_value = self.convert_tag_style_values(size_value.split(
" ")[-1], True) # returns last value
size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
else self.convert_tag_style_values(size_value.split(" ")[-1], True)
return size_value
@staticmethod
@@ -152,10 +148,37 @@ class CSSPreprocessor:
style = "; ".join(split_style)
return style
def process_inline_styles_in_html_soup(self, html_href2html_body_soup):
    """
    Convert the inline ``style`` attribute of tags in every given soup.

    Parameters
    ----------
    html_href2html_body_soup: dict
        html href -> soup of that html file

    Returns
    -------
    None
        the ``style`` attributes are rewritten in place
    """
    for soup in html_href2html_body_soup.values():
        # only tags accepted by the LiveCarta name filter that carry a style attribute
        styled_tags = soup.find_all(
            LiveCartaConfig.could_have_style_in_livecarta_regexp,
            attrs={"style": re.compile(".*")})
        for styled_tag in styled_tags:
            # replace the initial style string with its converted form
            styled_tag.attrs["style"] = self.build_inline_style_content(
                styled_tag.attrs["style"])
@staticmethod
def get_css_content(css_href, html_href, ebooklib_book):
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace("\\", "/")
css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
# if in css file we import another css
if "@import" in str(css_obj.content):
path_to_css_from_root = "css/" + \
re.search("'(.*)'", str(css_obj.content)).group(1)
css_obj = ebooklib_book.get_item_with_href(
path_to_css_from_root)
assert css_obj, f"Css style {css_href} was not in manifest."
css_content: str = css_obj.get_content().decode()
return css_content
def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
style_type: cssutils.css.property.Property):
if style_type.name == "font-family":
pass
if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = ""

View File

@@ -2,7 +2,6 @@ import re
import json
import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from premailer import transform
from collections import defaultdict
@@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
@@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso
class EpubConverter:
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None):
self.file_path = file_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor
self.html_preprocessor = html_processor
self.css_processor = css_processor
self.html_processor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
# enumerate all subchapter id for each file
self.html_href2subchapter_ids = defaultdict(list)
self.html_href2subchapters_ids = defaultdict(list)
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
@@ -71,17 +70,18 @@ class EpubConverter:
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
self.logger.log("Process CSS inline styles.")
self.process_inline_styles_in_html_soup()
self.logger.log("CSS inline style processing.")
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles adding.")
self.logger.log("CSS styles fusion(inline+file).")
self.add_css_styles_to_html_soup()
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
self.footnotes_contents, self.noterefs, self.footnotes =\
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
preprocess_footnotes(
self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
@@ -115,34 +115,6 @@ class EpubConverter:
nodes[item.file_name] = soup
return nodes
def get_css_content(self, css_href, html_href):
    """
    Read the text of a css file that is referenced from an html file.

    Parameters
    ----------
    css_href: str
        path to the css file as written in the html file
    html_href: str
        path to the html file that references the css file

    Returns
    -------
    css_content: str
        decoded content of the css item taken from self.ebooklib_book

    Raises
    ------
    AssertionError
        if the css item (or the css item it imports) is not found in the book
    """
    not_in_manifest = f"Css style {css_href} was not in manifest."
    # resolve the css path against the folder of the html file, always with "/" separators
    html_folder = dirname(html_href)
    path_to_css_from_root = normpath(
        join(html_folder, css_href)).replace("\\", "/")
    css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
    if not css_obj:
        # checked before touching css_obj.content, so a missing item reports the
        # intended message instead of an AttributeError on None; raised explicitly
        # so the check is not stripped when python runs with -O
        raise AssertionError(not_in_manifest)
    # if in css file we import another css
    if "@import" in str(css_obj.content):
        # NOTE(review): the imported name is taken from between single quotes and
        # is looked up under "css/" - confirm this holds for all books
        path_to_css_from_root = "css/" + \
            re.search("'(.*)'", str(css_obj.content)).group(1)
        css_obj = self.ebooklib_book.get_item_with_href(
            path_to_css_from_root)
        if not css_obj:
            raise AssertionError(not_in_manifest)
    css_content: str = css_obj.get_content().decode()
    return css_content
def process_inline_styles_in_html_soup(self):
    """
    Convert the inline ``style`` attribute of tags in every soup stored in
    self.html_href2html_body_soup.

    Returns
    -------
    None
        the ``style`` attributes are rewritten in place
    """
    convert_style = self.css_processor.build_inline_style_content
    for soup in self.html_href2html_body_soup.values():
        # only tags accepted by the LiveCarta name filter that carry a style attribute
        styled_tags = soup.find_all(
            LiveCartaConfig.could_have_style_in_livecarta_regexp,
            attrs={"style": re.compile(".*")})
        for styled_tag in styled_tags:
            # replace the initial style string with its converted form
            styled_tag.attrs["style"] = convert_style(styled_tag.attrs["style"])
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
Function is designed to get 2 dictionaries:
@@ -174,7 +146,7 @@ class EpubConverter:
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href))
self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
@@ -183,7 +155,8 @@ class EpubConverter:
css_content)
return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
@staticmethod
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
"""
Function adds styles from .css to inline style.
Parameters
@@ -224,7 +197,10 @@ class EpubConverter:
"""
This function is designed to update html_href2html_body_soup
- add to html_inline_style css_style_content
Returns
-------
None
updated soups with styles from css
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
@@ -232,7 +208,8 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = self.convert_html_soup_with_css_style(html_content, css)
html_content = self.modify_html_soup_with_css_styles(
html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -259,7 +236,7 @@ class EpubConverter:
nav_point = NavPoint(element)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append(
self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id)
self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href)
@@ -271,7 +248,7 @@ class EpubConverter:
nav_point = NavPoint(first)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append(
self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id)
sub_nodes = []
@@ -357,25 +334,19 @@ class EpubConverter:
for html_href in self.html_href2html_body_soup:
chapter_tag = self.html_href2html_body_soup[html_href]
# check marks for chapter starting are on the same level - 1st
marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})
marks = chapter_tag.find_all(
attrs={"class": "converter-chapter-mark"})
# fix marks to be on 1 level
for mark in marks:
while mark.parent != chapter_tag:
mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
# todo warning! could reflect on formatting/internal links in some cases
mark.parent.unwrap()
@staticmethod
def create_unique_id(href, id_):
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
@staticmethod
def create_new_anchor_span(soup, id_):
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
"""
Function used to find full path to file that is parsed from tag link
@@ -414,6 +385,14 @@ class EpubConverter:
return full_path[0]
@staticmethod
def create_new_anchor_span(soup, id_):
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
new_anchor_span.string = "\xa0"
return new_anchor_span
def process_internal_links(self):
"""
Function
@@ -520,8 +499,7 @@ class EpubConverter:
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
Get tags between LiveCarta chapter marks
Parameters
----------
first_id: str
@@ -553,7 +531,6 @@ class EpubConverter:
# save them in list for next steps
tags = [tag.extract() for tag in tags]
html_soup.smooth()
else:
assert 0, f"Warning: no match for {first_id, href}"
@@ -594,7 +571,7 @@ class EpubConverter:
for sub_node in self.adjacency_list[nav_point]:
self.detect_one_chapter(sub_node)
def define_chapters_content(self):
def define_chapters_with_content(self):
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
@@ -618,11 +595,9 @@ class EpubConverter:
"""
title = nav_point.title
if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)]
else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
self.book_image_src_path2aws_path = update_images_src_links(content,
self.img_href2img_bytes,
path_to_html=nav_point.href,

View File

@@ -1,7 +1,7 @@
from src.book_solver import BookSolver
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter
@@ -30,10 +30,10 @@ class EpubBook(BookSolver):
"""
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
css_processor = CSSPreprocessor()
html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
css_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
return content_dict