forked from LiveCarta/BookConverter
rewrite css_processor.py
This commit is contained in:
@@ -1,14 +1,14 @@
|
||||
import re
|
||||
import cssutils
|
||||
from bs4 import BeautifulSoup
|
||||
from os.path import dirname, normpath, join
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.util.color_reader import str2hex
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
class CSSPreprocessor:
|
||||
def __init__(self, logger=None):
|
||||
self.logger: BookLogger = logger
|
||||
def __init__(self):
|
||||
"""
|
||||
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||
|
||||
@@ -99,12 +99,8 @@ class CSSPreprocessor:
|
||||
size_value: str
|
||||
|
||||
"""
|
||||
if len(size_value.split(" ")) == 3:
|
||||
size_value = self.convert_tag_style_values(size_value.split(
|
||||
" ")[-2], True) # returns middle value
|
||||
else:
|
||||
size_value = self.convert_tag_style_values(size_value.split(
|
||||
" ")[-1], True) # returns last value
|
||||
size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
|
||||
else self.convert_tag_style_values(size_value.split(" ")[-1], True)
|
||||
return size_value
|
||||
|
||||
@staticmethod
|
||||
@@ -152,10 +148,37 @@ class CSSPreprocessor:
|
||||
style = "; ".join(split_style)
|
||||
return style
|
||||
|
||||
def process_inline_styles_in_html_soup(self, html_href2html_body_soup):
|
||||
"""This function is designed to convert inline html styles"""
|
||||
for html_href in html_href2html_body_soup:
|
||||
html_content: BeautifulSoup = html_href2html_body_soup[html_href]
|
||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
inline_style = tag_initial_inline_style.attrs["style"]
|
||||
tag_initial_inline_style.attrs["style"] = \
|
||||
self.build_inline_style_content(inline_style)
|
||||
|
||||
@staticmethod
|
||||
def get_css_content(css_href, html_href, ebooklib_book):
|
||||
path_to_css_from_html = css_href
|
||||
html_folder = dirname(html_href)
|
||||
path_to_css_from_root = normpath(
|
||||
join(html_folder, path_to_css_from_html)).replace("\\", "/")
|
||||
css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
|
||||
# if in css file we import another css
|
||||
if "@import" in str(css_obj.content):
|
||||
path_to_css_from_root = "css/" + \
|
||||
re.search("'(.*)'", str(css_obj.content)).group(1)
|
||||
css_obj = ebooklib_book.get_item_with_href(
|
||||
path_to_css_from_root)
|
||||
assert css_obj, f"Css style {css_href} was not in manifest."
|
||||
css_content: str = css_obj.get_content().decode()
|
||||
return css_content
|
||||
|
||||
def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
|
||||
style_type: cssutils.css.property.Property):
|
||||
if style_type.name == "font-family":
|
||||
pass
|
||||
if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
|
||||
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
|
||||
css_rule.style[style_type.name] = ""
|
||||
@@ -2,7 +2,6 @@ import re
|
||||
import json
|
||||
import codecs
|
||||
import os
|
||||
from os.path import dirname, normpath, join
|
||||
from itertools import chain
|
||||
from premailer import transform
|
||||
from collections import defaultdict
|
||||
@@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.preset_processor import PresetProcessor
|
||||
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||
from src.epub_converter.css_processor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.epub_converter.image_processing import update_images_src_links
|
||||
@@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso
|
||||
|
||||
|
||||
class EpubConverter:
|
||||
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
|
||||
def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None):
|
||||
self.file_path = file_path
|
||||
self.access = access
|
||||
self.logger: BookLogger = logger
|
||||
self.ebooklib_book = epub.read_epub(file_path)
|
||||
self.css_processor = css_preprocessor
|
||||
self.html_preprocessor = html_processor
|
||||
self.css_processor = css_processor
|
||||
self.html_processor = html_processor
|
||||
|
||||
# main container for all epub .xhtml files
|
||||
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
||||
# enumerate all subchapter id for each file
|
||||
self.html_href2subchapter_ids = defaultdict(list)
|
||||
self.html_href2subchapters_ids = defaultdict(list)
|
||||
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
|
||||
|
||||
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||
@@ -71,17 +70,18 @@ class EpubConverter:
|
||||
self.html_href2html_body_soup: Dict[str,
|
||||
BeautifulSoup] = self.build_href2soup_content()
|
||||
|
||||
self.logger.log("Process CSS inline styles.")
|
||||
self.process_inline_styles_in_html_soup()
|
||||
self.logger.log("CSS inline style processing.")
|
||||
self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
|
||||
self.logger.log("CSS files processing.")
|
||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||
self.logger.log("CSS styles adding.")
|
||||
self.logger.log("CSS styles fusion(inline+file).")
|
||||
self.add_css_styles_to_html_soup()
|
||||
|
||||
self.logger.log("Footnotes processing.")
|
||||
for href in self.html_href2html_body_soup:
|
||||
self.footnotes_contents, self.noterefs, self.footnotes =\
|
||||
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
|
||||
preprocess_footnotes(
|
||||
self.html_href2html_body_soup[href], self.html_href2html_body_soup)
|
||||
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
|
||||
|
||||
self.logger.log("TOC processing.")
|
||||
@@ -115,34 +115,6 @@ class EpubConverter:
|
||||
nodes[item.file_name] = soup
|
||||
return nodes
|
||||
|
||||
def get_css_content(self, css_href, html_href):
|
||||
path_to_css_from_html = css_href
|
||||
html_folder = dirname(html_href)
|
||||
path_to_css_from_root = normpath(
|
||||
join(html_folder, path_to_css_from_html)).replace("\\", "/")
|
||||
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
||||
# if in css file we import another css
|
||||
if "@import" in str(css_obj.content):
|
||||
path_to_css_from_root = "css/" + \
|
||||
re.search("'(.*)'", str(css_obj.content)).group(1)
|
||||
css_obj = self.ebooklib_book.get_item_with_href(
|
||||
path_to_css_from_root)
|
||||
assert css_obj, f"Css style {css_href} was not in manifest."
|
||||
css_content: str = css_obj.get_content().decode()
|
||||
return css_content
|
||||
|
||||
def process_inline_styles_in_html_soup(self):
|
||||
"""This function is designed to convert inline html styles"""
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
inline_style = tag_initial_inline_style.attrs["style"]
|
||||
tag_initial_inline_style.attrs["style"] = \
|
||||
self.css_processor.build_inline_style_content(inline_style)
|
||||
|
||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||
"""
|
||||
Function is designed to get 2 dictionaries:
|
||||
@@ -174,7 +146,7 @@ class EpubConverter:
|
||||
if css_href not in css_href2css_content:
|
||||
# css_href not in css_href2css_content, add to this dict
|
||||
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
|
||||
self.get_css_content(css_href, html_href))
|
||||
self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
|
||||
|
||||
for i, tag in enumerate(soup_html_content.find_all("style")):
|
||||
css_content = tag.string
|
||||
@@ -183,7 +155,8 @@ class EpubConverter:
|
||||
css_content)
|
||||
return html_href2css_href, css_href2css_content
|
||||
|
||||
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
||||
@staticmethod
|
||||
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
||||
"""
|
||||
Function adds styles from .css to inline style.
|
||||
Parameters
|
||||
@@ -224,7 +197,10 @@ class EpubConverter:
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
- add to html_inline_style css_style_content
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
updated soups with styles from css
|
||||
"""
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
if self.html_href2css_href.get(html_href):
|
||||
@@ -232,7 +208,8 @@ class EpubConverter:
|
||||
for css_href in self.html_href2css_href[html_href]:
|
||||
css += self.css_href2css_content[css_href]
|
||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||
html_content = self.convert_html_soup_with_css_style(html_content, css)
|
||||
html_content = self.modify_html_soup_with_css_styles(
|
||||
html_content, css)
|
||||
self.html_href2html_body_soup[html_href] = html_content
|
||||
|
||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||
@@ -259,7 +236,7 @@ class EpubConverter:
|
||||
nav_point = NavPoint(element)
|
||||
if nav_point.id:
|
||||
self.id_anchor_exist_in_nav_points = True
|
||||
self.html_href2subchapter_ids[nav_point.href].append(
|
||||
self.html_href2subchapters_ids[nav_point.href].append(
|
||||
nav_point.id)
|
||||
self.adjacency_list[nav_point] = None
|
||||
self.hrefs_added_to_toc.add(nav_point.href)
|
||||
@@ -271,7 +248,7 @@ class EpubConverter:
|
||||
nav_point = NavPoint(first)
|
||||
if nav_point.id:
|
||||
self.id_anchor_exist_in_nav_points = True
|
||||
self.html_href2subchapter_ids[nav_point.href].append(
|
||||
self.html_href2subchapters_ids[nav_point.href].append(
|
||||
nav_point.id)
|
||||
|
||||
sub_nodes = []
|
||||
@@ -357,25 +334,19 @@ class EpubConverter:
|
||||
for html_href in self.html_href2html_body_soup:
|
||||
chapter_tag = self.html_href2html_body_soup[html_href]
|
||||
# check marks for chapter starting are on the same level - 1st
|
||||
marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})
|
||||
marks = chapter_tag.find_all(
|
||||
attrs={"class": "converter-chapter-mark"})
|
||||
|
||||
# fix marks to be on 1 level
|
||||
for mark in marks:
|
||||
while mark.parent != chapter_tag:
|
||||
mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
|
||||
# todo warning! could reflect on formatting/internal links in some cases
|
||||
mark.parent.unwrap()
|
||||
|
||||
@staticmethod
|
||||
def create_unique_id(href, id_):
|
||||
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
|
||||
|
||||
@staticmethod
|
||||
def create_new_anchor_span(soup, id_):
|
||||
new_anchor_span = soup.new_tag("span")
|
||||
new_anchor_span.attrs["id"] = id_
|
||||
new_anchor_span.attrs["class"] = "link-anchor"
|
||||
new_anchor_span.string = "\xa0"
|
||||
return new_anchor_span
|
||||
|
||||
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
|
||||
"""
|
||||
Function used to find full path to file that is parsed from tag link
|
||||
@@ -414,6 +385,14 @@ class EpubConverter:
|
||||
|
||||
return full_path[0]
|
||||
|
||||
@staticmethod
|
||||
def create_new_anchor_span(soup, id_):
|
||||
new_anchor_span = soup.new_tag("span")
|
||||
new_anchor_span.attrs["id"] = id_
|
||||
new_anchor_span.attrs["class"] = "link-anchor"
|
||||
new_anchor_span.string = "\xa0"
|
||||
return new_anchor_span
|
||||
|
||||
def process_internal_links(self):
|
||||
"""
|
||||
Function
|
||||
@@ -520,8 +499,7 @@ class EpubConverter:
|
||||
@staticmethod
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
"""
|
||||
After processing on a first_id that corresponds to current chapter,
|
||||
from initial html_soup all tags from current chapter are extracted
|
||||
Get tags between LiveCarta chapter marks
|
||||
Parameters
|
||||
----------
|
||||
first_id: str
|
||||
@@ -553,7 +531,6 @@ class EpubConverter:
|
||||
# save them in list for next steps
|
||||
tags = [tag.extract() for tag in tags]
|
||||
html_soup.smooth()
|
||||
|
||||
else:
|
||||
assert 0, f"Warning: no match for {first_id, href}"
|
||||
|
||||
@@ -594,7 +571,7 @@ class EpubConverter:
|
||||
for sub_node in self.adjacency_list[nav_point]:
|
||||
self.detect_one_chapter(sub_node)
|
||||
|
||||
def define_chapters_content(self):
|
||||
def define_chapters_with_content(self):
|
||||
"""Function build chapters content, starts from top level chapters"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
@@ -618,11 +595,9 @@ class EpubConverter:
|
||||
|
||||
"""
|
||||
title = nav_point.title
|
||||
if nav_point.id:
|
||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(
|
||||
nav_point.href, nav_point.id)]
|
||||
else:
|
||||
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
|
||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
|
||||
if nav_point.id else self.html_href2html_body_soup[nav_point.href]
|
||||
|
||||
self.book_image_src_path2aws_path = update_images_src_links(content,
|
||||
self.img_href2img_bytes,
|
||||
path_to_html=nav_point.href,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from src.book_solver import BookSolver
|
||||
from src.preset_processor import PresetProcessor
|
||||
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||
from src.epub_converter.css_processor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
||||
from src.epub_converter.epub_converter import EpubConverter
|
||||
|
||||
|
||||
@@ -30,10 +30,10 @@ class EpubBook(BookSolver):
|
||||
"""
|
||||
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
|
||||
.get_preset_json()
|
||||
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
|
||||
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
|
||||
css_processor = CSSPreprocessor()
|
||||
html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
|
||||
json_converter = EpubConverter(
|
||||
self.file_path, access=self.access, logger=self.logger_object,
|
||||
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
||||
css_processor=css_processor, html_processor=html_processor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
return content_dict
|
||||
|
||||
Reference in New Issue
Block a user