forked from LiveCarta/BookConverter
Rewrite style processing to make it common[Epub, Docx]
This commit is contained in:
@@ -7,7 +7,6 @@ from pathlib import Path
|
||||
from ebooklib import epub
|
||||
from ebooklib.epub import Link, Section
|
||||
from itertools import chain
|
||||
from premailer import transform
|
||||
from collections import defaultdict
|
||||
from typing import List, Tuple, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
@@ -15,20 +14,21 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
from src.util.helpers import BookLogger
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.style_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
||||
from src.style_preprocessor import StylePreprocessor
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||
from src.epub_converter.image_processing import update_images_src_links
|
||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||
from src.tag_inline_style_processor import TagInlineStyleProcessor
|
||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||
|
||||
|
||||
class EpubConverter:
|
||||
def __init__(self, book_path, access=None, logger: BookLogger = None, css_processor: CSSPreprocessor = None, html_processor: HtmlEpubPreprocessor = None):
|
||||
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
||||
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
|
||||
self.book_path = book_path
|
||||
self.access = access
|
||||
self.logger: BookLogger = logger
|
||||
self.ebooklib_book = epub.read_epub(book_path)
|
||||
self.css_processor = css_processor
|
||||
self.style_processor = style_processor
|
||||
self.html_processor = html_processor
|
||||
|
||||
# main container for all epub .xhtml files
|
||||
@@ -71,8 +71,8 @@ class EpubConverter:
|
||||
BeautifulSoup] = self.build_href2soup_content()
|
||||
|
||||
self.logger.log("CSS inline style processing.")
|
||||
self.css_processor.process_inline_styles_in_html_soup(
|
||||
self.html_href2html_body_soup)
|
||||
[self.style_processor.process_inline_styles_in_html_soup(
|
||||
self.html_href2html_body_soup[html_href]) for html_href in self.html_href2html_body_soup]
|
||||
self.logger.log("CSS files processing.")
|
||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||
self.logger.log("CSS styles fusion(inline+file).")
|
||||
@@ -147,54 +147,16 @@ class EpubConverter:
|
||||
html_href2css_href[html_href].append(css_href)
|
||||
if css_href not in css_href2css_content:
|
||||
# css_href not in css_href2css_content, add to this dict
|
||||
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
|
||||
self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
|
||||
css_href2css_content[css_href] = self.style_processor.build_css_file_content(
|
||||
self.style_processor.get_css_content(css_href, html_href, self.ebooklib_book))
|
||||
|
||||
for i, tag in enumerate(soup_html_content.find_all("style")):
|
||||
css_content = tag.string
|
||||
html_href2css_href[html_href].append(f"href{i}")
|
||||
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
|
||||
css_href2css_content[f"href{i}"] = self.style_processor.build_css_file_content(
|
||||
css_content)
|
||||
return html_href2css_href, css_href2css_content
|
||||
|
||||
@staticmethod
|
||||
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
||||
"""
|
||||
Function adds styles from .css to inline style.
|
||||
Parameters
|
||||
----------
|
||||
html_soup: BeautifulSoup
|
||||
html page with inline style
|
||||
css_text: str
|
||||
css content from css file
|
||||
Returns
|
||||
-------
|
||||
inline_soup: BeautifulSoup
|
||||
soup with styles from css
|
||||
|
||||
"""
|
||||
# remove this specification because it causes problems
|
||||
css_text = css_text.replace(
|
||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||
# here we add css styles to inline style
|
||||
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
||||
remove_classes=False,
|
||||
external_styles=False,
|
||||
allow_network=False,
|
||||
disable_validation=True,
|
||||
)
|
||||
# soup with converted styles from css
|
||||
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
|
||||
|
||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
# go through the tags with inline style + style parsed from css file
|
||||
for tag_inline_style in tags_with_inline_style:
|
||||
style_converter = TagInlineStyleProcessor(tag_inline_style)
|
||||
style_converter.convert_initial_tag()
|
||||
return inline_soup
|
||||
|
||||
def add_css_styles_to_html_soup(self):
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
@@ -210,7 +172,7 @@ class EpubConverter:
|
||||
for css_href in self.html_href2css_href[html_href]:
|
||||
css += self.css_href2css_content[css_href]
|
||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||
html_content = self.modify_html_soup_with_css_styles(
|
||||
html_content = modify_html_soup_with_css_styles(
|
||||
html_content, css)
|
||||
self.html_href2html_body_soup[html_href] = html_content
|
||||
|
||||
@@ -646,15 +608,16 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../books/epub/9781119646044.epub"
|
||||
epub_file_path = "../../books/epub/9780763774134.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
css_processor = CSSPreprocessor()
|
||||
html_processor = HtmlEpubPreprocessor(logger=logger_object)
|
||||
css_processor = StylePreprocessor()
|
||||
html_processor = HtmlEpubProcessor(
|
||||
"../../presets/presets.json", logger=logger_object)
|
||||
|
||||
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||
css_processor=css_processor, html_processor=html_processor)
|
||||
style_processor=css_processor, html_processor=html_processor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from src.book_solver import BookSolver
|
||||
from src.style_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
||||
from src.style_preprocessor import StylePreprocessor
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||
from src.epub_converter.epub_converter import EpubConverter
|
||||
|
||||
|
||||
@@ -25,11 +25,11 @@ class EpubBook(BookSolver):
|
||||
json for LiveCarta platform
|
||||
|
||||
"""
|
||||
css_processor = CSSPreprocessor()
|
||||
html_processor = HtmlEpubPreprocessor(
|
||||
self.preset_path, logger=self.logger_object)
|
||||
style_processor = StylePreprocessor()
|
||||
html_processor = HtmlEpubProcessor(
|
||||
logger=self.logger_object)
|
||||
json_converter = EpubConverter(
|
||||
self.book_path, access=self.access, logger=self.logger_object,
|
||||
css_processor=css_processor, html_processor=html_processor)
|
||||
style_processor=style_processor, html_processor=html_processor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
return content_dict
|
||||
|
||||
@@ -8,7 +8,7 @@ from src.util.color_reader import str2hex
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
class CSSPreprocessor:
|
||||
class StylePreprocessor:
|
||||
def __init__(self):
|
||||
"""
|
||||
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||
@@ -160,17 +160,15 @@ class CSSPreprocessor:
|
||||
style = "; ".join(split_style)
|
||||
return style
|
||||
|
||||
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: Dict[str, BeautifulSoup]):
|
||||
def process_inline_styles_in_html_soup(self, html_content):
|
||||
"""This function is designed to convert inline html styles"""
|
||||
for html_href in html_href2html_body_soup:
|
||||
html_content: BeautifulSoup = html_href2html_body_soup[html_href]
|
||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
inline_style = tag_initial_inline_style.attrs["style"]
|
||||
tag_initial_inline_style.attrs["style"] = \
|
||||
self.build_inline_style_content(inline_style)
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
inline_style = tag_initial_inline_style.attrs["style"]
|
||||
tag_initial_inline_style.attrs["style"] = \
|
||||
self.build_inline_style_content(inline_style)
|
||||
|
||||
@staticmethod
|
||||
def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str:
|
||||
|
||||
@@ -2,6 +2,7 @@ import re
|
||||
import cssutils
|
||||
from typing import List
|
||||
from logging import CRITICAL
|
||||
from premailer import transform
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
@@ -215,3 +216,41 @@ class TagInlineStyleProcessor:
|
||||
self.change_attrs_with_corresponding_tags()
|
||||
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
||||
return self.tag_inline_style
|
||||
|
||||
|
||||
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "") -> BeautifulSoup:
|
||||
"""
|
||||
Function adds styles from .css to inline style.
|
||||
Parameters
|
||||
----------
|
||||
html_soup: BeautifulSoup
|
||||
html page with inline style
|
||||
css_text: str
|
||||
css content from css file
|
||||
Returns
|
||||
-------
|
||||
inline_soup: BeautifulSoup
|
||||
soup with styles from css
|
||||
|
||||
"""
|
||||
# remove this specification because it causes problems
|
||||
css_text = css_text.replace(
|
||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||
# here we add css styles to inline style
|
||||
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
||||
remove_classes=False,
|
||||
external_styles=False,
|
||||
allow_network=False,
|
||||
disable_validation=True,
|
||||
)
|
||||
# soup with converted styles from css
|
||||
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
|
||||
|
||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
# go through the tags with inline style + style parsed from css file
|
||||
for tag_inline_style in tags_with_inline_style:
|
||||
style_converter = TagInlineStyleProcessor(tag_inline_style)
|
||||
style_converter.convert_initial_tag()
|
||||
return inline_soup
|
||||
|
||||
Reference in New Issue
Block a user