forked from LiveCarta/BookConverter
Rewrite style processing to make it common[Epub, Docx]
This commit is contained in:
@@ -7,7 +7,6 @@ from pathlib import Path
|
|||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from ebooklib.epub import Link, Section
|
from ebooklib.epub import Link, Section
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from premailer import transform
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import List, Tuple, Dict, Union
|
from typing import List, Tuple, Dict, Union
|
||||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||||
@@ -15,20 +14,21 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.style_preprocessor import CSSPreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||||
from src.epub_converter.image_processing import update_images_src_links
|
from src.epub_converter.image_processing import update_images_src_links
|
||||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||||
from src.tag_inline_style_processor import TagInlineStyleProcessor
|
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||||
|
|
||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, book_path, access=None, logger: BookLogger = None, css_processor: CSSPreprocessor = None, html_processor: HtmlEpubPreprocessor = None):
|
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
||||||
|
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
|
||||||
self.book_path = book_path
|
self.book_path = book_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(book_path)
|
self.ebooklib_book = epub.read_epub(book_path)
|
||||||
self.css_processor = css_processor
|
self.style_processor = style_processor
|
||||||
self.html_processor = html_processor
|
self.html_processor = html_processor
|
||||||
|
|
||||||
# main container for all epub .xhtml files
|
# main container for all epub .xhtml files
|
||||||
@@ -71,8 +71,8 @@ class EpubConverter:
|
|||||||
BeautifulSoup] = self.build_href2soup_content()
|
BeautifulSoup] = self.build_href2soup_content()
|
||||||
|
|
||||||
self.logger.log("CSS inline style processing.")
|
self.logger.log("CSS inline style processing.")
|
||||||
self.css_processor.process_inline_styles_in_html_soup(
|
[self.style_processor.process_inline_styles_in_html_soup(
|
||||||
self.html_href2html_body_soup)
|
self.html_href2html_body_soup[html_href]) for html_href in self.html_href2html_body_soup]
|
||||||
self.logger.log("CSS files processing.")
|
self.logger.log("CSS files processing.")
|
||||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||||
self.logger.log("CSS styles fusion(inline+file).")
|
self.logger.log("CSS styles fusion(inline+file).")
|
||||||
@@ -147,54 +147,16 @@ class EpubConverter:
|
|||||||
html_href2css_href[html_href].append(css_href)
|
html_href2css_href[html_href].append(css_href)
|
||||||
if css_href not in css_href2css_content:
|
if css_href not in css_href2css_content:
|
||||||
# css_href not in css_href2css_content, add to this dict
|
# css_href not in css_href2css_content, add to this dict
|
||||||
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
|
css_href2css_content[css_href] = self.style_processor.build_css_file_content(
|
||||||
self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
|
self.style_processor.get_css_content(css_href, html_href, self.ebooklib_book))
|
||||||
|
|
||||||
for i, tag in enumerate(soup_html_content.find_all("style")):
|
for i, tag in enumerate(soup_html_content.find_all("style")):
|
||||||
css_content = tag.string
|
css_content = tag.string
|
||||||
html_href2css_href[html_href].append(f"href{i}")
|
html_href2css_href[html_href].append(f"href{i}")
|
||||||
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
|
css_href2css_content[f"href{i}"] = self.style_processor.build_css_file_content(
|
||||||
css_content)
|
css_content)
|
||||||
return html_href2css_href, css_href2css_content
|
return html_href2css_href, css_href2css_content
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
|
||||||
"""
|
|
||||||
Function adds styles from .css to inline style.
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
html_soup: BeautifulSoup
|
|
||||||
html page with inline style
|
|
||||||
css_text: str
|
|
||||||
css content from css file
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
inline_soup: BeautifulSoup
|
|
||||||
soup with styles from css
|
|
||||||
|
|
||||||
"""
|
|
||||||
# remove this specification because it causes problems
|
|
||||||
css_text = css_text.replace(
|
|
||||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
|
||||||
# here we add css styles to inline style
|
|
||||||
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
|
||||||
remove_classes=False,
|
|
||||||
external_styles=False,
|
|
||||||
allow_network=False,
|
|
||||||
disable_validation=True,
|
|
||||||
)
|
|
||||||
# soup with converted styles from css
|
|
||||||
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
|
|
||||||
|
|
||||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
|
||||||
attrs={"style": re.compile(".*")})
|
|
||||||
|
|
||||||
# go through the tags with inline style + style parsed from css file
|
|
||||||
for tag_inline_style in tags_with_inline_style:
|
|
||||||
style_converter = TagInlineStyleProcessor(tag_inline_style)
|
|
||||||
style_converter.convert_initial_tag()
|
|
||||||
return inline_soup
|
|
||||||
|
|
||||||
def add_css_styles_to_html_soup(self):
|
def add_css_styles_to_html_soup(self):
|
||||||
"""
|
"""
|
||||||
This function is designed to update html_href2html_body_soup
|
This function is designed to update html_href2html_body_soup
|
||||||
@@ -210,7 +172,7 @@ class EpubConverter:
|
|||||||
for css_href in self.html_href2css_href[html_href]:
|
for css_href in self.html_href2css_href[html_href]:
|
||||||
css += self.css_href2css_content[css_href]
|
css += self.css_href2css_content[css_href]
|
||||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||||
html_content = self.modify_html_soup_with_css_styles(
|
html_content = modify_html_soup_with_css_styles(
|
||||||
html_content, css)
|
html_content, css)
|
||||||
self.html_href2html_body_soup[html_href] = html_content
|
self.html_href2html_body_soup[html_href] = html_content
|
||||||
|
|
||||||
@@ -646,15 +608,16 @@ class EpubConverter:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
epub_file_path = "../../books/epub/9781119646044.epub"
|
epub_file_path = "../../books/epub/9780763774134.epub"
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||||
|
|
||||||
css_processor = CSSPreprocessor()
|
css_processor = StylePreprocessor()
|
||||||
html_processor = HtmlEpubPreprocessor(logger=logger_object)
|
html_processor = HtmlEpubProcessor(
|
||||||
|
"../../presets/presets.json", logger=logger_object)
|
||||||
|
|
||||||
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||||
css_processor=css_processor, html_processor=html_processor)
|
style_processor=css_processor, html_processor=html_processor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from src.book_solver import BookSolver
|
from src.book_solver import BookSolver
|
||||||
from src.style_preprocessor import CSSPreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||||
from src.epub_converter.epub_converter import EpubConverter
|
from src.epub_converter.epub_converter import EpubConverter
|
||||||
|
|
||||||
|
|
||||||
@@ -25,11 +25,11 @@ class EpubBook(BookSolver):
|
|||||||
json for LiveCarta platform
|
json for LiveCarta platform
|
||||||
|
|
||||||
"""
|
"""
|
||||||
css_processor = CSSPreprocessor()
|
style_processor = StylePreprocessor()
|
||||||
html_processor = HtmlEpubPreprocessor(
|
html_processor = HtmlEpubProcessor(
|
||||||
self.preset_path, logger=self.logger_object)
|
logger=self.logger_object)
|
||||||
json_converter = EpubConverter(
|
json_converter = EpubConverter(
|
||||||
self.book_path, access=self.access, logger=self.logger_object,
|
self.book_path, access=self.access, logger=self.logger_object,
|
||||||
css_processor=css_processor, html_processor=html_processor)
|
style_processor=style_processor, html_processor=html_processor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from src.util.color_reader import str2hex
|
|||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
|
|
||||||
class CSSPreprocessor:
|
class StylePreprocessor:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""
|
"""
|
||||||
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
|
||||||
@@ -160,17 +160,15 @@ class CSSPreprocessor:
|
|||||||
style = "; ".join(split_style)
|
style = "; ".join(split_style)
|
||||||
return style
|
return style
|
||||||
|
|
||||||
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: Dict[str, BeautifulSoup]):
|
def process_inline_styles_in_html_soup(self, html_content):
|
||||||
"""This function is designed to convert inline html styles"""
|
"""This function is designed to convert inline html styles"""
|
||||||
for html_href in html_href2html_body_soup:
|
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||||
html_content: BeautifulSoup = html_href2html_body_soup[html_href]
|
attrs={"style": re.compile(".*")})
|
||||||
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
|
||||||
attrs={"style": re.compile(".*")})
|
|
||||||
|
|
||||||
for tag_initial_inline_style in tags_with_inline_style:
|
for tag_initial_inline_style in tags_with_inline_style:
|
||||||
inline_style = tag_initial_inline_style.attrs["style"]
|
inline_style = tag_initial_inline_style.attrs["style"]
|
||||||
tag_initial_inline_style.attrs["style"] = \
|
tag_initial_inline_style.attrs["style"] = \
|
||||||
self.build_inline_style_content(inline_style)
|
self.build_inline_style_content(inline_style)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str:
|
def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str:
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import re
|
|||||||
import cssutils
|
import cssutils
|
||||||
from typing import List
|
from typing import List
|
||||||
from logging import CRITICAL
|
from logging import CRITICAL
|
||||||
|
from premailer import transform
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
@@ -215,3 +216,41 @@ class TagInlineStyleProcessor:
|
|||||||
self.change_attrs_with_corresponding_tags()
|
self.change_attrs_with_corresponding_tags()
|
||||||
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
|
||||||
return self.tag_inline_style
|
return self.tag_inline_style
|
||||||
|
|
||||||
|
|
||||||
|
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "") -> BeautifulSoup:
|
||||||
|
"""
|
||||||
|
Function adds styles from .css to inline style.
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
html_soup: BeautifulSoup
|
||||||
|
html page with inline style
|
||||||
|
css_text: str
|
||||||
|
css content from css file
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
inline_soup: BeautifulSoup
|
||||||
|
soup with styles from css
|
||||||
|
|
||||||
|
"""
|
||||||
|
# remove this specification because it causes problems
|
||||||
|
css_text = css_text.replace(
|
||||||
|
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||||
|
# here we add css styles to inline style
|
||||||
|
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
||||||
|
remove_classes=False,
|
||||||
|
external_styles=False,
|
||||||
|
allow_network=False,
|
||||||
|
disable_validation=True,
|
||||||
|
)
|
||||||
|
# soup with converted styles from css
|
||||||
|
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
|
||||||
|
|
||||||
|
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||||
|
attrs={"style": re.compile(".*")})
|
||||||
|
|
||||||
|
# go through the tags with inline style + style parsed from css file
|
||||||
|
for tag_inline_style in tags_with_inline_style:
|
||||||
|
style_converter = TagInlineStyleProcessor(tag_inline_style)
|
||||||
|
style_converter.convert_initial_tag()
|
||||||
|
return inline_soup
|
||||||
|
|||||||
Reference in New Issue
Block a user