Rewrite style processing to make it common for Epub and Docx

This commit is contained in:
Kiryl
2022-09-02 14:43:16 +03:00
parent d71ef44178
commit c602d9974a
4 changed files with 70 additions and 70 deletions

View File

@@ -7,7 +7,6 @@ from pathlib import Path
from ebooklib import epub
from ebooklib.epub import Link, Section
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
@@ -15,20 +14,21 @@ from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.style_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.tag_inline_style_processor import TagInlineStyleProcessor
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
class EpubConverter:
def __init__(self, book_path, access=None, logger: BookLogger = None, css_processor: CSSPreprocessor = None, html_processor: HtmlEpubPreprocessor = None):
def __init__(self, book_path, access=None, logger: BookLogger = None,
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
self.book_path = book_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(book_path)
self.css_processor = css_processor
self.style_processor = style_processor
self.html_processor = html_processor
# main container for all epub .xhtml files
@@ -71,8 +71,8 @@ class EpubConverter:
BeautifulSoup] = self.build_href2soup_content()
self.logger.log("CSS inline style processing.")
self.css_processor.process_inline_styles_in_html_soup(
self.html_href2html_body_soup)
[self.style_processor.process_inline_styles_in_html_soup(
self.html_href2html_body_soup[html_href]) for html_href in self.html_href2html_body_soup]
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles fusion(inline+file).")
@@ -147,54 +147,16 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
css_href2css_content[css_href] = self.style_processor.build_css_file_content(
self.style_processor.get_css_content(css_href, html_href, self.ebooklib_book))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_href2css_content[f"href{i}"] = self.style_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
@staticmethod
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Merge stylesheet rules into the inline styles of a page and convert the styled tags.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page, possibly already carrying inline style
    css_text: str
        text of the css that has to be merged into the inline styles

    Returns
    -------
    inline_soup: BeautifulSoup
        freshly parsed soup whose styled tags went through TagInlineStyleProcessor

    """
    # this declaration causes problems, so it is stripped before inlining
    epub_namespace_rule = '@namespace epub "http://www.idpf.org/2007/ops";'
    cleaned_css = css_text.replace(epub_namespace_rule, '')

    # premailer's transform() puts the css rules into the style attributes
    premailer_options = {
        "remove_classes": False,
        "external_styles": False,
        "allow_network": False,
        "disable_validation": True,
    }
    inlined_markup: str = transform(
        str(html_soup), css_text=cleaned_css, **premailer_options)

    # re-parse the transformed markup to get a soup with the merged styles
    inline_soup = BeautifulSoup(inlined_markup, features="lxml")
    any_style_value = re.compile(".*")
    styled_tags = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": any_style_value},
    )

    # each of these tags holds its own inline style + the style taken from css
    for styled_tag in styled_tags:
        TagInlineStyleProcessor(styled_tag).convert_initial_tag()
    return inline_soup
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
@@ -210,7 +172,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = self.modify_html_soup_with_css_styles(
html_content = modify_html_soup_with_css_styles(
html_content, css)
self.html_href2html_body_soup[html_href] = html_content
@@ -646,15 +608,16 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../books/epub/9781119646044.epub"
epub_file_path = "../../books/epub/9780763774134.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
css_processor = CSSPreprocessor()
html_processor = HtmlEpubPreprocessor(logger=logger_object)
css_processor = StylePreprocessor()
html_processor = HtmlEpubProcessor(
"../../presets/presets.json", logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_processor=css_processor, html_processor=html_processor)
style_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:

View File

@@ -1,6 +1,6 @@
from src.book_solver import BookSolver
from src.style_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.epub_converter import EpubConverter
@@ -25,11 +25,11 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
css_processor = CSSPreprocessor()
html_processor = HtmlEpubPreprocessor(
self.preset_path, logger=self.logger_object)
style_processor = StylePreprocessor()
html_processor = HtmlEpubProcessor(
logger=self.logger_object)
json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object,
css_processor=css_processor, html_processor=html_processor)
style_processor=style_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
return content_dict

View File

@@ -8,7 +8,7 @@ from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
class CSSPreprocessor:
class StylePreprocessor:
def __init__(self):
"""
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
@@ -160,17 +160,15 @@ class CSSPreprocessor:
style = "; ".join(split_style)
return style
def process_inline_styles_in_html_soup(self, html_href2html_body_soup: Dict[str, BeautifulSoup]):
def process_inline_styles_in_html_soup(self, html_content):
"""This function is designed to convert inline html styles"""
for html_href in html_href2html_body_soup:
html_content: BeautifulSoup = html_href2html_body_soup[html_href]
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={"style": re.compile(".*")})
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={"style": re.compile(".*")})
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
self.build_inline_style_content(inline_style)
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
self.build_inline_style_content(inline_style)
@staticmethod
def get_css_content(css_href: str, html_href: str, ebooklib_book) -> str:

View File

@@ -2,6 +2,7 @@ import re
import cssutils
from typing import List
from logging import CRITICAL
from premailer import transform
from bs4 import BeautifulSoup, Tag
from src.livecarta_config import LiveCartaConfig
@@ -215,3 +216,41 @@ class TagInlineStyleProcessor:
self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str = "") -> BeautifulSoup:
    """
    Inline css rules into the tags of a page, then convert every styled tag.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content to merge into the inline styles (empty by default)

    Returns
    -------
    inline_soup: BeautifulSoup
        new soup built from the transformed markup, with its styled tags
        processed by TagInlineStyleProcessor

    """
    # the epub namespace declaration causes problems, so drop it up front
    namespace_declaration = '@namespace epub "http://www.idpf.org/2007/ops";'
    css_without_namespace = css_text.replace(namespace_declaration, '')

    # premailer moves the css rules into the style attribute of the tags
    transform_kwargs = dict(
        remove_classes=False,
        external_styles=False,
        allow_network=False,
        disable_validation=True,
    )
    html_with_css_styles: str = transform(
        str(html_soup), css_text=css_without_namespace, **transform_kwargs)

    # soup rebuilt from the markup that now carries the css-derived styles
    inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
    style_attr_filter = {"style": re.compile(".*")}
    tags_with_style = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs=style_attr_filter,
    )

    # walk the tags holding inline style + style parsed from the css
    for tag_with_style in tags_with_style:
        converter = TagInlineStyleProcessor(tag_with_style)
        converter.convert_initial_tag()
    return inline_soup