forked from LiveCarta/BookConverter
put style processors on general level
This commit is contained in:
@@ -15,15 +15,15 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.epub_converter.css_processor import CSSPreprocessor
|
from src.style_preprocessor import CSSPreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
||||||
from src.epub_converter.image_processing import update_images_src_links
|
from src.epub_converter.image_processing import update_images_src_links
|
||||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||||
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
|
from src.tag_inline_style_processor import TagInlineStyleProcessor
|
||||||
|
|
||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, book_path, access=None, logger=None, css_processor=None, html_processor=None):
|
def __init__(self, book_path, access=None, logger: BookLogger = None, css_processor: CSSPreprocessor = None, html_processor: HtmlEpubPreprocessor = None):
|
||||||
self.book_path = book_path
|
self.book_path = book_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
@@ -257,7 +257,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
for elem in second:
|
for elem in second:
|
||||||
if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1:
|
if (bool(re.search("^section$|^part$", first.title.lower()))) and lvl == 1:
|
||||||
self.offset_sub_nodes.append(
|
self.offset_sub_nodes.append(
|
||||||
self.build_adjacency_list_from_toc(elem, lvl))
|
self.build_adjacency_list_from_toc(elem, lvl))
|
||||||
else:
|
else:
|
||||||
@@ -291,7 +291,7 @@ class EpubConverter:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def build_adjacency_list_from_spine(self):
|
def build_adjacency_list_from_spine(self):
|
||||||
def build_manifest_id2html_href() -> dict:
|
def build_manifest_id2html_href() -> Dict[int, str]:
|
||||||
links = dict()
|
links = dict()
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
links[item.id] = item.file_name
|
links[item.id] = item.file_name
|
||||||
@@ -607,7 +607,7 @@ class EpubConverter:
|
|||||||
self.logger.log(indent + "Process title.")
|
self.logger.log(indent + "Process title.")
|
||||||
title_preprocessed: str = self.html_processor.prepare_title(title)
|
title_preprocessed: str = self.html_processor.prepare_title(title)
|
||||||
self.logger.log(indent + "Process content.")
|
self.logger.log(indent + "Process content.")
|
||||||
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content(
|
content_preprocessed: Union[Tag, BeautifulSoup] = self.html_processor.prepare_content(
|
||||||
title_preprocessed, content, remove_title_from_chapter=is_chapter)
|
title_preprocessed, content, remove_title_from_chapter=is_chapter)
|
||||||
|
|
||||||
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
|
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from src.book_solver import BookSolver
|
from src.book_solver import BookSolver
|
||||||
from src.epub_converter.css_processor import CSSPreprocessor
|
from src.style_preprocessor import CSSPreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
|
||||||
from src.epub_converter.epub_converter import EpubConverter
|
from src.epub_converter.epub_converter import EpubConverter
|
||||||
|
|
||||||
|
|||||||
@@ -192,14 +192,18 @@ class HtmlEpubPreprocessor:
|
|||||||
tag_to_replace: str = rule["tag_to_replace"]
|
tag_to_replace: str = rule["tag_to_replace"]
|
||||||
if rule["condition"]:
|
if rule["condition"]:
|
||||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||||
if condition_on_tag[0] == 'parent_tags':
|
if condition_on_tag[0] == "parent_tags":
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
if tag.parent.select(condition_on_tag[1]):
|
if tag.parent.select(condition_on_tag[1]):
|
||||||
tag.name = tag_to_replace
|
tag.name = tag_to_replace
|
||||||
elif condition_on_tag[0] == 'child_tags':
|
elif condition_on_tag[0] == "child_tags":
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
|
if "not" in condition_on_tag[1]:
|
||||||
tag.name = tag_to_replace
|
if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
|
||||||
|
tag.name = tag_to_replace
|
||||||
|
else:
|
||||||
|
if tag.select(condition_on_tag[1]):
|
||||||
|
tag.name = tag_to_replace
|
||||||
elif condition_on_tag[0] == "attrs":
|
elif condition_on_tag[0] == "attrs":
|
||||||
for attr in rule["condition"]["attrs"]:
|
for attr in rule["condition"]["attrs"]:
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
@@ -236,15 +240,15 @@ class HtmlEpubPreprocessor:
|
|||||||
tag[attr_to_replace] = tag[attr]
|
tag[attr_to_replace] = tag[attr]
|
||||||
del tag[attr]
|
del tag[attr]
|
||||||
|
|
||||||
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]):
|
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: List[Dict[str, List[str]]]):
|
||||||
"""
|
"""
|
||||||
Function unwrap tags and moves id to span
|
Function unwrap tags and moves id to span
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
chapter_tag: BeautifulSoup
|
chapter_tag: BeautifulSoup
|
||||||
Tag & contents of the chapter tag
|
Tag & contents of the chapter tag
|
||||||
rules: Dict[str, List[str]]
|
rules: List[Dict[str, List[str]]]
|
||||||
dict of tags to unwrap
|
list of conditions when fire function
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -252,13 +256,14 @@ class HtmlEpubPreprocessor:
|
|||||||
Chapter Tag with unwrapped certain tags
|
Chapter Tag with unwrapped certain tags
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for tag_name in rules["tags"]:
|
for rule in rules:
|
||||||
for tag in chapter_tag.select(tag_name):
|
for tag_name in rule["tags"]:
|
||||||
# if tag is a subtag
|
for tag in chapter_tag.select(tag_name):
|
||||||
if ">" in tag_name:
|
# if tag is a subtag
|
||||||
tag.parent.attrs.update(tag.attrs)
|
if ">" in tag_name:
|
||||||
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
tag.parent.attrs.update(tag.attrs)
|
||||||
tag.unwrap()
|
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
||||||
|
tag.unwrap()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
|
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
|
||||||
@@ -293,14 +298,18 @@ class HtmlEpubPreprocessor:
|
|||||||
tags: List[str] = rule["tags"]
|
tags: List[str] = rule["tags"]
|
||||||
if rule["condition"]:
|
if rule["condition"]:
|
||||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||||
if condition_on_tag[0] == 'parent_tags':
|
if condition_on_tag[0] == "parent_tags":
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
if tag.parent.select(condition_on_tag[1]):
|
if tag.parent.select(condition_on_tag[1]):
|
||||||
insert(tag)
|
insert(tag)
|
||||||
elif condition_on_tag[0] == 'child_tags':
|
elif condition_on_tag[0] == "child_tags":
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
|
if "not" in condition_on_tag[1]:
|
||||||
insert(tag)
|
if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
|
||||||
|
tag.unwrap()
|
||||||
|
else:
|
||||||
|
if tag.select(condition_on_tag[1]):
|
||||||
|
tag.unwrap()
|
||||||
elif condition_on_tag[0] == "attrs":
|
elif condition_on_tag[0] == "attrs":
|
||||||
for attr in rule["condition"]["attrs"]:
|
for attr in rule["condition"]["attrs"]:
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
@@ -441,7 +450,7 @@ class HtmlEpubPreprocessor:
|
|||||||
# 3-6.
|
# 3-6.
|
||||||
for rule in self.preset:
|
for rule in self.preset:
|
||||||
func = self.name2function[rule["preset_name"]]
|
func = self.name2function[rule["preset_name"]]
|
||||||
func(content_tag, rule['rules'])
|
func(content_tag, rule["rules"])
|
||||||
# 7.
|
# 7.
|
||||||
if remove_title_from_chapter:
|
if remove_title_from_chapter:
|
||||||
self._remove_headings_content(content_tag, title_str)
|
self._remove_headings_content(content_tag, title_str)
|
||||||
|
|||||||
Reference in New Issue
Block a user