Move style processors to the general (top-level `src`) package

This commit is contained in:
Kiryl
2022-09-01 18:12:04 +03:00
parent 39d5e27df2
commit 115a53e366
5 changed files with 35 additions and 26 deletions

View File

@@ -15,15 +15,15 @@ from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_processor import CSSPreprocessor from src.style_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor from src.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter: class EpubConverter:
def __init__(self, book_path, access=None, logger=None, css_processor=None, html_processor=None): def __init__(self, book_path, access=None, logger: BookLogger = None, css_processor: CSSPreprocessor = None, html_processor: HtmlEpubPreprocessor = None):
self.book_path = book_path self.book_path = book_path
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
@@ -257,7 +257,7 @@ class EpubConverter:
sub_nodes = [] sub_nodes = []
for elem in second: for elem in second:
if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1: if (bool(re.search("^section$|^part$", first.title.lower()))) and lvl == 1:
self.offset_sub_nodes.append( self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl)) self.build_adjacency_list_from_toc(elem, lvl))
else: else:
@@ -291,7 +291,7 @@ class EpubConverter:
return False return False
def build_adjacency_list_from_spine(self): def build_adjacency_list_from_spine(self):
def build_manifest_id2html_href() -> dict: def build_manifest_id2html_href() -> Dict[int, str]:
links = dict() links = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
links[item.id] = item.file_name links[item.id] = item.file_name
@@ -607,7 +607,7 @@ class EpubConverter:
self.logger.log(indent + "Process title.") self.logger.log(indent + "Process title.")
title_preprocessed: str = self.html_processor.prepare_title(title) title_preprocessed: str = self.html_processor.prepare_title(title)
self.logger.log(indent + "Process content.") self.logger.log(indent + "Process content.")
content_preprocessed: BeautifulSoup = self.html_processor.prepare_content( content_preprocessed: Union[Tag, BeautifulSoup] = self.html_processor.prepare_content(
title_preprocessed, content, remove_title_from_chapter=is_chapter) title_preprocessed, content, remove_title_from_chapter=is_chapter)
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed, self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,

View File

@@ -1,5 +1,5 @@
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.epub_converter.css_processor import CSSPreprocessor from src.style_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter

View File

@@ -192,14 +192,18 @@ class HtmlEpubPreprocessor:
tag_to_replace: str = rule["tag_to_replace"] tag_to_replace: str = rule["tag_to_replace"]
if rule["condition"]: if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags': if condition_on_tag[0] == "parent_tags":
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if tag.parent.select(condition_on_tag[1]): if tag.parent.select(condition_on_tag[1]):
tag.name = tag_to_replace tag.name = tag_to_replace
elif condition_on_tag[0] == 'child_tags': elif condition_on_tag[0] == "child_tags":
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): if "not" in condition_on_tag[1]:
tag.name = tag_to_replace if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
tag.name = tag_to_replace
else:
if tag.select(condition_on_tag[1]):
tag.name = tag_to_replace
elif condition_on_tag[0] == "attrs": elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]: for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
@@ -236,15 +240,15 @@ class HtmlEpubPreprocessor:
tag[attr_to_replace] = tag[attr] tag[attr_to_replace] = tag[attr]
del tag[attr] del tag[attr]
def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: Dict[str, List[str]]): def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: List[Dict[str, List[str]]]):
""" """
Function unwrap tags and moves id to span Function unwrap tags and moves id to span
Parameters Parameters
---------- ----------
chapter_tag: BeautifulSoup chapter_tag: BeautifulSoup
Tag & contents of the chapter tag Tag & contents of the chapter tag
rules: Dict[str, List[str]] rules: List[Dict[str, List[str]]]
dict of tags to unwrap list of conditions when fire function
Returns Returns
------- -------
@@ -252,13 +256,14 @@ class HtmlEpubPreprocessor:
Chapter Tag with unwrapped certain tags Chapter Tag with unwrapped certain tags
""" """
for tag_name in rules["tags"]: for rule in rules:
for tag in chapter_tag.select(tag_name): for tag_name in rule["tags"]:
# if tag is a subtag for tag in chapter_tag.select(tag_name):
if ">" in tag_name: # if tag is a subtag
tag.parent.attrs.update(tag.attrs) if ">" in tag_name:
self._add_span_to_save_ids_for_links(tag, chapter_tag) tag.parent.attrs.update(tag.attrs)
tag.unwrap() self._add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap()
@staticmethod @staticmethod
def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup,
@@ -293,14 +298,18 @@ class HtmlEpubPreprocessor:
tags: List[str] = rule["tags"] tags: List[str] = rule["tags"]
if rule["condition"]: if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags': if condition_on_tag[0] == "parent_tags":
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if tag.parent.select(condition_on_tag[1]): if tag.parent.select(condition_on_tag[1]):
insert(tag) insert(tag)
elif condition_on_tag[0] == 'child_tags': elif condition_on_tag[0] == "child_tags":
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): if "not" in condition_on_tag[1]:
insert(tag) if not tag.select(re.sub("[():]|not", "", condition_on_tag[1])):
tag.unwrap()
else:
if tag.select(condition_on_tag[1]):
tag.unwrap()
elif condition_on_tag[0] == "attrs": elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]: for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
@@ -441,7 +450,7 @@ class HtmlEpubPreprocessor:
# 3-6. # 3-6.
for rule in self.preset: for rule in self.preset:
func = self.name2function[rule["preset_name"]] func = self.name2function[rule["preset_name"]]
func(content_tag, rule['rules']) func(content_tag, rule["rules"])
# 7. # 7.
if remove_title_from_chapter: if remove_title_from_chapter:
self._remove_headings_content(content_tag, title_str) self._remove_headings_content(content_tag, title_str)