add processing of JSON presets

This commit is contained in:
Kiryl
2022-07-07 19:32:24 +03:00
parent 687c09417a
commit c4752a19db
5 changed files with 497 additions and 417 deletions

View File

@@ -1,5 +1,4 @@
import os import os
import logging
import pathlib import pathlib
from shutil import copyfile from shutil import copyfile

View File

@@ -4,33 +4,34 @@ import codecs
import os import os
from os.path import dirname, normpath, join from os.path import dirname, normpath, join
from itertools import chain from itertools import chain
from premailer import transform
from collections import defaultdict from collections import defaultdict
from typing import Dict, Union, List from typing import Dict, Union, List
import ebooklib import ebooklib
from ebooklib import epub from ebooklib import epub
from ebooklib.epub import Link, Section from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
prepare_title, prepare_content
class EpubConverter: class EpubConverter:
def __init__(self, file_path, access=None, logger=None): def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
self.file_path = file_path self.file_path = file_path
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path) self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor
self.html_preprocessor = html_processor
# main container for all epub .xhtml files # main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -77,22 +78,12 @@ class EpubConverter:
self.logger.log("CSS styles adding.") self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup() self.add_css_styles_to_html_soup()
# todo presets
self.logger.log("Footnotes processing.") self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup: for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], self.footnotes_contents, self.noterefs, self.footnotes =\
self.html_href2html_body_soup) preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.footnotes_contents.extend(content)
self.noterefs.extend(noterefs)
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.") self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
not_added = [ not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f"Html documents not added to TOC: {not_added}.") self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added) self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f"Html internal links and structure processing.") self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id() self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
for tag_initial_inline_style in tags_with_inline_style: for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"] inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \ tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style) self.css_processor.build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]: def build_html_and_css_relations(self) -> tuple[dict, dict]:
""" """
@@ -181,16 +173,53 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href) html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content: if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict # css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content( css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href)) self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all("style")): for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string css_content = tag.string
html_href2css_href[html_href].append(f"href{i}") html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content( css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content) css_content)
return html_href2css_href, css_href2css_content return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css (a new soup; html_soup itself is not mutated,
        it is serialized with str() before transformation)
    """
    # remove this specification because it causes problems
    # (the epub @namespace rule is not valid CSS for premailer's parser)
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    # here we add css styles to inline style via premailer;
    # network access and validation are disabled on purpose
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    # soup with converted styles from css
    inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
    # every tag that may carry style on the LiveCarta side and currently
    # has any inline "style" attribute
    tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
                                                  attrs={"style": re.compile(".*")})
    # go through the tags with inline style + style parsed from css file;
    # TagInlineStyleProcessor mutates each tag's style in place
    for tag_inline_style in tags_with_inline_style:
        style_converter = TagInlineStyleProcessor(tag_inline_style)
        style_converter.convert_initial_tag()
    return inline_soup
def add_css_styles_to_html_soup(self): def add_css_styles_to_html_soup(self):
""" """
This function is designed to update html_href2html_body_soup This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]: for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href] css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css) html_content = self.convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}") f" Old id={a_tag_id}")
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted.

    Parameters
    ----------
    first_id: str
        Id that points where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapter's file (used only in the error message)
    html_soup: BeautifulSoup
        Soup object of current file; matched tags are removed from it in place

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags, extracted from html_soup

    Raises
    ------
    AssertionError
        If no tag with the given id and the chapter-mark class exists.
    """
    marked_tag = html_soup.find(
        attrs={"id": first_id, "class": "converter-chapter-mark"})
    if not marked_tag:
        # Raise explicitly instead of `assert 0, ...` so the check is not
        # stripped when Python runs with -O (same exception type/message).
        raise AssertionError(f"Warning: no match for {first_id, href}")
    tags = []
    next_tag = marked_tag.next_sibling
    while next_tag:
        # Stop at the next chapter mark: everything between two marks
        # belongs to the current chapter.
        # NOTE(review): bs4 usually exposes `class` as a list of strings,
        # so comparing to a plain string may never match — confirm the
        # converter writes a single-string class attribute here.
        if not isinstance(next_tag, NavigableString) and \
                (next_tag.attrs.get("class") == "converter-chapter-mark"):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id;
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    # merge adjacent NavigableStrings left behind by the extraction
    html_soup.smooth()
    return tags
def detect_one_chapter(self, nav_point: NavPoint): def detect_one_chapter(self, nav_point: NavPoint):
""" """
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
""" """
if nav_point.id: if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href] soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks( subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup) first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup("", "html.parser") new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags: for subchapter_tag in subchapter_tags:
new_tree.append(tag) new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[( self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
"""Function build chapters content, starts from top level chapters""" """Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points: for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(point) self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
""" """
@@ -561,8 +632,8 @@ class EpubConverter:
if hasattr(self.file_path, "stem") else "book_id") if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title) title_preprocessed = self.html_preprocessor.prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content, content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter) remove_title_from_chapter=is_chapter)
sub_nodes = [] sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter # warning! not EpubHtmlItems won't be added to chapter
@@ -598,11 +669,17 @@ class EpubConverter:
if __name__ == "__main__": if __name__ == "__main__":
epub_file_path = "../../epub/9781641050234.epub" epub_file_path = "../../epub/Modern_Java_in_Action.epub"
logger_object = BookLogger( logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1]) name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object) preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:

View File

@@ -1,4 +1,7 @@
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
Function Function
Steps Steps
---------- ----------
1. Converts .epub to .html 1. Gets data from preset structure
2. Parses from line structure to nested structure 2. Add preset to html preprocessor
3. Converts .epub to .html
4. Parses from line structure to nested structure
Returns Returns
---------- ----------
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
json for LiveCarta platform json for LiveCarta platform
""" """
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
json_converter = EpubConverter( json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object) self.file_path, access=self.access, logger=self.logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
return content_dict return content_dict

View File

@@ -1,11 +1,22 @@
import re import re
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from bs4 import BeautifulSoup, NavigableString, Tag, Comment from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): class HtmlEpubPreprocessor:
def __init__(self, preset, logger=None):
    """
    Parameters
    ----------
    preset: list
        parsed JSON presets: a sequence of dicts carrying "preset_name"
        and "rules" keys (consumed later in prepare_content)
    logger: BookLogger, optional
        project logger for progress/diagnostic messages
    """
    self.preset = preset
    self.logger: BookLogger = logger
    # Dispatch table: preset name from the JSON config -> preprocessing
    # method applied to the chapter content with that preset's rules.
    self.name2function = {
        "table_wrapper": self._wrap_tags_with_table,
        "replacer": self._tags_to_correspond_livecarta_tag,
        "unwrapper": self._unwrap_tags,
        "inserter": self._insert_tags_into_correspond_tags
    }
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
""" """
Function adds span with id from tag_to_be_removed Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract) because this tag will be removed(unwrapped/extract)
@@ -20,7 +31,9 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
updated body tag updated body tag
""" """
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
class_: list):
"""Function inserts span before tag aren't supported by LiveCarta""" """Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span") new_tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or "" new_tag.attrs["id"] = id_ or ""
@@ -33,50 +46,8 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou
id_=tag_to_be_removed.attrs["id"], id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class")) class_=tag_to_be_removed.attrs.get("class"))
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: def prepare_title(title_of_chapter: str) -> str:
"""
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
Parameters
----------
first_id: str
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
href: str
Name of current chapters file
html_soup: Tag
Soup object of current file
Returns
-------
tags: list [Tag, NavigableString]
Chapter's tags
"""
marked_tags = html_soup.find(
attrs={"id": first_id, "class": "converter-chapter-mark"})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
if not isinstance(next_tag, NavigableString) and \
(next_tag.attrs.get("class") == "converter-chapter-mark"):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
# remove tags between first_id and next found id
# save them in list for next steps
tags = [tag.extract() for tag in tags]
html_soup.smooth()
else:
assert 0, f"Warning: no match for {first_id, href}"
return tags
def prepare_title(title_of_chapter: str) -> str:
""" """
Function finalise processing/cleaning title Function finalise processing/cleaning title
Parameters Parameters
@@ -94,8 +65,8 @@ def prepare_title(title_of_chapter: str) -> str:
title = re.sub(r"[\s\xa0]", " ", title).strip() title = re.sub(r"[\s\xa0]", " ", title).strip()
return title return title
@staticmethod
def _remove_comments(chapter_tag): def _remove_comments(chapter_tag):
""" """
Function remove comments Function remove comments
Parameters Parameters
@@ -113,8 +84,8 @@ def _remove_comments(chapter_tag):
for element in tag(text=lambda text: isinstance(text, Comment)): for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract() element.extract()
@staticmethod
def _wrap_strings_with_p(chapter_tag): def _wrap_strings_with_p(chapter_tag):
""" """
Function converts headings that aren't supported by LiveCarta with <p> Function converts headings that aren't supported by LiveCarta with <p>
Parameters Parameters
@@ -137,8 +108,7 @@ def _wrap_strings_with_p(chapter_tag):
p_tag.append(str(node)) p_tag.append(str(node))
node.replace_with(p_tag) node.replace_with(p_tag)
def _wrap_tags_with_table(self, chapter_tag, rules: list):
def _wrap_tags_with_table(chapter_tag):
""" """
Function wraps <tag> with <table> Function wraps <tag> with <table>
Parameters Parameters
@@ -152,6 +122,7 @@ def _wrap_tags_with_table(chapter_tag):
Chapter Tag with wrapped certain tags with <table> Chapter Tag with wrapped certain tags with <table>
""" """
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
table = chapter_tag.new_tag("table") table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \ table.attrs["border"], table.attrs["align"], table.attrs["style"] \
@@ -173,21 +144,18 @@ def _wrap_tags_with_table(chapter_tag):
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
_add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap() tag_to_wrap.unwrap()
for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items(): for rule in rules:
if isinstance(attrs, tuple): tags = rule["tags"]
attr, val = attrs[0], attrs[1] for attr in rule["attrs"]:
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}): for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
process_tag_using_table(tag_to_wrap) {attr["name"]: re.compile(fr"{attr['value']}")}):
else:
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
process_tag_using_table(tag_to_wrap) process_tag_using_table(tag_to_wrap)
@staticmethod
def _tags_to_correspond_livecarta_tag(chapter_tag): def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
""" """
Function to replace all tags to correspond LiveCarta tags Function to replace all tags to correspond LiveCarta tags
Parameters Parameters
@@ -201,28 +169,30 @@ def _tags_to_correspond_livecarta_tag(chapter_tag):
Chapter Tag with all tags replaced with LiveCarta tags Chapter Tag with all tags replaced with LiveCarta tags
""" """
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items(): for rule in rules:
for key in reg_keys: tags = rule["tags"]
if isinstance(key, tuple): tag_to_replace = rule["tag_to_replace"]
replace = key[0] if rule["condition"]:
parent, child = key[1], key[2] for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
for parent_tag in chapter_tag.select(parent): if condition_on_tag[0] == 'parent_tags':
if replace == "parent": for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
parent_tag.name = to_replace_value if tag.parent.select(condition_on_tag[1]):
elif replace == "child": tag.name = tag_to_replace
for child_tag in parent_tag.select(child): elif condition_on_tag[0] == 'child_tags':
child_tag.name = to_replace_value for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not child_tag.attrs.get("style"): if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
child_tag.attrs["style"] =\ tag.name = tag_to_replace
"font-size: 14px; font-family: courier new,courier,monospace;" elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
tag.name = tag_to_replace
else: else:
tags = chapter_tag.find_all(re.compile(key)) for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
for tag in tags:
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section) # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
tag.name = to_replace_value tag.name = tag_to_replace
def _unwrap_tags(self, chapter_tag, rules: dict):
def _unwrap_tags(chapter_tag):
""" """
Function unwrap tags and moves id to span Function unwrap tags and moves id to span
Parameters Parameters
@@ -236,16 +206,61 @@ def _unwrap_tags(chapter_tag):
Chapter Tag with unwrapped certain tags Chapter Tag with unwrapped certain tags
""" """
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP: for tag_name in rules["tags"]:
for tag in chapter_tag.select(tag_name): for tag in chapter_tag.select(tag_name):
# if tag is a subtag # if tag is a subtag
if ">" in tag_name: if ">" in tag_name:
tag.parent.attrs.update(tag.attrs) tag.parent.attrs.update(tag.attrs)
_add_span_to_save_ids_for_links(tag, chapter_tag) self._add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap() tag.unwrap()
@staticmethod
def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
    """
    Function inserts tags into correspond tags

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: list
        preset rules; each rule is a dict with "tags", "tag_to_insert"
        and a "condition" mapping — presumably loaded from
        config/presets.json; confirm schema against that file

    Returns
    -------
    None
        Chapter Tag with inserted tags
    """
    def insert(tag, tag_to_insert):
        # insert all items that was in tag to subtag and remove from tag
        for content in reversed(tag.contents):
            tag_to_insert.insert(0, content.extract())
        # wrap subtag with items
        tag.append(tag_to_insert)

    for rule in rules:
        tags = rule["tags"]
        # NOTE(review): one tag_to_insert object is created per rule but
        # appended for every matched tag — a bs4 Tag has a single parent,
        # so a second match would move it; confirm rules match at most once
        # or that this relocation is intended.
        tag_to_insert = \
            chapter_tag.new_tag(rule["tag_to_insert"])
        if rule["condition"]:
            # only conditions with truthy values participate
            for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                if condition_on_tag[0] == 'parent_tags':
                    # insert only when the tag's parent matches the selector
                    for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                        if tag.parent.select(condition_on_tag[1]):
                            insert(tag, tag_to_insert)
                elif condition_on_tag[0] == 'child_tags':
                    # selector looks like a :not(...) expression: strip the
                    # pseudo-class syntax and insert only when no such child exists
                    for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                        if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
                            insert(tag, tag_to_insert)
                elif condition_on_tag[0] == "attrs":
                    # match tags by name regex plus attribute-value regex
                    for attr in rule["condition"]["attrs"]:
                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                        {attr["name"]: re.compile(fr"{attr['value']}")}):
                            insert(tag, tag_to_insert)
        else:
            # unconditional rule: apply to every tag matching the name regexes
            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                insert(tag, tag_to_insert)
def _remove_headings_content(self, content_tag, title_of_chapter: str):
""" """
Function Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
@@ -272,15 +287,15 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
if title_of_chapter == text or \ if title_of_chapter == text or \
(title_of_chapter in text and (title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or content_tag.name)): re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
_add_span_to_save_ids_for_links(tag, content_tag) self._add_span_to_save_ids_for_links(tag, content_tag)
tag.extract() tag.extract()
return return
elif not isinstance(tag, NavigableString): elif not isinstance(tag, NavigableString):
if not _remove_headings_content(tag, title_of_chapter): if not self._remove_headings_content(tag, title_of_chapter):
break break
@staticmethod
def _process_table(chapter_tag: BeautifulSoup): def _process_tables(chapter_tag: BeautifulSoup):
""" """
Function preprocesses tables and tags(td|th|tr) Function preprocesses tables and tags(td|th|tr)
Parameters Parameters
@@ -316,37 +331,8 @@ def _process_table(chapter_tag: BeautifulSoup):
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
table.attrs["border"] = "1" table.attrs["border"] = "1"
@staticmethod
def _insert_tags_in_parents(chapter_tag): def _class_removing(chapter_tag):
"""
Function inserts tags into correspond tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with inserted tags
"""
parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
for parent_tag_name, condition in parent_tag2condition.items():
for parent_tag in chapter_tag.select(parent_tag_name):
if parent_tag.select(condition):
continue
else:
tag_to_insert = chapter_tag.new_tag(
LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
# insert all items that was in pre to code and remove from pre
for content in reversed(parent_tag.contents):
tag_to_insert.insert(0, content.extract())
# wrap code with items
parent_tag.append(tag_to_insert)
def _class_removing(chapter_tag):
""" """
Function removes classes that aren't created by converter Function removes classes that aren't created by converter
Parameters Parameters
@@ -365,8 +351,7 @@ def _class_removing(chapter_tag):
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"] del tag.attrs["class"]
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
""" """
Function finalise processing/cleaning content Function finalise processing/cleaning content
Parameters Parameters
@@ -381,12 +366,12 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
---------- ----------
1. comments removal 1. comments removal
2. wrap NavigableString with tag <p> 2. wrap NavigableString with tag <p>
3. wrap tags with <table> 3-6. wrap tags with <table>
4. replace tags with correspond LiveCarta tags replace tags with correspond LiveCarta tags
5. unwrap tags unwrap tags
6. heading removal insert tags into correspond tags
7. process_table 7. heading removal
8. insert tags into correspond tags 8. process_tables
9. class removal 9. class removal
Returns Returns
@@ -396,24 +381,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
""" """
# 1. remove comments # 1. remove comments
_remove_comments(content_tag) self._remove_comments(content_tag)
# 2. # 2.
_wrap_strings_with_p(content_tag) self._wrap_strings_with_p(content_tag)
# 3. # 3-6.
_wrap_tags_with_table(content_tag) for dict in self.preset:
# 4. func = self.name2function[dict["preset_name"]]
_tags_to_correspond_livecarta_tag(content_tag) func(content_tag, dict['rules'])
# 5.
_unwrap_tags(content_tag)
# 6.
if remove_title_from_chapter:
_remove_headings_content(content_tag, title_str)
# 7. # 7.
_process_table(content_tag) if remove_title_from_chapter:
self._remove_headings_content(content_tag, title_str)
# 8. # 8.
_insert_tags_in_parents(content_tag) self._process_tables(content_tag)
# 9. remove classes that weren't created by converter # 9. remove classes that weren't created by converter
_class_removing(content_tag) self._class_removing(content_tag)
return str(content_tag) return str(content_tag)

15
src/preset_processor.py Normal file
View File

@@ -0,0 +1,15 @@
import json
from src.util.helpers import BookLogger
class PresetProcessor:
    """Loads the JSON preset configuration that drives HTML preprocessing."""

    def __init__(self, preset_path="config/presets.json", logger=None):
        """
        Parameters
        ----------
        preset_path: str
            path to the JSON presets file
        logger: BookLogger, optional
            project logger for diagnostic messages
        """
        self.preset_path = preset_path
        self.logger: BookLogger = logger

    def get_preset_json(self):
        """
        Read and parse the preset file.

        Returns
        -------
        The deserialized JSON content (a list of preset rule dicts).

        Raises
        ------
        OSError / json.JSONDecodeError
            If the file is missing or is not valid JSON.
        """
        # Use a context manager so the file handle is always closed —
        # the original opened the file and never closed it, leaking the
        # handle (especially on a JSON parse error).
        with open(self.preset_path, encoding="utf-8") as preset_file:
            return json.load(preset_file)