add processing of JSON presets

This commit is contained in:
Kiryl
2022-07-07 19:32:24 +03:00
parent 687c09417a
commit c4752a19db
5 changed files with 497 additions and 417 deletions

View File

@@ -1,5 +1,4 @@
import os
import logging
import pathlib
from shutil import copyfile

View File

@@ -4,33 +4,34 @@ import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
prepare_title, prepare_content
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter:
def __init__(self, file_path, access=None, logger=None):
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
self.file_path = file_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path)
self.css_processor = css_preprocessor
self.html_preprocessor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -74,25 +75,15 @@ class EpubConverter:
self.process_inline_styles_in_html_soup()
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log("CSS styles adding.")
self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
# todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup)
self.footnotes_contents.extend(content)
self.noterefs.extend(noterefs)
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.footnotes_contents, self.noterefs, self.footnotes =\
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style)
self.css_processor.build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
@@ -181,16 +173,53 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content(
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content(
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function adds styles from .css to inline style.

    Uses premailer's ``transform`` to merge the given CSS rules into each
    tag's ``style`` attribute, then post-processes every styled tag with
    ``TagInlineStyleProcessor``.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        soup with styles from css
    """
    # remove this specification because it causes problems
    # (premailer's CSS parser chokes on the epub @namespace declaration)
    css_text = css_text.replace(
        '@namespace epub "http://www.idpf.org/2007/ops";', '')
    # here we add css styles to inline style; network access and external
    # styles are disabled so the transform stays purely local
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
                                          allow_network=False,
                                          disable_validation=True,
                                          )
    # soup with converted styles from css
    inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
    # only tags whose names match the LiveCarta whitelist and that actually
    # carry a non-empty-or-present style attribute are normalized
    tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
                                                  attrs={"style": re.compile(".*")})
    # go through the tags with inline style + style parsed from css file
    for tag_inline_style in tags_with_inline_style:
        style_converter = TagInlineStyleProcessor(tag_inline_style)
        style_converter.convert_initial_tag()
    return inline_soup
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css)
html_content = self.convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
@staticmethod
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags

    Raises
    ------
    ValueError
        If no tag with ``first_id`` and class "converter-chapter-mark" is found.
    """
    marked_tags = html_soup.find(
        attrs={"id": first_id, "class": "converter-chapter-mark"})
    if not marked_tags:
        # was `assert 0, ...` — asserts vanish under -O and would leave
        # `tags` unbound; raise explicitly instead
        raise ValueError(f"Warning: no match for {first_id, href}")
    next_tag = marked_tags.next_sibling
    tags = []
    while next_tag:
        # bs4 stores the multi-valued `class` attribute as a list of tokens,
        # so test membership — comparing the list to the bare string was
        # always False and the loop never stopped at the next chapter mark
        if not isinstance(next_tag, NavigableString) and \
                "converter-chapter-mark" in (next_tag.attrs.get("class") or []):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(
subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags:
new_tree.append(tag)
for subchapter_tag in subchapter_tags:
new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
self.detect_one_chapter(point)
for tl_nav_point in top_level_nav_points:
self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
@@ -561,9 +632,9 @@ class EpubConverter:
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
title_preprocessed = self.html_preprocessor.prepare_title(title)
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
@@ -598,11 +669,17 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = "../../epub/9781641050234.epub"
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object)
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:

View File

@@ -1,4 +1,7 @@
from src.book_solver import BookSolver
from src.preset_processor import PresetProcessor
from src.epub_converter.css_preprocessor import CSSPreprocessor
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
Function
Steps
----------
1. Converts .epub to .html
2. Parses from line structure to nested structure
1. Gets data from preset structure
2. Add preset to html preprocessor
3. Converts .epub to .html
4. Parses from line structure to nested structure
Returns
----------
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
.get_preset_json()
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object)
self.file_path, access=self.access, logger=self.logger_object,
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
return content_dict

View File

@@ -1,419 +1,398 @@
import re
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
chapter_tag: BeautifulSoup
class HtmlEpubPreprocessor:
def __init__(self, preset, logger=None):
    """
    HTML preprocessor driven by JSON preset rules.

    Parameters
    ----------
    preset: list
        parsed preset entries; each entry carries a "preset_name" used to
        look up a transformation in ``self.name2function``
        (assumes the structure produced by PresetProcessor — TODO confirm)
    logger: BookLogger
        project logger (optional)
    """
    self.preset = preset
    self.logger: BookLogger = logger
    # dispatch table: preset "preset_name" -> transformation method
    self.name2function = {
        "table_wrapper": self._wrap_tags_with_table,
        "replacer": self._tags_to_correspond_livecarta_tag,
        "unwrapper": self._unwrap_tags,
        "inserter": self._insert_tags_into_correspond_tags
    }
Returns
-------
None
updated body tag
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
chapter_tag: BeautifulSoup
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
Returns
-------
None
updated body tag
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
class_: list):
"""Function inserts span before tag aren't supported by LiveCarta"""
new_tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or ""
new_tag.attrs["class"] = class_ or ""
new_tag.string = "\xa0"
tag_to_be_removed.insert_before(new_tag)
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
Parameters
----------
first_id: str
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
href: str
Name of current chapters file
html_soup: Tag
Soup object of current file
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
Returns
-------
tags: list [Tag, NavigableString]
Chapter's tags
@staticmethod
def prepare_title(title_of_chapter: str) -> str:
"""
Function finalise processing/cleaning title
Parameters
----------
title_of_chapter: str
"""
marked_tags = html_soup.find(
attrs={"id": first_id, "class": "converter-chapter-mark"})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
if not isinstance(next_tag, NavigableString) and \
(next_tag.attrs.get("class") == "converter-chapter-mark"):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
Returns
-------
title: str
cleaned title
# remove tags between first_id and next found id
# save them in list for next steps
tags = [tag.extract() for tag in tags]
html_soup.smooth()
"""
title = BeautifulSoup(title_of_chapter, features="lxml").string
# clean extra whitespace characters ([\r\n\t\f\v ])
title = re.sub(r"[\s\xa0]", " ", title).strip()
return title
else:
assert 0, f"Warning: no match for {first_id, href}"
@staticmethod
def _remove_comments(chapter_tag):
"""
Function remove comments
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
return tags
Returns
-------
None
Chapter Tag without comments
"""
for tag in chapter_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
def prepare_title(title_of_chapter: str) -> str:
"""
Function finalise processing/cleaning title
Parameters
----------
title_of_chapter: str
@staticmethod
def _wrap_strings_with_p(chapter_tag):
"""
Function converts headings that aren't supported by LiveCarta with <p>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
title: str
cleaned title
Returns
-------
None
Chapter Tag with wrapped NavigableStrings
"""
title = BeautifulSoup(title_of_chapter, features="lxml").string
# clean extra whitespace characters ([\r\n\t\f\v ])
title = re.sub(r"[\s\xa0]", " ", title).strip()
return title
"""
for node in chapter_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r"([\s\xa0])", " ", content).strip()
if content:
p_tag = chapter_tag.new_tag("p")
p_tag.append(str(node))
node.replace_with(p_tag)
def _wrap_tags_with_table(self, chapter_tag, rules: list):
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
def _remove_comments(chapter_tag):
"""
Function remove comments
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped certain tags with <table>
Returns
-------
None
Chapter Tag without comments
"""
"""
for tag in chapter_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
td.attrs["bgcolor"] = bg_color
tag_to_be_wrapped.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
def process_tag_using_table(tag_to_wrap):
_wrap_tag_with_table(
chapter_tag,
tag_to_be_wrapped=tag_to_wrap,
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap()
def _wrap_strings_with_p(chapter_tag):
"""
Function converts headings that aren't supported by LiveCarta with <p>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped NavigableStrings
"""
for node in chapter_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r"([\s\xa0])", " ", content).strip()
if content:
p_tag = chapter_tag.new_tag("p")
p_tag.append(str(node))
node.replace_with(p_tag)
def _wrap_tags_with_table(chapter_tag):
"""
Function wraps <tag> with <table>
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with wrapped certain tags with <table>
"""
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
table = chapter_tag.new_tag("table")
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
= border, "center", f"width:{width}%;"
tbody, tr, td = \
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
td.attrs["bgcolor"] = bg_color
tag_to_be_wrapped.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
def process_tag_using_table(tag_to_wrap):
_wrap_tag_with_table(
chapter_tag,
tag_to_be_wrapped=tag_to_wrap,
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
_add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
tag_to_wrap.unwrap()
for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
if isinstance(attrs, tuple):
attr, val = attrs[0], attrs[1]
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
process_tag_using_table(tag_to_wrap)
else:
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
for rule in rules:
tags = rule["tags"]
for attr in rule["attrs"]:
for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
process_tag_using_table(tag_to_wrap)
@staticmethod
def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
"""
Function to replace all tags to correspond LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
def _tags_to_correspond_livecarta_tag(chapter_tag):
"""
Function to replace all tags to correspond LiveCarta tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with all tags replaced with LiveCarta tags
Returns
-------
None
Chapter Tag with all tags replaced with LiveCarta tags
"""
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
for key in reg_keys:
if isinstance(key, tuple):
replace = key[0]
parent, child = key[1], key[2]
for parent_tag in chapter_tag.select(parent):
if replace == "parent":
parent_tag.name = to_replace_value
elif replace == "child":
for child_tag in parent_tag.select(child):
child_tag.name = to_replace_value
if not child_tag.attrs.get("style"):
child_tag.attrs["style"] =\
"font-size: 14px; font-family: courier new,courier,monospace;"
"""
for rule in rules:
tags = rule["tags"]
tag_to_replace = rule["tag_to_replace"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if tag.parent.select(condition_on_tag[1]):
tag.name = tag_to_replace
elif condition_on_tag[0] == 'child_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
tag.name = tag_to_replace
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
tag.name = tag_to_replace
else:
tags = chapter_tag.find_all(re.compile(key))
for tag in tags:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
tag.name = to_replace_value
tag.name = tag_to_replace
def _unwrap_tags(self, chapter_tag, rules: dict):
"""
Function unwrap tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
def _unwrap_tags(chapter_tag):
"""
Function unwrap tags and moves id to span
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with unwrapped certain tags
Returns
-------
None
Chapter Tag with unwrapped certain tags
"""
for tag_name in rules["tags"]:
for tag in chapter_tag.select(tag_name):
# if tag is a subtag
if ">" in tag_name:
tag.parent.attrs.update(tag.attrs)
self._add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap()
"""
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
for tag in chapter_tag.select(tag_name):
# if tag is a subtag
if ">" in tag_name:
tag.parent.attrs.update(tag.attrs)
_add_span_to_save_ids_for_links(tag, chapter_tag)
tag.unwrap()
@staticmethod
def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
"""
Function inserts tags into correspond tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with inserted tags
def _remove_headings_content(content_tag, title_of_chapter: str):
"""
Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
content_tag: soup object
Tag of the page
title_of_chapter: str
Chapter title
"""
Returns
-------
None
clean/remove headings & add span with id
def insert(tag, tag_to_insert):
# insert all items that was in tag to subtag and remove from tag
for content in reversed(tag.contents):
tag_to_insert.insert(0, content.extract())
# wrap subtag with items
tag.append(tag_to_insert)
"""
title_of_chapter = title_of_chapter.lower()
for tag in content_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
_add_span_to_save_ids_for_links(tag, content_tag)
tag.extract()
return
elif not isinstance(tag, NavigableString):
if not _remove_headings_content(tag, title_of_chapter):
break
def _process_table(chapter_tag: BeautifulSoup):
"""
Function preprocesses tables and tags(td|th|tr)
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with processed tables
"""
tables = chapter_tag.find_all("table")
for table in tables:
for t_tag in table.find_all(re.compile("td|th|tr")):
width = ""
if t_tag.get("style"):
width_match = re.search(
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
if width_match:
size = width_match.group(1)
width = size + "px"
t_tag.attrs["width"] = t_tag.get("width") or width
if t_tag.attrs.get("style"):
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
"border:0;", "")
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
del t_tag.attrs["style"]
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
table.attrs["border"] = "1"
def _insert_tags_in_parents(chapter_tag):
"""
Function inserts tags into correspond tags
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with inserted tags
"""
parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
for parent_tag_name, condition in parent_tag2condition.items():
for parent_tag in chapter_tag.select(parent_tag_name):
if parent_tag.select(condition):
continue
for rule in rules:
tags = rule["tags"]
tag_to_insert = \
chapter_tag.new_tag(rule["tag_to_insert"])
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == 'parent_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if tag.parent.select(condition_on_tag[1]):
insert(tag, tag_to_insert)
elif condition_on_tag[0] == 'child_tags':
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
insert(tag, tag_to_insert)
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
insert(tag, tag_to_insert)
else:
tag_to_insert = chapter_tag.new_tag(
LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
# insert all items that was in pre to code and remove from pre
for content in reversed(parent_tag.contents):
tag_to_insert.insert(0, content.extract())
# wrap code with items
parent_tag.append(tag_to_insert)
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
insert(tag, tag_to_insert)
def _remove_headings_content(self, content_tag, title_of_chapter: str):
"""
Function
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- adds span with id in order to
Parameters
----------
content_tag: soup object
Tag of the page
title_of_chapter: str
Chapter title
def _class_removing(chapter_tag):
"""
Function removes classes that aren't created by converter
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
clean/remove headings & add span with id
Returns
-------
None
Chapter Tag without original classes of the book
"""
title_of_chapter = title_of_chapter.lower()
for tag in content_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
self._add_span_to_save_ids_for_links(tag, content_tag)
tag.extract()
return
elif not isinstance(tag, NavigableString):
if not self._remove_headings_content(tag, title_of_chapter):
break
"""
for tag in chapter_tag.find_all(recursive=True):
if tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup):
"""
Function preprocesses tables and tags(td|th|tr)
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
Returns
-------
None
Chapter Tag with processed tables
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
"""
Function finalise processing/cleaning content
Parameters
----------
title_str: str
"""
tables = chapter_tag.find_all("table")
for table in tables:
for t_tag in table.find_all(re.compile("td|th|tr")):
width = ""
if t_tag.get("style"):
width_match = re.search(
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
if width_match:
size = width_match.group(1)
width = size + "px"
content_tag: Tag, soup object
t_tag.attrs["width"] = t_tag.get("width") or width
remove_title_from_chapter: bool
if t_tag.attrs.get("style"):
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
"border:0;", "")
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
del t_tag.attrs["style"]
Steps
----------
1. comments removal
2. wrap NavigableString with tag <p>
3. wrap tags with <table>
4. replace tags with correspond LiveCarta tags
5. unwrap tags
6. heading removal
7. process_table
8. insert tags into correspond tags
9. class removal
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
table.attrs["border"] = "1"
Returns
-------
content_tag: str
prepared content
@staticmethod
def _class_removing(chapter_tag):
"""
Function removes classes that aren't created by converter
Parameters
----------
chapter_tag: BeautifulSoup
Tag & contents of the chapter tag
"""
# 1. remove comments
_remove_comments(content_tag)
Returns
-------
None
Chapter Tag without original classes of the book
# 2.
_wrap_strings_with_p(content_tag)
# 3.
_wrap_tags_with_table(content_tag)
# 4.
_tags_to_correspond_livecarta_tag(content_tag)
# 5.
_unwrap_tags(content_tag)
# 6.
if remove_title_from_chapter:
_remove_headings_content(content_tag, title_str)
# 7.
_process_table(content_tag)
# 8.
_insert_tags_in_parents(content_tag)
"""
for tag in chapter_tag.find_all(recursive=True):
if tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
# 9. remove classes that weren't created by converter
_class_removing(content_tag)
return str(content_tag)
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content

    Parameters
    ----------
    title_str: str
        chapter title used for heading de-duplication
    content_tag: Tag, soup object
        chapter content to be cleaned in place
    remove_title_from_chapter: bool
        whether to strip the duplicated chapter heading from the content

    Steps
    ----------
    1. comments removal
    2. wrap NavigableString with tag <p>
    3-6. preset-driven transformations, in preset order:
         wrap tags with <table>; replace tags with correspond LiveCarta tags;
         unwrap tags; insert tags into correspond tags
    7. heading removal
    8. process_tables
    9. class removal

    Returns
    -------
    content_tag: str
        prepared content
    """
    # 1. remove comments
    self._remove_comments(content_tag)
    # 2.
    self._wrap_strings_with_p(content_tag)
    # 3-6. apply each preset rule set via the dispatch table
    # (loop variable renamed: `dict` shadowed the builtin)
    for preset_entry in self.preset:
        preprocess = self.name2function[preset_entry["preset_name"]]
        preprocess(content_tag, preset_entry["rules"])
    # 7.
    if remove_title_from_chapter:
        self._remove_headings_content(content_tag, title_str)
    # 8.
    self._process_tables(content_tag)
    # 9. remove classes that weren't created by converter
    self._class_removing(content_tag)
    return str(content_tag)

15
src/preset_processor.py Normal file
View File

@@ -0,0 +1,15 @@
import json
from src.util.helpers import BookLogger
class PresetProcessor:
    """Loads the JSON preset configuration that drives the HTML preprocessing rules."""

    def __init__(self, preset_path="config/presets.json", logger=None):
        """
        Parameters
        ----------
        preset_path: str
            path to the JSON presets file
        logger: BookLogger
            project logger (optional)
        """
        self.preset_path = preset_path
        self.logger: BookLogger = logger

    def get_preset_json(self):
        """
        Read and parse the presets file.

        Returns
        -------
        data
            the deserialized JSON content of ``self.preset_path``
        """
        # context manager closes the handle even if json.load raises;
        # the original `f = open(...)` never closed the file
        with open(self.preset_path, encoding="utf-8") as f:
            return json.load(f)