diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py
index 923a274..e593312 100644
--- a/src/docx_converter/image_processing.py
+++ b/src/docx_converter/image_processing.py
@@ -1,5 +1,4 @@
import os
-import logging
import pathlib
from shutil import copyfile
diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py
index 1ecc7a1..525fad3 100644
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -4,33 +4,34 @@ import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
+from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
-
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
-from bs4 import BeautifulSoup, Tag
-
+from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
+from src.preset_processor import PresetProcessor
+from src.epub_converter.css_preprocessor import CSSPreprocessor
+from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
-from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
-from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
-from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
- prepare_title, prepare_content
+from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter:
- def __init__(self, file_path, access=None, logger=None):
+ def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
self.file_path = file_path
self.access = access
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path)
+ self.css_processor = css_preprocessor
+ self.html_preprocessor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
@@ -74,25 +75,15 @@ class EpubConverter:
self.process_inline_styles_in_html_soup()
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
- self.logger.log("CSS styles adding.")
+ self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
- # todo presets
-
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
- content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
- self.html_href2html_body_soup)
- self.footnotes_contents.extend(content)
- self.noterefs.extend(noterefs)
- self.footnotes.extend(footnotes_tags)
-
- for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
- noteref.attrs["data-id"] = i + 1
- noteref.attrs["id"] = f"footnote-{i + 1}"
- footnote.attrs["href"] = f"#footnote-{i + 1}"
-
+ self.footnotes_contents, self.noterefs, self.footnotes =\
+ preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
+
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
@@ -101,6 +92,7 @@ class EpubConverter:
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f"Html documents not added to TOC: {not_added}.")
+ self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
@@ -149,7 +141,7 @@ class EpubConverter:
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
- build_inline_style_content(inline_style)
+ self.css_processor.build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
@@ -181,16 +173,53 @@ class EpubConverter:
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
- css_href2css_content[css_href] = build_css_file_content(
+ css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f"href{i}")
- css_href2css_content[f"href{i}"] = build_css_file_content(
+ css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
+    def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
+        """
+        Function moves the rules of a stylesheet into the inline styles of an html page
+        Parameters
+        ----------
+        html_soup: BeautifulSoup
+            html page, its tags may already carry inline styles
+        css_text: str
+            css text that applies to the page
+
+        Returns
+        -------
+        inline_soup: BeautifulSoup
+            new soup (parsed with lxml) whose styled tags were processed by TagInlineStyleProcessor
+
+        """
+        # the epub namespace declaration causes problems, so it is dropped first
+        css_without_namespace = css_text.replace(
+            '@namespace epub "http://www.idpf.org/2007/ops";', '')
+        # premailer's transform merges the css rules into the style attributes
+        transform_options = {
+            "remove_classes": False,
+            "external_styles": False,
+            "allow_network": False,
+            "disable_validation": True,
+        }
+        inlined_html: str = transform(
+            str(html_soup), css_text=css_without_namespace, **transform_options)
+        inline_soup = BeautifulSoup(inlined_html, features="lxml")
+
+        # tags matched by the LiveCarta regexp that ended up with a style attribute
+        styled_tags = inline_soup.find_all(
+            LiveCartaConfig.could_have_style_in_livecarta_regexp,
+            attrs={"style": re.compile(".*")})
+        for styled_tag in styled_tags:
+            TagInlineStyleProcessor(styled_tag).convert_initial_tag()
+        return inline_soup
+
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
@@ -203,7 +232,7 @@ class EpubConverter:
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
- html_content = convert_html_soup_with_css_style(html_content, css)
+ html_content = self.convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -488,6 +517,48 @@ class EpubConverter:
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
+    @staticmethod
+    def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
+        """
+        Function extracts from html_soup all nodes that belong to the chapter
+        which starts at the mark with id first_id
+        Parameters
+        ----------
+        first_id: str
+            Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
+        href: str
+            Name of current chapters file, used only in the error message
+        html_soup: Tag
+            Soup object of current file, the chapter nodes are removed from it
+
+        Returns
+        -------
+        tags: list [Tag, NavigableString]
+            Chapter's tags
+
+        Raises
+        ------
+        AssertionError
+            if html_soup has no chapter mark with id first_id
+
+        """
+        chapter_mark = html_soup.find(
+            attrs={"id": first_id, "class": "converter-chapter-mark"})
+        if chapter_mark is None:
+            # raised explicitly: an assert statement is stripped under "python -O",
+            # which would let the function run on to an unbound result
+            raise AssertionError(f"Warning: no match for {first_id, href}")
+
+        # collect the siblings up to the next chapter mark
+        tags = []
+        next_tag = chapter_mark.next_sibling
+        while next_tag:
+            if not isinstance(next_tag, NavigableString) and \
+                    (next_tag.attrs.get("class") == "converter-chapter-mark"):
+                break
+            tags.append(next_tag)
+            next_tag = next_tag.next_sibling
+
+        # detach the nodes only after the walk, extracting a node drops its sibling links
+        tags = [tag.extract() for tag in tags]
+        html_soup.smooth()
+        return tags
+
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -511,11 +582,11 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
- chapter_tags = get_tags_between_chapter_marks(
+ subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup("", "html.parser")
- for tag in chapter_tags:
- new_tree.append(tag)
+ for subchapter_tag in subchapter_tags:
+ new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
@@ -527,8 +598,8 @@ class EpubConverter:
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
- for point in top_level_nav_points:
- self.detect_one_chapter(point)
+ for tl_nav_point in top_level_nav_points:
+ self.detect_one_chapter(tl_nav_point)
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
@@ -561,9 +632,9 @@ class EpubConverter:
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
- title_preprocessed = prepare_title(title)
- content_preprocessed = prepare_content(title_preprocessed, content,
- remove_title_from_chapter=is_chapter)
+ title_preprocessed = self.html_preprocessor.prepare_title(title)
+ content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
+ remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
@@ -598,11 +669,17 @@ class EpubConverter:
if __name__ == "__main__":
- epub_file_path = "../../epub/9781641050234.epub"
+ epub_file_path = "../../epub/Modern_Java_in_Action.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
- json_converter = EpubConverter(epub_file_path, logger=logger_object)
+ preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
+ .get_preset_json()
+ css_preprocessor = CSSPreprocessor(logger=logger_object)
+ html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
+
+ json_converter = EpubConverter(epub_file_path, logger=logger_object,
+ css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py
index 8e92a40..c1bb800 100644
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -1,4 +1,7 @@
from src.book_solver import BookSolver
+from src.preset_processor import PresetProcessor
+from src.epub_converter.css_preprocessor import CSSPreprocessor
+from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
Function
Steps
----------
- 1. Converts .epub to .html
- 2. Parses from line structure to nested structure
+ 1. Gets data from preset structure
+ 2. Add preset to html preprocessor
+ 3. Converts .epub to .html
+ 4. Parses from line structure to nested structure
Returns
----------
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
+ preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
+ .get_preset_json()
+ css_preprocessor = CSSPreprocessor(logger=self.logger_object)
+ html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
json_converter = EpubConverter(
- self.file_path, access=self.access, logger=self.logger_object)
+ self.file_path, access=self.access, logger=self.logger_object,
+ css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
content_dict = json_converter.convert_to_dict()
return content_dict
diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py
index 3f762b4..3ddc532 100644
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -1,419 +1,398 @@
import re
+from bs4 import BeautifulSoup, NavigableString, Comment, Tag
-from bs4 import BeautifulSoup, NavigableString, Tag, Comment
-
-from src.livecarta_config import LiveCartaConfig
+from src.util.helpers import BookLogger
-def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
- """
- Function adds span with id from tag_to_be_removed
- because this tag will be removed(unwrapped/extract)
- Parameters
- ----------
- tag_to_be_removed: Soup object
- chapter_tag: BeautifulSoup
+class HtmlEpubPreprocessor:
+    def __init__(self, preset, logger=None):
+        """
+        Parameters
+        ----------
+        preset:
+            iterable of preset entries, prepare_content reads "preset_name" and "rules" from each
+        logger: BookLogger
+            logger of the current book
+
+        """
+        self.logger: BookLogger = logger
+        self.preset = preset
+        # preset_name -> method that applies the rules of that preset entry
+        self.name2function = dict(
+            table_wrapper=self._wrap_tags_with_table,
+            replacer=self._tags_to_correspond_livecarta_tag,
+            unwrapper=self._unwrap_tags,
+            inserter=self._insert_tags_into_correspond_tags,
+        )
- Returns
- -------
- None
- updated body tag
+    @staticmethod
+    def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
+        """
+        Function adds span with id from tag_to_be_removed
+        because this tag will be removed(unwrapped/extract)
+        Parameters
+        ----------
+        tag_to_be_removed: Soup object
+            node that is about to be removed, nodes without an id are left untouched
+        chapter_tag: BeautifulSoup
+            soup used to create the new span
+
+        Returns
+        -------
+        None
+            span with the id (and class) of the node is inserted right before the node
+
+        """
- """
- def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
- """Function inserts span before tag aren't supported by LiveCarta"""
- new_tag = chapter_tag.new_tag("span")
- new_tag.attrs["id"] = id_ or ""
- new_tag.attrs["class"] = class_ or ""
- new_tag.string = "\xa0"
- tag_to_be_removed.insert_before(new_tag)
- if tag_to_be_removed.attrs.get("id"):
- _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
- id_=tag_to_be_removed.attrs["id"],
- class_=tag_to_be_removed.attrs.get("class"))
-def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
- """
- After processing on a first_id that corresponds to current chapter,
- from initial html_soup all tags from current chapter are extracted
- Parameters
- ----------
- first_id: str
- Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
- href: str
- Name of current chapters file
- html_soup: Tag
- Soup object of current file
+        # _remove_headings_content may pass a text node here; such nodes may not
+        # expose attrs, so the attribute is read defensively
+        attrs = getattr(tag_to_be_removed, "attrs", None) or {}
+        anchor_id = attrs.get("id")
+        if not anchor_id:
+            return
+        anchor = chapter_tag.new_tag("span")
+        anchor.attrs["id"] = anchor_id
+        anchor.attrs["class"] = attrs.get("class") or ""
+        # the span gets a non-breaking space as its only content
+        anchor.string = "\xa0"
+        tag_to_be_removed.insert_before(anchor)
- Returns
- -------
- tags: list [Tag, NavigableString]
- Chapter's tags
+    @staticmethod
+    def prepare_title(title_of_chapter: str) -> str:
+        """
+        Function finalise processing/cleaning title
+        Parameters
+        ----------
+        title_of_chapter: str
+            title as found in the book, may contain markup
+
+        Returns
+        -------
+        title: str
+            cleaned title
+
+        """
- """
- marked_tags = html_soup.find(
- attrs={"id": first_id, "class": "converter-chapter-mark"})
- if marked_tags:
- next_tag = marked_tags.next_sibling
- tags = []
- while next_tag:
- if not isinstance(next_tag, NavigableString) and \
- (next_tag.attrs.get("class") == "converter-chapter-mark"):
- break
- tags.append(next_tag)
- next_tag = next_tag.next_sibling
- # remove tags between first_id and next found id
- # save them in list for next steps
- tags = [tag.extract() for tag in tags]
- html_soup.smooth()
+        # get_text() instead of .string: .string is None for an empty title or for
+        # a title with several child nodes, which made re.sub raise TypeError
+        title = BeautifulSoup(title_of_chapter, features="lxml").get_text()
+        # clean extra whitespace characters ([\r\n\t\f\v ])
+        title = re.sub(r"[\s\xa0]", " ", title).strip()
+        return title
- else:
- assert 0, f"Warning: no match for {first_id, href}"
+    @staticmethod
+    def _remove_comments(chapter_tag):
+        """
+        Function removes the comments found inside the tags of the chapter
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag without comments
+
+        """
- return tags
+        def is_comment(node):
+            return isinstance(node, Comment)
+
+        # the search runs from every tag of the chapter
+        for tag in chapter_tag.find_all():
+            for comment in tag.find_all(text=is_comment):
+                comment.extract()
-def prepare_title(title_of_chapter: str) -> str:
- """
- Function finalise processing/cleaning title
- Parameters
- ----------
- title_of_chapter: str
+    @staticmethod
+    def _wrap_strings_with_p(chapter_tag):
+        """
+        Function wraps the text nodes that are direct children of the chapter with p tags
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag with wrapped NavigableStrings
+
+        """
- Returns
- -------
- title: str
- cleaned title
- """
- title = BeautifulSoup(title_of_chapter, features="lxml").string
- # clean extra whitespace characters ([\r\n\t\f\v ])
- title = re.sub(r"[\s\xa0]", " ", title).strip()
- return title
+        for node in chapter_tag:
+            if not isinstance(node, NavigableString):
+                continue
+            # nodes made only of whitespace stay as they are
+            visible_text = re.sub(r"([\s\xa0])", " ", str(node)).strip()
+            if not visible_text:
+                continue
+            paragraph = chapter_tag.new_tag("p")
+            # the original text goes into the paragraph, not the stripped one
+            paragraph.append(str(node))
+            node.replace_with(paragraph)
+    def _wrap_tags_with_table(self, chapter_tag, rules: list):
+        """
+        Function wraps the tags named by the rules with a one-cell table
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            rules of the preset, each has "tags" (name patterns) and
+            "attrs" (list of {"name", "value"} attribute patterns)
+
+        Returns
+        -------
+        None
+            Chapter Tag with wrapped certain tags with table
+
+        """
-def _remove_comments(chapter_tag):
- """
- Function remove comments
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- Returns
- -------
- None
- Chapter Tag without comments
- """
- for tag in chapter_tag.find_all():
- for element in tag(text=lambda text: isinstance(text, Comment)):
- element.extract()
-def _wrap_strings_with_p(chapter_tag):
- """
- Function converts headings that aren't supported by LiveCarta with
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
-
- Returns
- -------
- None
- Chapter Tag with wrapped NavigableStrings
-
- """
- for node in chapter_tag:
- if isinstance(node, NavigableString):
- content = str(node)
- content = re.sub(r"([\s\xa0])", " ", content).strip()
- if content:
- p_tag = chapter_tag.new_tag("p")
- p_tag.append(str(node))
- node.replace_with(p_tag)
-
-
-def _wrap_tags_with_table(chapter_tag):
- """
- Function wraps with
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
-
- Returns
- -------
- None
- Chapter Tag with wrapped certain tags with
-
- """
- def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
- table = chapter_tag.new_tag("table")
- table.attrs["border"], table.attrs["align"], table.attrs["style"] \
- = border, "center", f"width:{width}%;"
- tbody, tr, td = \
- chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
- td.attrs["bgcolor"] = bg_color
- tag_to_be_wrapped.wrap(td)
- td.wrap(tr)
- tr.wrap(tbody)
- tbody.wrap(table)
- table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
- return table
-
- def process_tag_using_table(tag_to_wrap):
- _wrap_tag_with_table(
- chapter_tag,
- tag_to_be_wrapped=tag_to_wrap,
- width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
- border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
- bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
- _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
- tag_to_wrap.unwrap()
-
- for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
- if isinstance(attrs, tuple):
- attr, val = attrs[0], attrs[1]
- for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
- process_tag_using_table(tag_to_wrap)
- else:
- for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
- if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
+        def wrap_in_table(tag_to_wrap):
+            """Function puts the tag into table > tbody > tr > td and then unwraps the tag itself"""
+            tag_attrs = tag_to_wrap.attrs
+            # width, border and bgcolor of the table come from the wrapped tag
+            width = tag_attrs.get("width") or "100"
+            table = chapter_tag.new_tag("table")
+            table.attrs["border"] = tag_attrs.get("border") or None
+            table.attrs["align"] = "center"
+            table.attrs["style"] = f"width:{width}%;"
+            cell = chapter_tag.new_tag("td")
+            cell.attrs["bgcolor"] = tag_attrs.get("bgcolor") or None
+            # the table is built from the inside out around the tag
+            tag_to_wrap.wrap(cell)
+            row = chapter_tag.new_tag("tr")
+            cell.wrap(row)
+            body = chapter_tag.new_tag("tbody")
+            row.wrap(body)
+            body.wrap(table)
+            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
+            # the id of the tag is saved in a span before the tag is unwrapped
+            self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
+            tag_to_wrap.unwrap()
+
+        for rule in rules:
+            tags = rule["tags"]
+            for attr in rule["attrs"]:
+                name_patterns = [re.compile(tag) for tag in tags]
+                attr_pattern = {attr["name"]: re.compile(fr"{attr['value']}")}
+                for tag_to_wrap in chapter_tag.find_all(name_patterns, attr_pattern):
+                    wrap_in_table(tag_to_wrap)
+    @staticmethod
+    def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
+        """
+        Function to replace all tags to correspond LiveCarta tags
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            rules of the preset, each has "tags" (name patterns), "tag_to_replace" (new name)
+            and "condition" (falsy, or dict with "parent_tags"/"child_tags"/"attrs")
+
+        Returns
+        -------
+        None
+            Chapter Tag with all tags replaced with LiveCarta tags
+
+        """
-def _tags_to_correspond_livecarta_tag(chapter_tag):
- """
- Function to replace all tags to correspond LiveCarta tags
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- Returns
- -------
- None
- Chapter Tag with all tags replaced with LiveCarta tags
-
- """
- for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
- for key in reg_keys:
- if isinstance(key, tuple):
- replace = key[0]
- parent, child = key[1], key[2]
- for parent_tag in chapter_tag.select(parent):
- if replace == "parent":
- parent_tag.name = to_replace_value
- elif replace == "child":
- for child_tag in parent_tag.select(child):
- child_tag.name = to_replace_value
- if not child_tag.attrs.get("style"):
- child_tag.attrs["style"] =\
- "font-size: 14px; font-family: courier new,courier,monospace;"
- else:
- tags = chapter_tag.find_all(re.compile(key))
- for tag in tags:
- # todo can cause appearance of \n ...
-> \n
...
\n
(section)
- tag.name = to_replace_value
-
-
-def _unwrap_tags(chapter_tag):
- """
- Function unwrap tags and moves id to span
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
-
- Returns
- -------
- None
- Chapter Tag with unwrapped certain tags
-
- """
- for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
- for tag in chapter_tag.select(tag_name):
- # if tag is a subtag
- if ">" in tag_name:
- tag.parent.attrs.update(tag.attrs)
- _add_span_to_save_ids_for_links(tag, chapter_tag)
- tag.unwrap()
-
-
-def _remove_headings_content(content_tag, title_of_chapter: str):
- """
- Function
- - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
- - adds span with id in order to
- Parameters
- ----------
- content_tag: soup object
- Tag of the page
- title_of_chapter: str
- Chapter title
-
- Returns
- -------
- None
- clean/remove headings & add span with id
-
- """
- title_of_chapter = title_of_chapter.lower()
- for tag in content_tag.contents:
- text = tag if isinstance(tag, NavigableString) else tag.text
- if re.sub(r"[\s\xa0]", "", text):
- text = re.sub(r"[\s\xa0]", " ", text).lower()
- text = text.strip() # delete extra spaces
- if title_of_chapter == text or \
- (title_of_chapter in text and
- re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
- _add_span_to_save_ids_for_links(tag, content_tag)
- tag.extract()
- return
- elif not isinstance(tag, NavigableString):
- if not _remove_headings_content(tag, title_of_chapter):
- break
-
-
-def _process_table(chapter_tag: BeautifulSoup):
- """
- Function preprocesses tables and tags(td|th|tr)
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
-
- Returns
- -------
- None
- Chapter Tag with processed tables
-
- """
- tables = chapter_tag.find_all("table")
- for table in tables:
- for t_tag in table.find_all(re.compile("td|th|tr")):
- width = ""
- if t_tag.get("style"):
- width_match = re.search(
- r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
- if width_match:
- size = width_match.group(1)
- width = size + "px"
-
- t_tag.attrs["width"] = t_tag.get("width") or width
-
- if t_tag.attrs.get("style"):
- t_tag.attrs["style"] = t_tag.attrs["style"].replace(
- "border:0;", "")
- if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
- del t_tag.attrs["style"]
-
- if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
- table.attrs["border"] = "1"
-
-
-def _insert_tags_in_parents(chapter_tag):
- """
- Function inserts tags into correspond tags
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
-
- Returns
- -------
- None
- Chapter Tag with inserted tags
-
- """
- parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
- for parent_tag_name, condition in parent_tag2condition.items():
- for parent_tag in chapter_tag.select(parent_tag_name):
- if parent_tag.select(condition):
- continue
+        for rule in rules:
+            tags = rule["tags"]
+            tag_to_replace = rule["tag_to_replace"]
+            if rule["condition"]:
+                # only the conditions that have a value take part
+                for condition_name, condition_value in rule["condition"].items():
+                    if not condition_value:
+                        continue
+                    if condition_name == 'parent_tags':
+                        # rename when the selector finds something under the parent of the tag
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+                            if tag.parent.select(condition_value):
+                                tag.name = tag_to_replace
+                    elif condition_name == 'child_tags':
+                        # the selector is applied without "(", ")", ":" and "not",
+                        # the tag is renamed when nothing inside it matches
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+                            if not tag.select(re.sub('[():]|not', '', condition_value)):
+                                tag.name = tag_to_replace
+                    elif condition_name == "attrs":
+                        for attr in rule["condition"]["attrs"]:
+                            attr_pattern = {attr["name"]: re.compile(fr"{attr['value']}")}
+                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+                                                            attr_pattern):
+                                tag.name = tag_to_replace
else:
- tag_to_insert = chapter_tag.new_tag(
- LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
- # insert all items that was in pre to code and remove from pre
- for content in reversed(parent_tag.contents):
- tag_to_insert.insert(0, content.extract())
- # wrap code with items
- parent_tag.append(tag_to_insert)
+                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+                    # todo(review): renaming can cause "\n" nodes to appear around the tag; the markup
+                    # of the original note was lost, it mentioned "section" - confirm the exact case
+                    tag.name = tag_to_replace
+    def _unwrap_tags(self, chapter_tag, rules: dict):
+        """
+        Function unwrap tags and moves id to span
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: dict
+            rules of the preset, "tags" holds the css selectors of the tags to unwrap
+
+        Returns
+        -------
+        None
+            Chapter Tag with unwrapped certain tags
+
+        """
-def _class_removing(chapter_tag):
- """
- Function removes classes that aren't created by converter
- Parameters
- ----------
- chapter_tag: BeautifulSoup
- Tag & contents of the chapter tag
- Returns
- -------
- None
- Chapter Tag without original classes of the book
+        for selector in rules["tags"]:
+            # a selector with ">" points at a subtag, its attrs are handed over to the parent
+            selects_subtag = ">" in selector
+            for found_tag in chapter_tag.select(selector):
+                if selects_subtag:
+                    found_tag.parent.attrs.update(found_tag.attrs)
+                self._add_span_to_save_ids_for_links(found_tag, chapter_tag)
+                found_tag.unwrap()
- """
- for tag in chapter_tag.find_all(recursive=True):
- if tag.attrs.get("class") \
- and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
- del tag.attrs["class"]
+    @staticmethod
+    def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
+        """
+        Function inserts tags into correspond tags
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+        rules: list
+            rules of the preset, each has "tags" (name patterns), "tag_to_insert" (name of the
+            new inner tag) and "condition" (falsy, or dict with "parent_tags"/"child_tags"/"attrs")
+
+        Returns
+        -------
+        None
+            Chapter Tag with inserted tags
+
+        """
-def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
- """
- Function finalise processing/cleaning content
- Parameters
- ----------
- title_str: str
- content_tag: Tag, soup object
- remove_title_from_chapter: bool
+        def insert(tag, tag_name_to_insert):
+            # a fresh inner tag for every target: one shared Tag object would be moved from
+            # target to target and would carry the contents of the previous targets along
+            tag_to_insert = chapter_tag.new_tag(tag_name_to_insert)
+            # move all items that were in tag to the inner tag, the order is kept
+            for content in list(tag.contents):
+                tag_to_insert.append(content.extract())
+            # the inner tag becomes the only child of tag
+            tag.append(tag_to_insert)
+
+        for rule in rules:
+            tags = rule["tags"]
+            tag_name_to_insert = rule["tag_to_insert"]
+            if rule["condition"]:
+                # only the conditions that have a value take part
+                for condition_name, condition_value in rule["condition"].items():
+                    if not condition_value:
+                        continue
+                    if condition_name == 'parent_tags':
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+                            if tag.parent.select(condition_value):
+                                insert(tag, tag_name_to_insert)
+                    elif condition_name == 'child_tags':
+                        # the selector is applied without "(", ")", ":" and "not",
+                        # the inner tag is added when nothing inside tag matches
+                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+                            if not tag.select(re.sub('[():]|not', '', condition_value)):
+                                insert(tag, tag_name_to_insert)
+                    elif condition_name == "attrs":
+                        for attr in rule["condition"]["attrs"]:
+                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
+                                insert(tag, tag_name_to_insert)
+            else:
+                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+                    insert(tag, tag_name_to_insert)
- Steps
- ----------
- 1. comments removal
- 2. wrap NavigableString with tag
- 3. wrap tags with
- 4. replace tags with correspond LiveCarta tags
- 5. unwrap tags
- 6. heading removal
- 7. process_table
- 8. insert tags into correspond tags
- 9. class removal
+ def _remove_headings_content(self, content_tag, title_of_chapter: str):
+ """
+ Function
+ - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
+ - adds span with id of the removed node in order to keep the id in the content
+ Only the first matching node is removed, then the function returns
+ Parameters
+ ----------
+ content_tag: soup object
+ Tag of the page
+ title_of_chapter: str
+ Chapter title
- Returns
- -------
- content_tag: str
- prepared content
+ Returns
+ -------
+ None
+ clean/remove headings & add span with id
- """
- # 1. remove comments
- _remove_comments(content_tag)
+ """
+ # the title is compared in lower case
+ title_of_chapter = title_of_chapter.lower()
+ for tag in content_tag.contents:
+ text = tag if isinstance(tag, NavigableString) else tag.text
+ # nodes without visible text are skipped
+ if re.sub(r"[\s\xa0]", "", text):
+ text = re.sub(r"[\s\xa0]", " ", text).lower()
+ text = text.strip() # delete extra spaces
+ # match: the text equals the title, or contains it while the node (or, when the node has no name, content_tag) is h1-h3
+ if title_of_chapter == text or \
+ (title_of_chapter in text and
+ re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
+ self._add_span_to_save_ids_for_links(tag, content_tag)
+ tag.extract()
+ return
+ elif not isinstance(tag, NavigableString):
+ # NOTE(review): no return statement here yields a value, so the recursive call is always
+ # falsy and the loop stops after the first nested tag that has text - confirm it is intended
+ if not self._remove_headings_content(tag, title_of_chapter):
+ break
- # 2.
- _wrap_strings_with_p(content_tag)
- # 3.
- _wrap_tags_with_table(content_tag)
- # 4.
- _tags_to_correspond_livecarta_tag(content_tag)
- # 5.
- _unwrap_tags(content_tag)
- # 6.
- if remove_title_from_chapter:
- _remove_headings_content(content_tag, title_str)
- # 7.
- _process_table(content_tag)
- # 8.
- _insert_tags_in_parents(content_tag)
+ @staticmethod
+ def _process_tables(chapter_tag: BeautifulSoup):
+ """
+ Function preprocesses tables and tags(td|th|tr):
+ sets the width attribute, cleans the style attribute and sets the table border
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
- # 9. remove classes that weren't created by converter
- _class_removing(content_tag)
- return str(content_tag)
+ Returns
+ -------
+ None
+ Chapter Tag with processed tables
+
+ """
+ tables = chapter_tag.find_all("table")
+ for table in tables:
+ for t_tag in table.find_all(re.compile("td|th|tr")):
+ width = ""
+ if t_tag.get("style"):
+ # NOTE(review): the pattern needs a character other than "-" before "width",
+ # so a style that starts with "width" is not matched - verify
+ width_match = re.search(
+ r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
+ if width_match:
+ # the number of a pt/px width is kept and always written as px
+ size = width_match.group(1)
+ width = size + "px"
+
+ # an existing width attribute wins over the width found in the style
+ t_tag.attrs["width"] = t_tag.get("width") or width
+
+ if t_tag.attrs.get("style"):
+ t_tag.attrs["style"] = t_tag.attrs["style"].replace(
+ "border:0;", "")
+ # a style that became empty is removed
+ if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
+ del t_tag.attrs["style"]
+
+ # a missing or zero border is replaced with border 1
+ if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
+ table.attrs["border"] = "1"
+    @staticmethod
+    def _class_removing(chapter_tag):
+        """
+        Function removes classes that aren't created by converter
+        Parameters
+        ----------
+        chapter_tag: BeautifulSoup
+            Tag & contents of the chapter tag
+
+        Returns
+        -------
+        None
+            Chapter Tag without original classes of the book
+
+        """
+        # class values that are kept
+        converter_classes = ["link-anchor", "footnote-element"]
+        for tag in chapter_tag.find_all(recursive=True):
+            tag_class = tag.attrs.get("class")
+            if not tag_class:
+                continue
+            # the whole attribute value is compared with the kept ones
+            if tag_class not in converter_classes:
+                del tag.attrs["class"]
+
+    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
+        """
+        Function finalise processing/cleaning content
+        Parameters
+        ----------
+        title_str: str
+            chapter title, used to find the heading that repeats it
+        content_tag: Tag, soup object
+            content of the chapter, it is changed in place
+        remove_title_from_chapter: bool
+            when True the heading that repeats the title is removed from the content
+
+        Steps
+        ----------
+        1. comments removal
+        2. wrap NavigableString with p tag
+        3-6. functions of the preset, in the order of the preset entries:
+             wrap tags with table
+             replace tags with correspond LiveCarta tags
+             unwrap tags
+             insert tags into correspond tags
+        7. heading removal
+        8. process_tables
+        9. class removal
+
+        Returns
+        -------
+        content_tag: str
+            prepared content
+
+        Raises
+        ------
+        KeyError
+            if "preset_name" of a preset entry is not a key of name2function
+
+        """
+        # 1. remove comments
+        self._remove_comments(content_tag)
+        # 2.
+        self._wrap_strings_with_p(content_tag)
+        # 3-6.
+        # the loop variable is not called "dict" any more, that name hid the builtin
+        for preset_entry in self.preset:
+            preset_function = self.name2function[preset_entry["preset_name"]]
+            preset_function(content_tag, preset_entry["rules"])
+        # 7.
+        if remove_title_from_chapter:
+            self._remove_headings_content(content_tag, title_str)
+        # 8.
+        self._process_tables(content_tag)
+        # 9. remove classes that weren't created by converter
+        self._class_removing(content_tag)
+        return str(content_tag)
diff --git a/src/preset_processor.py b/src/preset_processor.py
new file mode 100644
index 0000000..a1cbb93
--- /dev/null
+++ b/src/preset_processor.py
@@ -0,0 +1,15 @@
+import json
+
+
+from src.util.helpers import BookLogger
+
+
+class PresetProcessor:
+    """Class reads the preset of the converter from a json file"""
+
+    def __init__(self, preset_path="config/presets.json", logger=None):
+        """
+        Parameters
+        ----------
+        preset_path: str
+            path to the json file with the preset
+        logger: BookLogger
+            logger of the current book
+
+        """
+        self.preset_path = preset_path
+        self.logger: BookLogger = logger
+
+    def get_preset_json(self):
+        """
+        Function loads the preset
+        Returns
+        -------
+        data
+            content of the json file at preset_path
+
+        Raises
+        ------
+        OSError
+            if the file can't be opened
+        json.JSONDecodeError
+            if the file is not valid json
+
+        """
+        # the file is closed by the context manager even when parsing fails;
+        # utf-8 is set explicitly so the result doesn't depend on the locale
+        with open(self.preset_path, encoding="utf-8") as preset_file:
+            return json.load(preset_file)