forked from LiveCarta/BookConverter
add processing of JSON presets
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import logging
|
||||
import pathlib
|
||||
from shutil import copyfile
|
||||
|
||||
|
||||
@@ -4,33 +4,34 @@ import codecs
|
||||
import os
|
||||
from os.path import dirname, normpath, join
|
||||
from itertools import chain
|
||||
from premailer import transform
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Union, List
|
||||
|
||||
|
||||
import ebooklib
|
||||
from ebooklib import epub
|
||||
from ebooklib.epub import Link, Section
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.preset_processor import PresetProcessor
|
||||
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.epub_converter.image_processing import update_images_src_links
|
||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
|
||||
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
|
||||
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
|
||||
prepare_title, prepare_content
|
||||
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
|
||||
|
||||
|
||||
class EpubConverter:
|
||||
def __init__(self, file_path, access=None, logger=None):
|
||||
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
|
||||
self.file_path = file_path
|
||||
self.access = access
|
||||
self.logger: BookLogger = logger
|
||||
self.ebooklib_book = epub.read_epub(file_path)
|
||||
self.css_processor = css_preprocessor
|
||||
self.html_preprocessor = html_processor
|
||||
|
||||
# main container for all epub .xhtml files
|
||||
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
||||
@@ -74,25 +75,15 @@ class EpubConverter:
|
||||
self.process_inline_styles_in_html_soup()
|
||||
self.logger.log("CSS files processing.")
|
||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||
self.logger.log("CSS styles adding.")
|
||||
self.logger.log("CSS styles adding.")
|
||||
self.add_css_styles_to_html_soup()
|
||||
|
||||
# todo presets
|
||||
|
||||
self.logger.log("Footnotes processing.")
|
||||
for href in self.html_href2html_body_soup:
|
||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
|
||||
self.html_href2html_body_soup)
|
||||
self.footnotes_contents.extend(content)
|
||||
self.noterefs.extend(noterefs)
|
||||
self.footnotes.extend(footnotes_tags)
|
||||
|
||||
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
|
||||
noteref.attrs["data-id"] = i + 1
|
||||
noteref.attrs["id"] = f"footnote-{i + 1}"
|
||||
footnote.attrs["href"] = f"#footnote-{i + 1}"
|
||||
|
||||
self.footnotes_contents, self.noterefs, self.footnotes =\
|
||||
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
|
||||
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
|
||||
|
||||
self.logger.log("TOC processing.")
|
||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||
# build simple toc from spine if needed
|
||||
@@ -101,6 +92,7 @@ class EpubConverter:
|
||||
not_added = [
|
||||
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
|
||||
self.logger.log(f"Html documents not added to TOC: {not_added}.")
|
||||
self.logger.log(f"Add documents not added to TOC.")
|
||||
self.add_not_added_files_to_adjacency_list(not_added)
|
||||
self.logger.log(f"Html internal links and structure processing.")
|
||||
self.label_chapters_ids_with_lc_id()
|
||||
@@ -149,7 +141,7 @@ class EpubConverter:
|
||||
for tag_initial_inline_style in tags_with_inline_style:
|
||||
inline_style = tag_initial_inline_style.attrs["style"]
|
||||
tag_initial_inline_style.attrs["style"] = \
|
||||
build_inline_style_content(inline_style)
|
||||
self.css_processor.build_inline_style_content(inline_style)
|
||||
|
||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||
"""
|
||||
@@ -181,16 +173,53 @@ class EpubConverter:
|
||||
html_href2css_href[html_href].append(css_href)
|
||||
if css_href not in css_href2css_content:
|
||||
# css_href not in css_href2css_content, add to this dict
|
||||
css_href2css_content[css_href] = build_css_file_content(
|
||||
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
|
||||
self.get_css_content(css_href, html_href))
|
||||
|
||||
for i, tag in enumerate(soup_html_content.find_all("style")):
|
||||
css_content = tag.string
|
||||
html_href2css_href[html_href].append(f"href{i}")
|
||||
css_href2css_content[f"href{i}"] = build_css_file_content(
|
||||
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
|
||||
css_content)
|
||||
return html_href2css_href, css_href2css_content
|
||||
|
||||
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
||||
"""
|
||||
Function adds styles from .css to inline style.
|
||||
Parameters
|
||||
----------
|
||||
html_soup: BeautifulSoup
|
||||
html page with inline style
|
||||
css_text: str
|
||||
css content from css file
|
||||
Returns
|
||||
-------
|
||||
inline_soup: BeautifulSoup
|
||||
soup with styles from css
|
||||
|
||||
"""
|
||||
# remove this specification because it causes problems
|
||||
css_text = css_text.replace(
|
||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||
# here we add css styles to inline style
|
||||
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
|
||||
remove_classes=False,
|
||||
external_styles=False,
|
||||
allow_network=False,
|
||||
disable_validation=True,
|
||||
)
|
||||
# soup with converted styles from css
|
||||
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
|
||||
|
||||
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
|
||||
attrs={"style": re.compile(".*")})
|
||||
|
||||
# go through the tags with inline style + style parsed from css file
|
||||
for tag_inline_style in tags_with_inline_style:
|
||||
style_converter = TagInlineStyleProcessor(tag_inline_style)
|
||||
style_converter.convert_initial_tag()
|
||||
return inline_soup
|
||||
|
||||
def add_css_styles_to_html_soup(self):
|
||||
"""
|
||||
This function is designed to update html_href2html_body_soup
|
||||
@@ -203,7 +232,7 @@ class EpubConverter:
|
||||
for css_href in self.html_href2css_href[html_href]:
|
||||
css += self.css_href2css_content[css_href]
|
||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||
html_content = convert_html_soup_with_css_style(html_content, css)
|
||||
html_content = self.convert_html_soup_with_css_style(html_content, css)
|
||||
self.html_href2html_body_soup[html_href] = html_content
|
||||
|
||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||
@@ -488,6 +517,48 @@ class EpubConverter:
|
||||
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
|
||||
f" Old id={a_tag_id}")
|
||||
|
||||
@staticmethod
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
"""
|
||||
After processing on a first_id that corresponds to current chapter,
|
||||
from initial html_soup all tags from current chapter are extracted
|
||||
Parameters
|
||||
----------
|
||||
first_id: str
|
||||
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
|
||||
href: str
|
||||
Name of current chapters file
|
||||
html_soup: Tag
|
||||
Soup object of current file
|
||||
|
||||
Returns
|
||||
-------
|
||||
tags: list [Tag, NavigableString]
|
||||
Chapter's tags
|
||||
|
||||
"""
|
||||
marked_tags = html_soup.find(
|
||||
attrs={"id": first_id, "class": "converter-chapter-mark"})
|
||||
if marked_tags:
|
||||
next_tag = marked_tags.next_sibling
|
||||
tags = []
|
||||
while next_tag:
|
||||
if not isinstance(next_tag, NavigableString) and \
|
||||
(next_tag.attrs.get("class") == "converter-chapter-mark"):
|
||||
break
|
||||
tags.append(next_tag)
|
||||
next_tag = next_tag.next_sibling
|
||||
|
||||
# remove tags between first_id and next found id
|
||||
# save them in list for next steps
|
||||
tags = [tag.extract() for tag in tags]
|
||||
html_soup.smooth()
|
||||
|
||||
else:
|
||||
assert 0, f"Warning: no match for {first_id, href}"
|
||||
|
||||
return tags
|
||||
|
||||
def detect_one_chapter(self, nav_point: NavPoint):
|
||||
"""
|
||||
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||
@@ -511,11 +582,11 @@ class EpubConverter:
|
||||
"""
|
||||
if nav_point.id:
|
||||
soup = self.html_href2html_body_soup[nav_point.href]
|
||||
chapter_tags = get_tags_between_chapter_marks(
|
||||
subchapter_tags = self.get_tags_between_chapter_marks(
|
||||
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
||||
new_tree = BeautifulSoup("", "html.parser")
|
||||
for tag in chapter_tags:
|
||||
new_tree.append(tag)
|
||||
for subchapter_tag in subchapter_tags:
|
||||
new_tree.append(subchapter_tag)
|
||||
self.href_chapter_id2soup_html[(
|
||||
nav_point.href, nav_point.id)] = new_tree
|
||||
|
||||
@@ -527,8 +598,8 @@ class EpubConverter:
|
||||
"""Function build chapters content, starts from top level chapters"""
|
||||
top_level_nav_points = self.adjacency_list[-1]
|
||||
if self.id_anchor_exist_in_nav_points:
|
||||
for point in top_level_nav_points:
|
||||
self.detect_one_chapter(point)
|
||||
for tl_nav_point in top_level_nav_points:
|
||||
self.detect_one_chapter(tl_nav_point)
|
||||
|
||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||
"""
|
||||
@@ -561,9 +632,9 @@ class EpubConverter:
|
||||
if hasattr(self.file_path, "stem") else "book_id")
|
||||
|
||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
title_preprocessed = prepare_title(title)
|
||||
content_preprocessed = prepare_content(title_preprocessed, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
title_preprocessed = self.html_preprocessor.prepare_title(title)
|
||||
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
sub_nodes = []
|
||||
# warning! not EpubHtmlItems won't be added to chapter
|
||||
# if it doesn't have subchapters
|
||||
@@ -598,11 +669,17 @@ class EpubConverter:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
epub_file_path = "../../epub/9781641050234.epub"
|
||||
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
json_converter = EpubConverter(epub_file_path, logger=logger_object)
|
||||
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
|
||||
.get_preset_json()
|
||||
css_preprocessor = CSSPreprocessor(logger=logger_object)
|
||||
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
|
||||
|
||||
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
from src.book_solver import BookSolver
|
||||
from src.preset_processor import PresetProcessor
|
||||
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||
from src.epub_converter.epub_converter import EpubConverter
|
||||
|
||||
|
||||
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
|
||||
Function
|
||||
Steps
|
||||
----------
|
||||
1. Converts .epub to .html
|
||||
2. Parses from line structure to nested structure
|
||||
1. Gets data from preset structure
|
||||
2. Add preset to html preprocessor
|
||||
3. Converts .epub to .html
|
||||
4. Parses from line structure to nested structure
|
||||
|
||||
Returns
|
||||
----------
|
||||
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
|
||||
json for LiveCarta platform
|
||||
|
||||
"""
|
||||
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
|
||||
.get_preset_json()
|
||||
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
|
||||
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
|
||||
json_converter = EpubConverter(
|
||||
self.file_path, access=self.access, logger=self.logger_object)
|
||||
self.file_path, access=self.access, logger=self.logger_object,
|
||||
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
return content_dict
|
||||
|
||||
@@ -1,419 +1,398 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.util.helpers import BookLogger
|
||||
|
||||
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function adds span with id from tag_to_be_removed
|
||||
because this tag will be removed(unwrapped/extract)
|
||||
Parameters
|
||||
----------
|
||||
tag_to_be_removed: Soup object
|
||||
chapter_tag: BeautifulSoup
|
||||
class HtmlEpubPreprocessor:
|
||||
def __init__(self, preset, logger=None):
|
||||
self.preset = preset
|
||||
self.logger: BookLogger = logger
|
||||
self.name2function = {
|
||||
"table_wrapper": self._wrap_tags_with_table,
|
||||
"replacer": self._tags_to_correspond_livecarta_tag,
|
||||
"unwrapper": self._unwrap_tags,
|
||||
"inserter": self._insert_tags_into_correspond_tags
|
||||
}
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
updated body tag
|
||||
@staticmethod
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function adds span with id from tag_to_be_removed
|
||||
because this tag will be removed(unwrapped/extract)
|
||||
Parameters
|
||||
----------
|
||||
tag_to_be_removed: Soup object
|
||||
chapter_tag: BeautifulSoup
|
||||
|
||||
"""
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
|
||||
"""Function inserts span before tag aren't supported by LiveCarta"""
|
||||
new_tag = chapter_tag.new_tag("span")
|
||||
new_tag.attrs["id"] = id_ or ""
|
||||
new_tag.attrs["class"] = class_ or ""
|
||||
new_tag.string = "\xa0"
|
||||
tag_to_be_removed.insert_before(new_tag)
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
updated body tag
|
||||
|
||||
if tag_to_be_removed.attrs.get("id"):
|
||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs["id"],
|
||||
class_=tag_to_be_removed.attrs.get("class"))
|
||||
"""
|
||||
|
||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
|
||||
class_: list):
|
||||
"""Function inserts span before tag aren't supported by LiveCarta"""
|
||||
new_tag = chapter_tag.new_tag("span")
|
||||
new_tag.attrs["id"] = id_ or ""
|
||||
new_tag.attrs["class"] = class_ or ""
|
||||
new_tag.string = "\xa0"
|
||||
tag_to_be_removed.insert_before(new_tag)
|
||||
|
||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||
"""
|
||||
After processing on a first_id that corresponds to current chapter,
|
||||
from initial html_soup all tags from current chapter are extracted
|
||||
Parameters
|
||||
----------
|
||||
first_id: str
|
||||
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
|
||||
href: str
|
||||
Name of current chapters file
|
||||
html_soup: Tag
|
||||
Soup object of current file
|
||||
if tag_to_be_removed.attrs.get("id"):
|
||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs["id"],
|
||||
class_=tag_to_be_removed.attrs.get("class"))
|
||||
|
||||
Returns
|
||||
-------
|
||||
tags: list [Tag, NavigableString]
|
||||
Chapter's tags
|
||||
@staticmethod
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
"""
|
||||
Function finalise processing/cleaning title
|
||||
Parameters
|
||||
----------
|
||||
title_of_chapter: str
|
||||
|
||||
"""
|
||||
marked_tags = html_soup.find(
|
||||
attrs={"id": first_id, "class": "converter-chapter-mark"})
|
||||
if marked_tags:
|
||||
next_tag = marked_tags.next_sibling
|
||||
tags = []
|
||||
while next_tag:
|
||||
if not isinstance(next_tag, NavigableString) and \
|
||||
(next_tag.attrs.get("class") == "converter-chapter-mark"):
|
||||
break
|
||||
tags.append(next_tag)
|
||||
next_tag = next_tag.next_sibling
|
||||
Returns
|
||||
-------
|
||||
title: str
|
||||
cleaned title
|
||||
|
||||
# remove tags between first_id and next found id
|
||||
# save them in list for next steps
|
||||
tags = [tag.extract() for tag in tags]
|
||||
html_soup.smooth()
|
||||
"""
|
||||
title = BeautifulSoup(title_of_chapter, features="lxml").string
|
||||
# clean extra whitespace characters ([\r\n\t\f\v ])
|
||||
title = re.sub(r"[\s\xa0]", " ", title).strip()
|
||||
return title
|
||||
|
||||
else:
|
||||
assert 0, f"Warning: no match for {first_id, href}"
|
||||
@staticmethod
|
||||
def _remove_comments(chapter_tag):
|
||||
"""
|
||||
Function remove comments
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
return tags
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag without comments
|
||||
|
||||
"""
|
||||
for tag in chapter_tag.find_all():
|
||||
for element in tag(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
|
||||
def prepare_title(title_of_chapter: str) -> str:
|
||||
"""
|
||||
Function finalise processing/cleaning title
|
||||
Parameters
|
||||
----------
|
||||
title_of_chapter: str
|
||||
@staticmethod
|
||||
def _wrap_strings_with_p(chapter_tag):
|
||||
"""
|
||||
Function converts headings that aren't supported by LiveCarta with <p>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
title: str
|
||||
cleaned title
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with wrapped NavigableStrings
|
||||
|
||||
"""
|
||||
title = BeautifulSoup(title_of_chapter, features="lxml").string
|
||||
# clean extra whitespace characters ([\r\n\t\f\v ])
|
||||
title = re.sub(r"[\s\xa0]", " ", title).strip()
|
||||
return title
|
||||
"""
|
||||
for node in chapter_tag:
|
||||
if isinstance(node, NavigableString):
|
||||
content = str(node)
|
||||
content = re.sub(r"([\s\xa0])", " ", content).strip()
|
||||
if content:
|
||||
p_tag = chapter_tag.new_tag("p")
|
||||
p_tag.append(str(node))
|
||||
node.replace_with(p_tag)
|
||||
|
||||
def _wrap_tags_with_table(self, chapter_tag, rules: list):
|
||||
"""
|
||||
Function wraps <tag> with <table>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
def _remove_comments(chapter_tag):
|
||||
"""
|
||||
Function remove comments
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with wrapped certain tags with <table>
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag without comments
|
||||
"""
|
||||
|
||||
"""
|
||||
for tag in chapter_tag.find_all():
|
||||
for element in tag(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
|
||||
table = chapter_tag.new_tag("table")
|
||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||
= border, "center", f"width:{width}%;"
|
||||
tbody, tr, td = \
|
||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
td.attrs["bgcolor"] = bg_color
|
||||
tag_to_be_wrapped.wrap(td)
|
||||
td.wrap(tr)
|
||||
tr.wrap(tbody)
|
||||
tbody.wrap(table)
|
||||
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
||||
return table
|
||||
|
||||
def process_tag_using_table(tag_to_wrap):
|
||||
_wrap_tag_with_table(
|
||||
chapter_tag,
|
||||
tag_to_be_wrapped=tag_to_wrap,
|
||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
|
||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
|
||||
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
||||
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
||||
tag_to_wrap.unwrap()
|
||||
|
||||
def _wrap_strings_with_p(chapter_tag):
|
||||
"""
|
||||
Function converts headings that aren't supported by LiveCarta with <p>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with wrapped NavigableStrings
|
||||
|
||||
"""
|
||||
for node in chapter_tag:
|
||||
if isinstance(node, NavigableString):
|
||||
content = str(node)
|
||||
content = re.sub(r"([\s\xa0])", " ", content).strip()
|
||||
if content:
|
||||
p_tag = chapter_tag.new_tag("p")
|
||||
p_tag.append(str(node))
|
||||
node.replace_with(p_tag)
|
||||
|
||||
|
||||
def _wrap_tags_with_table(chapter_tag):
|
||||
"""
|
||||
Function wraps <tag> with <table>
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with wrapped certain tags with <table>
|
||||
|
||||
"""
|
||||
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
|
||||
table = chapter_tag.new_tag("table")
|
||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||
= border, "center", f"width:{width}%;"
|
||||
tbody, tr, td = \
|
||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
td.attrs["bgcolor"] = bg_color
|
||||
tag_to_be_wrapped.wrap(td)
|
||||
td.wrap(tr)
|
||||
tr.wrap(tbody)
|
||||
tbody.wrap(table)
|
||||
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
||||
return table
|
||||
|
||||
def process_tag_using_table(tag_to_wrap):
|
||||
_wrap_tag_with_table(
|
||||
chapter_tag,
|
||||
tag_to_be_wrapped=tag_to_wrap,
|
||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
|
||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
|
||||
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
||||
_add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
||||
tag_to_wrap.unwrap()
|
||||
|
||||
for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
|
||||
if isinstance(attrs, tuple):
|
||||
attr, val = attrs[0], attrs[1]
|
||||
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
|
||||
process_tag_using_table(tag_to_wrap)
|
||||
else:
|
||||
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
|
||||
if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
|
||||
for rule in rules:
|
||||
tags = rule["tags"]
|
||||
for attr in rule["attrs"]:
|
||||
for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||
process_tag_using_table(tag_to_wrap)
|
||||
|
||||
@staticmethod
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
|
||||
"""
|
||||
Function to replace all tags to correspond LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
def _tags_to_correspond_livecarta_tag(chapter_tag):
|
||||
"""
|
||||
Function to replace all tags to correspond LiveCarta tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with all tags replaced with LiveCarta tags
|
||||
|
||||
"""
|
||||
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
|
||||
for key in reg_keys:
|
||||
if isinstance(key, tuple):
|
||||
replace = key[0]
|
||||
parent, child = key[1], key[2]
|
||||
for parent_tag in chapter_tag.select(parent):
|
||||
if replace == "parent":
|
||||
parent_tag.name = to_replace_value
|
||||
elif replace == "child":
|
||||
for child_tag in parent_tag.select(child):
|
||||
child_tag.name = to_replace_value
|
||||
if not child_tag.attrs.get("style"):
|
||||
child_tag.attrs["style"] =\
|
||||
"font-size: 14px; font-family: courier new,courier,monospace;"
|
||||
"""
|
||||
for rule in rules:
|
||||
tags = rule["tags"]
|
||||
tag_to_replace = rule["tag_to_replace"]
|
||||
if rule["condition"]:
|
||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||
if condition_on_tag[0] == 'parent_tags':
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
if tag.parent.select(condition_on_tag[1]):
|
||||
tag.name = tag_to_replace
|
||||
elif condition_on_tag[0] == 'child_tags':
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
|
||||
tag.name = tag_to_replace
|
||||
elif condition_on_tag[0] == "attrs":
|
||||
for attr in rule["condition"]["attrs"]:
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||
tag.name = tag_to_replace
|
||||
else:
|
||||
tags = chapter_tag.find_all(re.compile(key))
|
||||
for tag in tags:
|
||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
|
||||
tag.name = to_replace_value
|
||||
tag.name = tag_to_replace
|
||||
|
||||
def _unwrap_tags(self, chapter_tag, rules: dict):
|
||||
"""
|
||||
Function unwrap tags and moves id to span
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
def _unwrap_tags(chapter_tag):
|
||||
"""
|
||||
Function unwrap tags and moves id to span
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with unwrapped certain tags
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with unwrapped certain tags
|
||||
"""
|
||||
for tag_name in rules["tags"]:
|
||||
for tag in chapter_tag.select(tag_name):
|
||||
# if tag is a subtag
|
||||
if ">" in tag_name:
|
||||
tag.parent.attrs.update(tag.attrs)
|
||||
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
||||
tag.unwrap()
|
||||
|
||||
"""
|
||||
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
|
||||
for tag in chapter_tag.select(tag_name):
|
||||
# if tag is a subtag
|
||||
if ">" in tag_name:
|
||||
tag.parent.attrs.update(tag.attrs)
|
||||
_add_span_to_save_ids_for_links(tag, chapter_tag)
|
||||
tag.unwrap()
|
||||
@staticmethod
|
||||
def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
|
||||
"""
|
||||
Function inserts tags into correspond tags
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with inserted tags
|
||||
|
||||
def _remove_headings_content(content_tag, title_of_chapter: str):
|
||||
"""
|
||||
Function
|
||||
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
||||
- adds span with id in order to
|
||||
Parameters
|
||||
----------
|
||||
content_tag: soup object
|
||||
Tag of the page
|
||||
title_of_chapter: str
|
||||
Chapter title
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
clean/remove headings & add span with id
|
||||
def insert(tag, tag_to_insert):
|
||||
# insert all items that was in tag to subtag and remove from tag
|
||||
for content in reversed(tag.contents):
|
||||
tag_to_insert.insert(0, content.extract())
|
||||
# wrap subtag with items
|
||||
tag.append(tag_to_insert)
|
||||
|
||||
"""
|
||||
title_of_chapter = title_of_chapter.lower()
|
||||
for tag in content_tag.contents:
|
||||
text = tag if isinstance(tag, NavigableString) else tag.text
|
||||
if re.sub(r"[\s\xa0]", "", text):
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
if title_of_chapter == text or \
|
||||
(title_of_chapter in text and
|
||||
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
|
||||
_add_span_to_save_ids_for_links(tag, content_tag)
|
||||
tag.extract()
|
||||
return
|
||||
elif not isinstance(tag, NavigableString):
|
||||
if not _remove_headings_content(tag, title_of_chapter):
|
||||
break
|
||||
|
||||
|
||||
def _process_table(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function preprocesses tables and tags(td|th|tr)
|
||||
Parameters
|
||||
----------
|
||||
chapter_tag: BeautifulSoup
|
||||
Tag & contents of the chapter tag
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
Chapter Tag with processed tables
|
||||
|
||||
"""
|
||||
tables = chapter_tag.find_all("table")
|
||||
for table in tables:
|
||||
for t_tag in table.find_all(re.compile("td|th|tr")):
|
||||
width = ""
|
||||
if t_tag.get("style"):
|
||||
width_match = re.search(
|
||||
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
|
||||
if width_match:
|
||||
size = width_match.group(1)
|
||||
width = size + "px"
|
||||
|
||||
t_tag.attrs["width"] = t_tag.get("width") or width
|
||||
|
||||
if t_tag.attrs.get("style"):
|
||||
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
|
||||
"border:0;", "")
|
||||
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
|
||||
del t_tag.attrs["style"]
|
||||
|
||||
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
|
||||
table.attrs["border"] = "1"
|
||||
|
||||
|
||||
def _insert_tags_in_parents(chapter_tag):
    """
    Function inserts tags into correspond tags.

    Wraps the contents of configured parent tags (e.g. wrapping a <pre>'s
    children in a <code>) and applies rule-driven insertions.

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag with inserted tags (modified in place)
    """
    # NOTE(review): this body appears to fuse two revisions of the function
    # (a config-driven version using LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG
    # and a JSON-preset version taking a `rules` argument). The names `rules`,
    # `insert` and (on the first iteration path) `tags` are not defined in
    # this scope - reconcile against the intended revision before relying on it.
    parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
    for parent_tag_name, condition in parent_tag2condition.items():
        for parent_tag in chapter_tag.select(parent_tag_name):
            # Skip parents that already contain the required child.
            if parent_tag.select(condition):
                continue
            for rule in rules:  # NOTE(review): `rules` is undefined here - merge artifact
                tags = rule["tags"]
                tag_to_insert = \
                    chapter_tag.new_tag(rule["tag_to_insert"])
                if rule["condition"]:
                    # Only conditions with a truthy value are applied.
                    for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                        if condition_on_tag[0] == 'parent_tags':
                            # Insert when the tag's parent matches the selector.
                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                                if tag.parent.select(condition_on_tag[1]):
                                    insert(tag, tag_to_insert)  # NOTE(review): `insert` is undefined here
                        elif condition_on_tag[0] == 'child_tags':
                            # Insert when the tag LACKS the (de-negated) child selector.
                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                                if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
                                    insert(tag, tag_to_insert)
                        elif condition_on_tag[0] == "attrs":
                            # Insert when the tag carries a matching attribute value.
                            for attr in rule["condition"]["attrs"]:
                                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                                {attr["name"]: re.compile(fr"{attr['value']}")}):
                                    insert(tag, tag_to_insert)
                else:
                    tag_to_insert = chapter_tag.new_tag(
                        LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
                    # insert all items that was in pre to code and remove from pre
                    for content in reversed(parent_tag.contents):
                        tag_to_insert.insert(0, content.extract())
                    # wrap code with items
                    parent_tag.append(tag_to_insert)
                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                    insert(tag, tag_to_insert)
|
||||
|
||||
def _remove_headings_content(self, content_tag, title_of_chapter: str):
    """
    Function
    - cleans/removes headings from chapter in order to avoid duplication
      of chapter titles in the content
    - adds a span with the heading's ids so existing links keep working

    Walks the direct children of ``content_tag``; the first child whose
    normalized text equals the chapter title (or contains it, when the tag
    is an h1-h3 heading) has its link ids preserved and is then removed.

    Parameters
    ----------
    content_tag: soup object
        Tag of the page
    title_of_chapter: str
        Chapter title

    Returns
    -------
    None
        clean/remove headings & add span with id (modified in place)
    """
    title_of_chapter = title_of_chapter.lower()
    for tag in content_tag.contents:
        # NavigableString children have no .text attribute - use the node itself.
        text = tag if isinstance(tag, NavigableString) else tag.text
        # Only consider children with some non-whitespace text.
        if re.sub(r"[\s\xa0]", "", text):
            # Collapse whitespace/nbsp and lowercase for comparison.
            text = re.sub(r"[\s\xa0]", " ", text).lower()
            text = text.strip()  # delete extra spaces
            if title_of_chapter == text or \
                    (title_of_chapter in text and
                     re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
                # Keep the heading's anchor ids reachable before removing it.
                self._add_span_to_save_ids_for_links(tag, content_tag)
                tag.extract()
                return
        elif not isinstance(tag, NavigableString):
            # Recurse into whitespace-only wrapper tags.
            # NOTE(review): this method always returns None, so the condition
            # below is always True and the loop stops after the first
            # recursion - confirm this early-break is intentional.
            if not self._remove_headings_content(tag, title_of_chapter):
                break
|
||||
|
||||
# NOTE(review): this region was de-interleaved from a corrupted diff paste;
# the removed-side duplicate of the old module-level prepare_content that was
# line-interleaved here has been dropped in favor of the method version.
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup):
    """
    Preprocess every <table> in the chapter: normalize td/th/tr widths,
    strip "border:0;" from cell styles, and force a visible table border.

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag with processed tables (modified in place)
    """
    for table in chapter_tag.find_all("table"):
        for t_tag in table.find_all(re.compile("td|th|tr")):
            # Extract an explicit pixel width from the inline style, if any.
            # `[^-]` avoids matching hyphenated properties like "border-width".
            width = ""
            style = t_tag.get("style")
            if style:
                width_match = re.search(
                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    # NOTE(review): pt values are relabeled as px without
                    # conversion - presumably intentional, confirm.
                    width = width_match.group(1) + "px"

            # Prefer an existing width attribute, else the style-derived one.
            # Fix: avoid writing an empty `width=""` attribute on cells
            # that have neither.
            if t_tag.get("width") or width:
                t_tag.attrs["width"] = t_tag.get("width") or width

            if t_tag.attrs.get("style"):
                t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                    "border:0;", "")
                # Drop the attribute entirely when only whitespace remains.
                if re.sub(r"[\s\xa0]", "", t_tag.attrs["style"]) == "":
                    del t_tag.attrs["style"]

        # Missing/zero borders are bumped to "1" so tables stay visible.
        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"


@staticmethod
def _class_removing(chapter_tag):
    """
    Function removes classes that aren't created by converter.

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag without original classes of the book (modified in place)
    """
    # Classes the converter itself assigns; these must survive.
    keep_classes = {"link-anchor", "footnote-element"}
    for tag in chapter_tag.find_all(recursive=True):
        classes = tag.attrs.get("class")
        # Fix: bs4 normally stores `class` as a *list* of names, so the
        # original `classes not in ["link-anchor", "footnote-element"]`
        # compared a list against strings, was always True, and stripped
        # the converter-created classes as well. Compare element-wise.
        if classes and not keep_classes.intersection(classes):
            del tag.attrs["class"]
|
||||
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content.

    Parameters
    ----------
    title_str: str
        Chapter title (used to strip duplicated headings).
    content_tag: Tag, soup object
        Parsed chapter content; modified in place.
    remove_title_from_chapter: bool
        When True, headings duplicating the chapter title are removed.

    Steps
    ----------
    1. comments removal
    2. wrap NavigableString with tag <p>
    3-6. preset-driven transforms (wrap tags with <table>,
         replace tags with correspond LiveCarta tags,
         unwrap tags,
         insert tags into correspond tags), as configured in the JSON presets
    7. heading removal
    8. process_tables
    9. class removal

    Returns
    -------
    content_tag: str
        prepared content
    """
    # 1. remove comments
    self._remove_comments(content_tag)
    # 2.
    self._wrap_strings_with_p(content_tag)
    # 3-6. apply each configured preset transform in declaration order.
    # (renamed loop variable: the original shadowed the builtin `dict`)
    # NOTE(review): assumes self.preset is a list of {"preset_name", "rules"}
    # dicts and every preset_name has an entry in self.name2function - confirm.
    for preset_entry in self.preset:
        handler = self.name2function[preset_entry["preset_name"]]
        handler(content_tag, preset_entry['rules'])
    # 7.
    if remove_title_from_chapter:
        self._remove_headings_content(content_tag, title_str)
    # 8.
    self._process_tables(content_tag)
    # 9. remove classes that weren't created by converter
    self._class_removing(content_tag)
    return str(content_tag)
|
||||
|
||||
15
src/preset_processor.py
Normal file
15
src/preset_processor.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import json
|
||||
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
|
||||
|
||||
class PresetProcessor:
    """Loads the JSON preset definitions that drive content preprocessing."""

    def __init__(self, preset_path="config/presets.json", logger=None):
        # Path to the JSON file holding preset rules.
        self.preset_path = preset_path
        self.logger: BookLogger = logger

    def get_preset_json(self):
        """
        Read and return the parsed contents of the preset file.

        Returns
        -------
        The deserialized JSON document (typically a list of preset dicts).
        """
        # Fix: use a context manager so the file handle is always closed
        # (the original opened it and never closed it), and pin the encoding.
        with open(self.preset_path, encoding="utf-8") as preset_file:
            return json.load(preset_file)
|
||||
Reference in New Issue
Block a user