forked from LiveCarta/BookConverter
add processing of JSON presets
This commit is contained in:
@@ -1,5 +1,4 @@
|
|||||||
import os
|
import os
|
||||||
import logging
|
|
||||||
import pathlib
|
import pathlib
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
|
|||||||
@@ -4,33 +4,34 @@ import codecs
|
|||||||
import os
|
import os
|
||||||
from os.path import dirname, normpath, join
|
from os.path import dirname, normpath, join
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
from premailer import transform
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Dict, Union, List
|
from typing import Dict, Union, List
|
||||||
|
|
||||||
|
|
||||||
import ebooklib
|
import ebooklib
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from ebooklib.epub import Link, Section
|
from ebooklib.epub import Link, Section
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
|
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
|
from src.preset_processor import PresetProcessor
|
||||||
|
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||||
|
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.epub_converter.image_processing import update_images_src_links
|
from src.epub_converter.image_processing import update_images_src_links
|
||||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||||
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
|
from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
|
||||||
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
|
|
||||||
from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\
|
|
||||||
prepare_title, prepare_content
|
|
||||||
|
|
||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, file_path, access=None, logger=None):
|
def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None):
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(file_path)
|
self.ebooklib_book = epub.read_epub(file_path)
|
||||||
|
self.css_processor = css_preprocessor
|
||||||
|
self.html_preprocessor = html_processor
|
||||||
|
|
||||||
# main container for all epub .xhtml files
|
# main container for all epub .xhtml files
|
||||||
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
||||||
@@ -74,25 +75,15 @@ class EpubConverter:
|
|||||||
self.process_inline_styles_in_html_soup()
|
self.process_inline_styles_in_html_soup()
|
||||||
self.logger.log("CSS files processing.")
|
self.logger.log("CSS files processing.")
|
||||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||||
self.logger.log("CSS styles adding.")
|
self.logger.log("CSS styles adding.")
|
||||||
self.add_css_styles_to_html_soup()
|
self.add_css_styles_to_html_soup()
|
||||||
|
|
||||||
# todo presets
|
|
||||||
|
|
||||||
self.logger.log("Footnotes processing.")
|
self.logger.log("Footnotes processing.")
|
||||||
for href in self.html_href2html_body_soup:
|
for href in self.html_href2html_body_soup:
|
||||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
|
self.footnotes_contents, self.noterefs, self.footnotes =\
|
||||||
self.html_href2html_body_soup)
|
preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup)
|
||||||
self.footnotes_contents.extend(content)
|
|
||||||
self.noterefs.extend(noterefs)
|
|
||||||
self.footnotes.extend(footnotes_tags)
|
|
||||||
|
|
||||||
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
|
|
||||||
noteref.attrs["data-id"] = i + 1
|
|
||||||
noteref.attrs["id"] = f"footnote-{i + 1}"
|
|
||||||
footnote.attrs["href"] = f"#footnote-{i + 1}"
|
|
||||||
|
|
||||||
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
|
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
|
||||||
|
|
||||||
self.logger.log("TOC processing.")
|
self.logger.log("TOC processing.")
|
||||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||||
# build simple toc from spine if needed
|
# build simple toc from spine if needed
|
||||||
@@ -101,6 +92,7 @@ class EpubConverter:
|
|||||||
not_added = [
|
not_added = [
|
||||||
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
|
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
|
||||||
self.logger.log(f"Html documents not added to TOC: {not_added}.")
|
self.logger.log(f"Html documents not added to TOC: {not_added}.")
|
||||||
|
self.logger.log(f"Add documents not added to TOC.")
|
||||||
self.add_not_added_files_to_adjacency_list(not_added)
|
self.add_not_added_files_to_adjacency_list(not_added)
|
||||||
self.logger.log(f"Html internal links and structure processing.")
|
self.logger.log(f"Html internal links and structure processing.")
|
||||||
self.label_chapters_ids_with_lc_id()
|
self.label_chapters_ids_with_lc_id()
|
||||||
@@ -149,7 +141,7 @@ class EpubConverter:
|
|||||||
for tag_initial_inline_style in tags_with_inline_style:
|
for tag_initial_inline_style in tags_with_inline_style:
|
||||||
inline_style = tag_initial_inline_style.attrs["style"]
|
inline_style = tag_initial_inline_style.attrs["style"]
|
||||||
tag_initial_inline_style.attrs["style"] = \
|
tag_initial_inline_style.attrs["style"] = \
|
||||||
build_inline_style_content(inline_style)
|
self.css_processor.build_inline_style_content(inline_style)
|
||||||
|
|
||||||
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||||
"""
|
"""
|
||||||
@@ -181,16 +173,53 @@ class EpubConverter:
|
|||||||
html_href2css_href[html_href].append(css_href)
|
html_href2css_href[html_href].append(css_href)
|
||||||
if css_href not in css_href2css_content:
|
if css_href not in css_href2css_content:
|
||||||
# css_href not in css_href2css_content, add to this dict
|
# css_href not in css_href2css_content, add to this dict
|
||||||
css_href2css_content[css_href] = build_css_file_content(
|
css_href2css_content[css_href] = self.css_processor.build_css_file_content(
|
||||||
self.get_css_content(css_href, html_href))
|
self.get_css_content(css_href, html_href))
|
||||||
|
|
||||||
for i, tag in enumerate(soup_html_content.find_all("style")):
|
for i, tag in enumerate(soup_html_content.find_all("style")):
|
||||||
css_content = tag.string
|
css_content = tag.string
|
||||||
html_href2css_href[html_href].append(f"href{i}")
|
html_href2css_href[html_href].append(f"href{i}")
|
||||||
css_href2css_content[f"href{i}"] = build_css_file_content(
|
css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
|
||||||
css_content)
|
css_content)
|
||||||
return html_href2css_href, css_href2css_content
|
return html_href2css_href, css_href2css_content
|
||||||
|
|
||||||
|
def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
    """
    Function moves the rules of a stylesheet into the inline ``style``
    attributes of a page and then normalises every styled tag.

    Parameters
    ----------
    html_soup: BeautifulSoup
        html page with inline style
    css_text: str
        css content from css file

    Returns
    -------
    inline_soup: BeautifulSoup
        new soup (parsed with lxml) with styles from css applied inline

    """
    # the epub namespace declaration causes problems during inlining,
    # so it is dropped from the stylesheet before premailer sees it
    epub_namespace_rule = '@namespace epub "http://www.idpf.org/2007/ops";'
    cleaned_css = css_text.replace(epub_namespace_rule, '')

    # premailer merges the css rules into each matching tag's inline style
    premailer_options = {
        "css_text": cleaned_css,
        "remove_classes": False,
        "external_styles": False,
        "allow_network": False,
        "disable_validation": True,
    }
    inlined_markup: str = transform(str(html_soup), **premailer_options)

    # re-parse the inlined markup: this soup is what the caller gets back
    inline_soup = BeautifulSoup(inlined_markup, features="lxml")

    # every supported tag that carries a style attribute
    # (its own inline style + whatever was inlined from the css file)
    styled_tags = inline_soup.find_all(
        LiveCartaConfig.could_have_style_in_livecarta_regexp,
        attrs={"style": re.compile(".*")})

    for styled_tag in styled_tags:
        TagInlineStyleProcessor(styled_tag).convert_initial_tag()

    return inline_soup
|
||||||
|
|
||||||
def add_css_styles_to_html_soup(self):
|
def add_css_styles_to_html_soup(self):
|
||||||
"""
|
"""
|
||||||
This function is designed to update html_href2html_body_soup
|
This function is designed to update html_href2html_body_soup
|
||||||
@@ -203,7 +232,7 @@ class EpubConverter:
|
|||||||
for css_href in self.html_href2css_href[html_href]:
|
for css_href in self.html_href2css_href[html_href]:
|
||||||
css += self.css_href2css_content[css_href]
|
css += self.css_href2css_content[css_href]
|
||||||
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
|
||||||
html_content = convert_html_soup_with_css_style(html_content, css)
|
html_content = self.convert_html_soup_with_css_style(html_content, css)
|
||||||
self.html_href2html_body_soup[html_href] = html_content
|
self.html_href2html_body_soup[html_href] = html_content
|
||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||||
@@ -488,6 +517,48 @@ class EpubConverter:
|
|||||||
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
|
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
|
||||||
f" Old id={a_tag_id}")
|
f" Old id={a_tag_id}")
|
||||||
|
|
||||||
|
@staticmethod
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    Function extracts from html_soup every node that belongs to the chapter
    which starts at the chapter mark with id == first_id.

    The chapter consists of all following siblings of the mark, up to (but
    not including) the next sibling tag whose class is
    "converter-chapter-mark", or up to the last sibling if there is none.

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: Tag
        Soup object of current file; the chapter nodes are removed from it

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags, already detached from html_soup

    Raises
    ------
    AssertionError
        If no chapter mark with the given id exists in html_soup

    """
    chapter_mark_class = "converter-chapter-mark"
    marked_tag = html_soup.find(
        attrs={"id": first_id, "class": chapter_mark_class})

    if not marked_tag:
        # raised explicitly instead of `assert 0`: asserts are stripped under
        # `python -O`, which would let execution fall through to an unbound
        # `tags` and fail with a misleading UnboundLocalError
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    next_tag = marked_tag.next_sibling
    while next_tag:
        if not isinstance(next_tag, NavigableString):
            tag_class = next_tag.attrs.get("class")
            # "class" is a plain string when the attribute was assigned as a
            # string, but a list when the markup went through a parser;
            # accept both, the same way the find() call above does
            if tag_class == chapter_mark_class or \
                    (isinstance(tag_class, list) and chapter_mark_class in tag_class):
                break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    # merge the adjacent strings left behind by the extraction
    html_soup.smooth()

    return tags
|
||||||
|
|
||||||
def detect_one_chapter(self, nav_point: NavPoint):
|
def detect_one_chapter(self, nav_point: NavPoint):
|
||||||
"""
|
"""
|
||||||
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||||
@@ -511,11 +582,11 @@ class EpubConverter:
|
|||||||
"""
|
"""
|
||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
soup = self.html_href2html_body_soup[nav_point.href]
|
soup = self.html_href2html_body_soup[nav_point.href]
|
||||||
chapter_tags = get_tags_between_chapter_marks(
|
subchapter_tags = self.get_tags_between_chapter_marks(
|
||||||
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
||||||
new_tree = BeautifulSoup("", "html.parser")
|
new_tree = BeautifulSoup("", "html.parser")
|
||||||
for tag in chapter_tags:
|
for subchapter_tag in subchapter_tags:
|
||||||
new_tree.append(tag)
|
new_tree.append(subchapter_tag)
|
||||||
self.href_chapter_id2soup_html[(
|
self.href_chapter_id2soup_html[(
|
||||||
nav_point.href, nav_point.id)] = new_tree
|
nav_point.href, nav_point.id)] = new_tree
|
||||||
|
|
||||||
@@ -527,8 +598,8 @@ class EpubConverter:
|
|||||||
"""Function build chapters content, starts from top level chapters"""
|
"""Function build chapters content, starts from top level chapters"""
|
||||||
top_level_nav_points = self.adjacency_list[-1]
|
top_level_nav_points = self.adjacency_list[-1]
|
||||||
if self.id_anchor_exist_in_nav_points:
|
if self.id_anchor_exist_in_nav_points:
|
||||||
for point in top_level_nav_points:
|
for tl_nav_point in top_level_nav_points:
|
||||||
self.detect_one_chapter(point)
|
self.detect_one_chapter(tl_nav_point)
|
||||||
|
|
||||||
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||||
"""
|
"""
|
||||||
@@ -561,9 +632,9 @@ class EpubConverter:
|
|||||||
if hasattr(self.file_path, "stem") else "book_id")
|
if hasattr(self.file_path, "stem") else "book_id")
|
||||||
|
|
||||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
title_preprocessed = prepare_title(title)
|
title_preprocessed = self.html_preprocessor.prepare_title(title)
|
||||||
content_preprocessed = prepare_content(title_preprocessed, content,
|
content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content,
|
||||||
remove_title_from_chapter=is_chapter)
|
remove_title_from_chapter=is_chapter)
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
# warning! not EpubHtmlItems won't be added to chapter
|
# warning! not EpubHtmlItems won't be added to chapter
|
||||||
# if it doesn't have subchapters
|
# if it doesn't have subchapters
|
||||||
@@ -598,11 +669,17 @@ class EpubConverter:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
epub_file_path = "../../epub/9781641050234.epub"
|
epub_file_path = "../../epub/Modern_Java_in_Action.epub"
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||||
|
|
||||||
json_converter = EpubConverter(epub_file_path, logger=logger_object)
|
preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\
|
||||||
|
.get_preset_json()
|
||||||
|
css_preprocessor = CSSPreprocessor(logger=logger_object)
|
||||||
|
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object)
|
||||||
|
|
||||||
|
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||||
|
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
from src.book_solver import BookSolver
|
from src.book_solver import BookSolver
|
||||||
|
from src.preset_processor import PresetProcessor
|
||||||
|
from src.epub_converter.css_preprocessor import CSSPreprocessor
|
||||||
|
from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor
|
||||||
from src.epub_converter.epub_converter import EpubConverter
|
from src.epub_converter.epub_converter import EpubConverter
|
||||||
|
|
||||||
|
|
||||||
@@ -14,8 +17,10 @@ class EpubBook(BookSolver):
|
|||||||
Function
|
Function
|
||||||
Steps
|
Steps
|
||||||
----------
|
----------
|
||||||
1. Converts .epub to .html
|
1. Gets data from preset structure
|
||||||
2. Parses from line structure to nested structure
|
2. Add preset to html preprocessor
|
||||||
|
3. Converts .epub to .html
|
||||||
|
4. Parses from line structure to nested structure
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
----------
|
----------
|
||||||
@@ -23,7 +28,12 @@ class EpubBook(BookSolver):
|
|||||||
json for LiveCarta platform
|
json for LiveCarta platform
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\
|
||||||
|
.get_preset_json()
|
||||||
|
css_preprocessor = CSSPreprocessor(logger=self.logger_object)
|
||||||
|
html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object)
|
||||||
json_converter = EpubConverter(
|
json_converter = EpubConverter(
|
||||||
self.file_path, access=self.access, logger=self.logger_object)
|
self.file_path, access=self.access, logger=self.logger_object,
|
||||||
|
css_preprocessor=css_preprocessor, html_processor=html_preprocessor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|||||||
@@ -1,419 +1,398 @@
|
|||||||
import re
|
import re
|
||||||
|
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
from src.util.helpers import BookLogger
|
||||||
|
|
||||||
from src.livecarta_config import LiveCartaConfig
|
|
||||||
|
|
||||||
|
|
||||||
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
|
class HtmlEpubPreprocessor:
|
||||||
"""
|
def __init__(self, preset, logger=None):
|
||||||
Function adds span with id from tag_to_be_removed
|
self.preset = preset
|
||||||
because this tag will be removed(unwrapped/extract)
|
self.logger: BookLogger = logger
|
||||||
Parameters
|
self.name2function = {
|
||||||
----------
|
"table_wrapper": self._wrap_tags_with_table,
|
||||||
tag_to_be_removed: Soup object
|
"replacer": self._tags_to_correspond_livecarta_tag,
|
||||||
chapter_tag: BeautifulSoup
|
"unwrapper": self._unwrap_tags,
|
||||||
|
"inserter": self._insert_tags_into_correspond_tags
|
||||||
|
}
|
||||||
|
|
||||||
Returns
|
@staticmethod
|
||||||
-------
|
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
|
||||||
None
|
"""
|
||||||
updated body tag
|
Function adds span with id from tag_to_be_removed
|
||||||
|
because this tag will be removed(unwrapped/extract)
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
tag_to_be_removed: Soup object
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
|
||||||
"""
|
Returns
|
||||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
|
-------
|
||||||
"""Function inserts span before tag aren't supported by LiveCarta"""
|
None
|
||||||
new_tag = chapter_tag.new_tag("span")
|
updated body tag
|
||||||
new_tag.attrs["id"] = id_ or ""
|
|
||||||
new_tag.attrs["class"] = class_ or ""
|
|
||||||
new_tag.string = "\xa0"
|
|
||||||
tag_to_be_removed.insert_before(new_tag)
|
|
||||||
|
|
||||||
if tag_to_be_removed.attrs.get("id"):
|
"""
|
||||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
|
|
||||||
id_=tag_to_be_removed.attrs["id"],
|
|
||||||
class_=tag_to_be_removed.attrs.get("class"))
|
|
||||||
|
|
||||||
|
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
|
||||||
|
class_: list):
|
||||||
|
"""Function inserts span before tag aren't supported by LiveCarta"""
|
||||||
|
new_tag = chapter_tag.new_tag("span")
|
||||||
|
new_tag.attrs["id"] = id_ or ""
|
||||||
|
new_tag.attrs["class"] = class_ or ""
|
||||||
|
new_tag.string = "\xa0"
|
||||||
|
tag_to_be_removed.insert_before(new_tag)
|
||||||
|
|
||||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
if tag_to_be_removed.attrs.get("id"):
|
||||||
"""
|
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
|
||||||
After processing on a first_id that corresponds to current chapter,
|
id_=tag_to_be_removed.attrs["id"],
|
||||||
from initial html_soup all tags from current chapter are extracted
|
class_=tag_to_be_removed.attrs.get("class"))
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
first_id: str
|
|
||||||
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
|
|
||||||
href: str
|
|
||||||
Name of current chapters file
|
|
||||||
html_soup: Tag
|
|
||||||
Soup object of current file
|
|
||||||
|
|
||||||
Returns
|
@staticmethod
|
||||||
-------
|
def prepare_title(title_of_chapter: str) -> str:
|
||||||
tags: list [Tag, NavigableString]
|
"""
|
||||||
Chapter's tags
|
Function finalise processing/cleaning title
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
title_of_chapter: str
|
||||||
|
|
||||||
"""
|
Returns
|
||||||
marked_tags = html_soup.find(
|
-------
|
||||||
attrs={"id": first_id, "class": "converter-chapter-mark"})
|
title: str
|
||||||
if marked_tags:
|
cleaned title
|
||||||
next_tag = marked_tags.next_sibling
|
|
||||||
tags = []
|
|
||||||
while next_tag:
|
|
||||||
if not isinstance(next_tag, NavigableString) and \
|
|
||||||
(next_tag.attrs.get("class") == "converter-chapter-mark"):
|
|
||||||
break
|
|
||||||
tags.append(next_tag)
|
|
||||||
next_tag = next_tag.next_sibling
|
|
||||||
|
|
||||||
# remove tags between first_id and next found id
|
"""
|
||||||
# save them in list for next steps
|
title = BeautifulSoup(title_of_chapter, features="lxml").string
|
||||||
tags = [tag.extract() for tag in tags]
|
# clean extra whitespace characters ([\r\n\t\f\v ])
|
||||||
html_soup.smooth()
|
title = re.sub(r"[\s\xa0]", " ", title).strip()
|
||||||
|
return title
|
||||||
|
|
||||||
else:
|
@staticmethod
|
||||||
assert 0, f"Warning: no match for {first_id, href}"
|
def _remove_comments(chapter_tag):
|
||||||
|
"""
|
||||||
|
Function remove comments
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
return tags
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
Chapter Tag without comments
|
||||||
|
|
||||||
|
"""
|
||||||
|
for tag in chapter_tag.find_all():
|
||||||
|
for element in tag(text=lambda text: isinstance(text, Comment)):
|
||||||
|
element.extract()
|
||||||
|
|
||||||
def prepare_title(title_of_chapter: str) -> str:
|
@staticmethod
|
||||||
"""
|
def _wrap_strings_with_p(chapter_tag):
|
||||||
Function finalise processing/cleaning title
|
"""
|
||||||
Parameters
|
Function converts headings that aren't supported by LiveCarta with <p>
|
||||||
----------
|
Parameters
|
||||||
title_of_chapter: str
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
title: str
|
None
|
||||||
cleaned title
|
Chapter Tag with wrapped NavigableStrings
|
||||||
|
|
||||||
"""
|
"""
|
||||||
title = BeautifulSoup(title_of_chapter, features="lxml").string
|
for node in chapter_tag:
|
||||||
# clean extra whitespace characters ([\r\n\t\f\v ])
|
if isinstance(node, NavigableString):
|
||||||
title = re.sub(r"[\s\xa0]", " ", title).strip()
|
content = str(node)
|
||||||
return title
|
content = re.sub(r"([\s\xa0])", " ", content).strip()
|
||||||
|
if content:
|
||||||
|
p_tag = chapter_tag.new_tag("p")
|
||||||
|
p_tag.append(str(node))
|
||||||
|
node.replace_with(p_tag)
|
||||||
|
|
||||||
|
def _wrap_tags_with_table(self, chapter_tag, rules: list):
|
||||||
|
"""
|
||||||
|
Function wraps <tag> with <table>
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
def _remove_comments(chapter_tag):
|
Returns
|
||||||
"""
|
-------
|
||||||
Function remove comments
|
None
|
||||||
Parameters
|
Chapter Tag with wrapped certain tags with <table>
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
"""
|
||||||
-------
|
|
||||||
None
|
|
||||||
Chapter Tag without comments
|
|
||||||
|
|
||||||
"""
|
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
|
||||||
for tag in chapter_tag.find_all():
|
table = chapter_tag.new_tag("table")
|
||||||
for element in tag(text=lambda text: isinstance(text, Comment)):
|
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
||||||
element.extract()
|
= border, "center", f"width:{width}%;"
|
||||||
|
tbody, tr, td = \
|
||||||
|
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||||
|
td.attrs["bgcolor"] = bg_color
|
||||||
|
tag_to_be_wrapped.wrap(td)
|
||||||
|
td.wrap(tr)
|
||||||
|
tr.wrap(tbody)
|
||||||
|
tbody.wrap(table)
|
||||||
|
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
||||||
|
return table
|
||||||
|
|
||||||
|
def process_tag_using_table(tag_to_wrap):
|
||||||
|
_wrap_tag_with_table(
|
||||||
|
chapter_tag,
|
||||||
|
tag_to_be_wrapped=tag_to_wrap,
|
||||||
|
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
|
||||||
|
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
|
||||||
|
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
||||||
|
self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
||||||
|
tag_to_wrap.unwrap()
|
||||||
|
|
||||||
def _wrap_strings_with_p(chapter_tag):
|
for rule in rules:
|
||||||
"""
|
tags = rule["tags"]
|
||||||
Function converts headings that aren't supported by LiveCarta with <p>
|
for attr in rule["attrs"]:
|
||||||
Parameters
|
for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
----------
|
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
None
|
|
||||||
Chapter Tag with wrapped NavigableStrings
|
|
||||||
|
|
||||||
"""
|
|
||||||
for node in chapter_tag:
|
|
||||||
if isinstance(node, NavigableString):
|
|
||||||
content = str(node)
|
|
||||||
content = re.sub(r"([\s\xa0])", " ", content).strip()
|
|
||||||
if content:
|
|
||||||
p_tag = chapter_tag.new_tag("p")
|
|
||||||
p_tag.append(str(node))
|
|
||||||
node.replace_with(p_tag)
|
|
||||||
|
|
||||||
|
|
||||||
def _wrap_tags_with_table(chapter_tag):
|
|
||||||
"""
|
|
||||||
Function wraps <tag> with <table>
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
None
|
|
||||||
Chapter Tag with wrapped certain tags with <table>
|
|
||||||
|
|
||||||
"""
|
|
||||||
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
|
|
||||||
table = chapter_tag.new_tag("table")
|
|
||||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
|
||||||
= border, "center", f"width:{width}%;"
|
|
||||||
tbody, tr, td = \
|
|
||||||
chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
|
||||||
td.attrs["bgcolor"] = bg_color
|
|
||||||
tag_to_be_wrapped.wrap(td)
|
|
||||||
td.wrap(tr)
|
|
||||||
tr.wrap(tbody)
|
|
||||||
tbody.wrap(table)
|
|
||||||
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
|
||||||
return table
|
|
||||||
|
|
||||||
def process_tag_using_table(tag_to_wrap):
|
|
||||||
_wrap_tag_with_table(
|
|
||||||
chapter_tag,
|
|
||||||
tag_to_be_wrapped=tag_to_wrap,
|
|
||||||
width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
|
|
||||||
border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
|
|
||||||
bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
|
|
||||||
_add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
|
|
||||||
tag_to_wrap.unwrap()
|
|
||||||
|
|
||||||
for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
|
|
||||||
if isinstance(attrs, tuple):
|
|
||||||
attr, val = attrs[0], attrs[1]
|
|
||||||
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}):
|
|
||||||
process_tag_using_table(tag_to_wrap)
|
|
||||||
else:
|
|
||||||
for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
|
|
||||||
if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
|
|
||||||
process_tag_using_table(tag_to_wrap)
|
process_tag_using_table(tag_to_wrap)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list):
|
||||||
|
"""
|
||||||
|
Function to replace all tags to correspond LiveCarta tags
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
def _tags_to_correspond_livecarta_tag(chapter_tag):
|
Returns
|
||||||
"""
|
-------
|
||||||
Function to replace all tags to correspond LiveCarta tags
|
None
|
||||||
Parameters
|
Chapter Tag with all tags replaced with LiveCarta tags
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
"""
|
||||||
-------
|
for rule in rules:
|
||||||
None
|
tags = rule["tags"]
|
||||||
Chapter Tag with all tags replaced with LiveCarta tags
|
tag_to_replace = rule["tag_to_replace"]
|
||||||
|
if rule["condition"]:
|
||||||
"""
|
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||||
for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items():
|
if condition_on_tag[0] == 'parent_tags':
|
||||||
for key in reg_keys:
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
if isinstance(key, tuple):
|
if tag.parent.select(condition_on_tag[1]):
|
||||||
replace = key[0]
|
tag.name = tag_to_replace
|
||||||
parent, child = key[1], key[2]
|
elif condition_on_tag[0] == 'child_tags':
|
||||||
for parent_tag in chapter_tag.select(parent):
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
if replace == "parent":
|
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
|
||||||
parent_tag.name = to_replace_value
|
tag.name = tag_to_replace
|
||||||
elif replace == "child":
|
elif condition_on_tag[0] == "attrs":
|
||||||
for child_tag in parent_tag.select(child):
|
for attr in rule["condition"]["attrs"]:
|
||||||
child_tag.name = to_replace_value
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
if not child_tag.attrs.get("style"):
|
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||||
child_tag.attrs["style"] =\
|
tag.name = tag_to_replace
|
||||||
"font-size: 14px; font-family: courier new,courier,monospace;"
|
|
||||||
else:
|
else:
|
||||||
tags = chapter_tag.find_all(re.compile(key))
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
for tag in tags:
|
|
||||||
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
|
# todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
|
||||||
tag.name = to_replace_value
|
tag.name = tag_to_replace
|
||||||
|
|
||||||
|
def _unwrap_tags(self, chapter_tag, rules: dict):
|
||||||
|
"""
|
||||||
|
Function unwrap tags and moves id to span
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
def _unwrap_tags(chapter_tag):
|
Returns
|
||||||
"""
|
-------
|
||||||
Function unwrap tags and moves id to span
|
None
|
||||||
Parameters
|
Chapter Tag with unwrapped certain tags
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
"""
|
||||||
-------
|
for tag_name in rules["tags"]:
|
||||||
None
|
for tag in chapter_tag.select(tag_name):
|
||||||
Chapter Tag with unwrapped certain tags
|
# if tag is a subtag
|
||||||
|
if ">" in tag_name:
|
||||||
|
tag.parent.attrs.update(tag.attrs)
|
||||||
|
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
||||||
|
tag.unwrap()
|
||||||
|
|
||||||
"""
|
@staticmethod
|
||||||
for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
|
def _insert_tags_into_correspond_tags(chapter_tag, rules: list):
|
||||||
for tag in chapter_tag.select(tag_name):
|
"""
|
||||||
# if tag is a subtag
|
Function inserts tags into correspond tags
|
||||||
if ">" in tag_name:
|
Parameters
|
||||||
tag.parent.attrs.update(tag.attrs)
|
----------
|
||||||
_add_span_to_save_ids_for_links(tag, chapter_tag)
|
chapter_tag: BeautifulSoup
|
||||||
tag.unwrap()
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
Chapter Tag with inserted tags
|
||||||
|
|
||||||
def _remove_headings_content(content_tag, title_of_chapter: str):
|
"""
|
||||||
"""
|
|
||||||
Function
|
|
||||||
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
|
||||||
- adds span with id in order to
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
content_tag: soup object
|
|
||||||
Tag of the page
|
|
||||||
title_of_chapter: str
|
|
||||||
Chapter title
|
|
||||||
|
|
||||||
Returns
|
def insert(tag, tag_to_insert):
|
||||||
-------
|
# insert all items that was in tag to subtag and remove from tag
|
||||||
None
|
for content in reversed(tag.contents):
|
||||||
clean/remove headings & add span with id
|
tag_to_insert.insert(0, content.extract())
|
||||||
|
# wrap subtag with items
|
||||||
|
tag.append(tag_to_insert)
|
||||||
|
|
||||||
"""
|
for rule in rules:
|
||||||
title_of_chapter = title_of_chapter.lower()
|
tags = rule["tags"]
|
||||||
for tag in content_tag.contents:
|
tag_to_insert = \
|
||||||
text = tag if isinstance(tag, NavigableString) else tag.text
|
chapter_tag.new_tag(rule["tag_to_insert"])
|
||||||
if re.sub(r"[\s\xa0]", "", text):
|
if rule["condition"]:
|
||||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
||||||
text = text.strip() # delete extra spaces
|
if condition_on_tag[0] == 'parent_tags':
|
||||||
if title_of_chapter == text or \
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
(title_of_chapter in text and
|
if tag.parent.select(condition_on_tag[1]):
|
||||||
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
|
insert(tag, tag_to_insert)
|
||||||
_add_span_to_save_ids_for_links(tag, content_tag)
|
elif condition_on_tag[0] == 'child_tags':
|
||||||
tag.extract()
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
return
|
if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
|
||||||
elif not isinstance(tag, NavigableString):
|
insert(tag, tag_to_insert)
|
||||||
if not _remove_headings_content(tag, title_of_chapter):
|
elif condition_on_tag[0] == "attrs":
|
||||||
break
|
for attr in rule["condition"]["attrs"]:
|
||||||
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
|
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||||
def _process_table(chapter_tag: BeautifulSoup):
|
insert(tag, tag_to_insert)
|
||||||
"""
|
|
||||||
Function preprocesses tables and tags(td|th|tr)
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
None
|
|
||||||
Chapter Tag with processed tables
|
|
||||||
|
|
||||||
"""
|
|
||||||
tables = chapter_tag.find_all("table")
|
|
||||||
for table in tables:
|
|
||||||
for t_tag in table.find_all(re.compile("td|th|tr")):
|
|
||||||
width = ""
|
|
||||||
if t_tag.get("style"):
|
|
||||||
width_match = re.search(
|
|
||||||
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
|
|
||||||
if width_match:
|
|
||||||
size = width_match.group(1)
|
|
||||||
width = size + "px"
|
|
||||||
|
|
||||||
t_tag.attrs["width"] = t_tag.get("width") or width
|
|
||||||
|
|
||||||
if t_tag.attrs.get("style"):
|
|
||||||
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
|
|
||||||
"border:0;", "")
|
|
||||||
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
|
|
||||||
del t_tag.attrs["style"]
|
|
||||||
|
|
||||||
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
|
|
||||||
table.attrs["border"] = "1"
|
|
||||||
|
|
||||||
|
|
||||||
def _insert_tags_in_parents(chapter_tag):
|
|
||||||
"""
|
|
||||||
Function inserts tags into correspond tags
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
None
|
|
||||||
Chapter Tag with inserted tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
|
|
||||||
for parent_tag_name, condition in parent_tag2condition.items():
|
|
||||||
for parent_tag in chapter_tag.select(parent_tag_name):
|
|
||||||
if parent_tag.select(condition):
|
|
||||||
continue
|
|
||||||
else:
|
else:
|
||||||
tag_to_insert = chapter_tag.new_tag(
|
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
|
insert(tag, tag_to_insert)
|
||||||
# insert all items that was in pre to code and remove from pre
|
|
||||||
for content in reversed(parent_tag.contents):
|
|
||||||
tag_to_insert.insert(0, content.extract())
|
|
||||||
# wrap code with items
|
|
||||||
parent_tag.append(tag_to_insert)
|
|
||||||
|
|
||||||
|
def _remove_headings_content(self, content_tag, title_of_chapter: str):
|
||||||
|
"""
|
||||||
|
Function
|
||||||
|
- cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
|
||||||
|
- adds span with id in order to
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
content_tag: soup object
|
||||||
|
Tag of the page
|
||||||
|
title_of_chapter: str
|
||||||
|
Chapter title
|
||||||
|
|
||||||
def _class_removing(chapter_tag):
|
Returns
|
||||||
"""
|
-------
|
||||||
Function removes classes that aren't created by converter
|
None
|
||||||
Parameters
|
clean/remove headings & add span with id
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
"""
|
||||||
-------
|
title_of_chapter = title_of_chapter.lower()
|
||||||
None
|
for tag in content_tag.contents:
|
||||||
Chapter Tag without original classes of the book
|
text = tag if isinstance(tag, NavigableString) else tag.text
|
||||||
|
if re.sub(r"[\s\xa0]", "", text):
|
||||||
|
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||||
|
text = text.strip() # delete extra spaces
|
||||||
|
if title_of_chapter == text or \
|
||||||
|
(title_of_chapter in text and
|
||||||
|
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):
|
||||||
|
self._add_span_to_save_ids_for_links(tag, content_tag)
|
||||||
|
tag.extract()
|
||||||
|
return
|
||||||
|
elif not isinstance(tag, NavigableString):
|
||||||
|
if not self._remove_headings_content(tag, title_of_chapter):
|
||||||
|
break
|
||||||
|
|
||||||
"""
|
@staticmethod
|
||||||
for tag in chapter_tag.find_all(recursive=True):
|
def _process_tables(chapter_tag: BeautifulSoup):
|
||||||
if tag.attrs.get("class") \
|
"""
|
||||||
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
Function preprocesses tables and tags(td|th|tr)
|
||||||
del tag.attrs["class"]
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
Chapter Tag with processed tables
|
||||||
|
|
||||||
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
"""
|
||||||
"""
|
tables = chapter_tag.find_all("table")
|
||||||
Function finalise processing/cleaning content
|
for table in tables:
|
||||||
Parameters
|
for t_tag in table.find_all(re.compile("td|th|tr")):
|
||||||
----------
|
width = ""
|
||||||
title_str: str
|
if t_tag.get("style"):
|
||||||
|
width_match = re.search(
|
||||||
|
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
|
||||||
|
if width_match:
|
||||||
|
size = width_match.group(1)
|
||||||
|
width = size + "px"
|
||||||
|
|
||||||
content_tag: Tag, soup object
|
t_tag.attrs["width"] = t_tag.get("width") or width
|
||||||
|
|
||||||
remove_title_from_chapter: bool
|
if t_tag.attrs.get("style"):
|
||||||
|
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
|
||||||
|
"border:0;", "")
|
||||||
|
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
|
||||||
|
del t_tag.attrs["style"]
|
||||||
|
|
||||||
Steps
|
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
|
||||||
----------
|
table.attrs["border"] = "1"
|
||||||
1. comments removal
|
|
||||||
2. wrap NavigableString with tag <p>
|
|
||||||
3. wrap tags with <table>
|
|
||||||
4. replace tags with correspond LiveCarta tags
|
|
||||||
5. unwrap tags
|
|
||||||
6. heading removal
|
|
||||||
7. process_table
|
|
||||||
8. insert tags into correspond tags
|
|
||||||
9. class removal
|
|
||||||
|
|
||||||
Returns
|
@staticmethod
|
||||||
-------
|
def _class_removing(chapter_tag):
|
||||||
content_tag: str
|
"""
|
||||||
prepared content
|
Function removes classes that aren't created by converter
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: BeautifulSoup
|
||||||
|
Tag & contents of the chapter tag
|
||||||
|
|
||||||
"""
|
Returns
|
||||||
# 1. remove comments
|
-------
|
||||||
_remove_comments(content_tag)
|
None
|
||||||
|
Chapter Tag without original classes of the book
|
||||||
|
|
||||||
# 2.
|
"""
|
||||||
_wrap_strings_with_p(content_tag)
|
for tag in chapter_tag.find_all(recursive=True):
|
||||||
# 3.
|
if tag.attrs.get("class") \
|
||||||
_wrap_tags_with_table(content_tag)
|
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
|
||||||
# 4.
|
del tag.attrs["class"]
|
||||||
_tags_to_correspond_livecarta_tag(content_tag)
|
|
||||||
# 5.
|
|
||||||
_unwrap_tags(content_tag)
|
|
||||||
# 6.
|
|
||||||
if remove_title_from_chapter:
|
|
||||||
_remove_headings_content(content_tag, title_str)
|
|
||||||
# 7.
|
|
||||||
_process_table(content_tag)
|
|
||||||
# 8.
|
|
||||||
_insert_tags_in_parents(content_tag)
|
|
||||||
|
|
||||||
# 9. remove classes that weren't created by converter
|
def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
||||||
_class_removing(content_tag)
|
"""
|
||||||
return str(content_tag)
|
Function finalise processing/cleaning content
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
title_str: str
|
||||||
|
|
||||||
|
content_tag: Tag, soup object
|
||||||
|
|
||||||
|
remove_title_from_chapter: bool
|
||||||
|
|
||||||
|
Steps
|
||||||
|
----------
|
||||||
|
1. comments removal
|
||||||
|
2. wrap NavigableString with tag <p>
|
||||||
|
3-6. wrap tags with <table>
|
||||||
|
replace tags with correspond LiveCarta tags
|
||||||
|
unwrap tags
|
||||||
|
insert tags into correspond tags
|
||||||
|
7. heading removal
|
||||||
|
8. process_tables
|
||||||
|
9. class removal
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
content_tag: str
|
||||||
|
prepared content
|
||||||
|
|
||||||
|
"""
|
||||||
|
# 1. remove comments
|
||||||
|
self._remove_comments(content_tag)
|
||||||
|
# 2.
|
||||||
|
self._wrap_strings_with_p(content_tag)
|
||||||
|
# 3-6.
|
||||||
|
for dict in self.preset:
|
||||||
|
func = self.name2function[dict["preset_name"]]
|
||||||
|
func(content_tag, dict['rules'])
|
||||||
|
# 7.
|
||||||
|
if remove_title_from_chapter:
|
||||||
|
self._remove_headings_content(content_tag, title_str)
|
||||||
|
# 8.
|
||||||
|
self._process_tables(content_tag)
|
||||||
|
# 9. remove classes that weren't created by converter
|
||||||
|
self._class_removing(content_tag)
|
||||||
|
return str(content_tag)
|
||||||
|
|||||||
15
src/preset_processor.py
Normal file
15
src/preset_processor.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
from src.util.helpers import BookLogger
|
||||||
|
|
||||||
|
|
||||||
|
class PresetProcessor:
    """
    Class reads presets from a JSON file
    Parameters
    ----------
    preset_path: str
        path to the JSON file with presets
    logger: BookLogger
        logger kept on the instance; the loader itself does not write to it

    """

    def __init__(self, preset_path="config/presets.json", logger=None):
        self.preset_path = preset_path
        self.logger: BookLogger = logger

    def get_preset_json(self):
        """
        Function reads the preset file and parses its content as JSON
        Returns
        -------
        data
            the object produced by json.load for the file content

        Raises
        ------
        OSError
            if the preset file cannot be opened
        json.JSONDecodeError
            if the file content is not valid JSON

        """
        # The context manager closes the handle even when parsing fails;
        # the encoding is explicit so the result does not depend on the locale.
        with open(self.preset_path, encoding="utf-8") as preset_file:
            return json.load(preset_file)
|
||||||
Reference in New Issue
Block a user