diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py index 923a274..e593312 100644 --- a/src/docx_converter/image_processing.py +++ b/src/docx_converter/image_processing.py @@ -1,5 +1,4 @@ import os -import logging import pathlib from shutil import copyfile diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 1ecc7a1..525fad3 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -4,33 +4,34 @@ import codecs import os from os.path import dirname, normpath, join from itertools import chain +from premailer import transform from collections import defaultdict from typing import Dict, Union, List - import ebooklib from ebooklib import epub from ebooklib.epub import Link, Section -from bs4 import BeautifulSoup, Tag - +from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger +from src.preset_processor import PresetProcessor +from src.epub_converter.css_preprocessor import CSSPreprocessor +from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.footnotes_processing import preprocess_footnotes -from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content -from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\ - prepare_title, prepare_content +from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor class EpubConverter: - def __init__(self, file_path, access=None, logger=None): + def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None): self.file_path = file_path self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file_path) + self.css_processor = css_preprocessor + self.html_preprocessor = html_processor # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} @@ -74,25 +75,15 @@ class EpubConverter: self.process_inline_styles_in_html_soup() self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log("CSS styles adding.") + self.logger.log("CSS styles adding.") self.add_css_styles_to_html_soup() - # todo presets - self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: - content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], - self.html_href2html_body_soup) - self.footnotes_contents.extend(content) - self.noterefs.extend(noterefs) - self.footnotes.extend(footnotes_tags) - - for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): - noteref.attrs["data-id"] = i + 1 - noteref.attrs["id"] = f"footnote-{i + 1}" - footnote.attrs["href"] = f"#footnote-{i + 1}" - + self.footnotes_contents, self.noterefs, self.footnotes =\ + preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") + self.logger.log("TOC processing.") self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed @@ -101,6 +92,7 @@ class EpubConverter: not_added = [ x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] self.logger.log(f"Html documents not added to TOC: {not_added}.") + self.logger.log(f"Add documents not added to TOC.") self.add_not_added_files_to_adjacency_list(not_added) self.logger.log(f"Html internal links and structure processing.") self.label_chapters_ids_with_lc_id() @@ -149,7 +141,7 @@ class EpubConverter: for tag_initial_inline_style in tags_with_inline_style: inline_style = tag_initial_inline_style.attrs["style"] tag_initial_inline_style.attrs["style"] = \ - build_inline_style_content(inline_style) + self.css_processor.build_inline_style_content(inline_style) def build_html_and_css_relations(self) -> tuple[dict, dict]: """ @@ -181,16 +173,53 @@ class EpubConverter: html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict - css_href2css_content[css_href] = build_css_file_content( + css_href2css_content[css_href] = self.css_processor.build_css_file_content( self.get_css_content(css_href, html_href)) for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string html_href2css_href[html_href].append(f"href{i}") - css_href2css_content[f"href{i}"] = build_css_file_content( + css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content( css_content) return html_href2css_href, css_href2css_content + def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: + """ + Function adds styles from .css to inline style. + Parameters + ---------- + html_soup: BeautifulSoup + html page with inline style + css_text: str + css content from css file + Returns + ------- + inline_soup: BeautifulSoup + soup with styles from css + + """ + # remove this specification because it causes problems + css_text = css_text.replace( + '@namespace epub "http://www.idpf.org/2007/ops";', '') + # here we add css styles to inline style + html_with_css_styles: str = transform(str(html_soup), css_text=css_text, + remove_classes=False, + external_styles=False, + allow_network=False, + disable_validation=True, + ) + # soup with converted styles from css + inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") + + tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + # go through the tags with inline style + style parsed from css file + for tag_inline_style in tags_with_inline_style: + style_converter = TagInlineStyleProcessor(tag_inline_style) + style_converter.convert_initial_tag() + return inline_soup + def add_css_styles_to_html_soup(self): """ This function is designed to update html_href2html_body_soup @@ -203,7 +232,7 @@ class EpubConverter: for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - html_content = convert_html_soup_with_css_style(html_content, css) + html_content = self.convert_html_soup_with_css_style(html_content, css) self.html_href2html_body_soup[html_href] = html_content def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): @@ -488,6 +517,48 @@ class EpubConverter: f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." f" Old id={a_tag_id}") + @staticmethod + def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: + """ + After processing on a first_id that corresponds to current chapter, + from initial html_soup all tags from current chapter are extracted + Parameters + ---------- + first_id: str + Id that point where a chapter starts. A Tag with class: "converter-chapter-mark" + href: str + Name of current chapters file + html_soup: Tag + Soup object of current file + + Returns + ------- + tags: list [Tag, NavigableString] + Chapter's tags + + """ + marked_tags = html_soup.find( + attrs={"id": first_id, "class": "converter-chapter-mark"}) + if marked_tags: + next_tag = marked_tags.next_sibling + tags = [] + while next_tag: + if not isinstance(next_tag, NavigableString) and \ + (next_tag.attrs.get("class") == "converter-chapter-mark"): + break + tags.append(next_tag) + next_tag = next_tag.next_sibling + + # remove tags between first_id and next found id + # save them in list for next steps + tags = [tag.extract() for tag in tags] + html_soup.smooth() + + else: + assert 0, f"Warning: no match for {first_id, href}" + + return tags + def detect_one_chapter(self, nav_point: NavPoint): """ Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) @@ -511,11 +582,11 @@ class EpubConverter: """ if nav_point.id: soup = self.html_href2html_body_soup[nav_point.href] - chapter_tags = get_tags_between_chapter_marks( + subchapter_tags = self.get_tags_between_chapter_marks( first_id=nav_point.id, href=nav_point.href, html_soup=soup) new_tree = BeautifulSoup("", "html.parser") - for tag in chapter_tags: - new_tree.append(tag) + for subchapter_tag in subchapter_tags: + new_tree.append(subchapter_tag) self.href_chapter_id2soup_html[( nav_point.href, nav_point.id)] = new_tree @@ -527,8 +598,8 @@ class EpubConverter: """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: - for point in top_level_nav_points: - self.detect_one_chapter(point) + for tl_nav_point in top_level_nav_points: + self.detect_one_chapter(tl_nav_point) def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: """ @@ -561,9 +632,9 @@ class EpubConverter: if hasattr(self.file_path, "stem") else "book_id") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS - title_preprocessed = prepare_title(title) - content_preprocessed = prepare_content(title_preprocessed, content, - remove_title_from_chapter=is_chapter) + title_preprocessed = self.html_preprocessor.prepare_title(title) + content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content, + remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -598,11 +669,17 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9781641050234.epub" + epub_file_path = "../../epub/Modern_Java_in_Action.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) - json_converter = EpubConverter(epub_file_path, logger=logger_object) + preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\ + .get_preset_json() + css_preprocessor = CSSPreprocessor(logger=logger_object) + html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object) + + json_converter = EpubConverter(epub_file_path, logger=logger_object, + css_preprocessor=css_preprocessor, html_processor=html_preprocessor) content_dict = json_converter.convert_to_dict() with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 8e92a40..c1bb800 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,4 +1,7 @@ from src.book_solver import BookSolver +from src.preset_processor import PresetProcessor +from src.epub_converter.css_preprocessor import CSSPreprocessor +from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor from src.epub_converter.epub_converter import EpubConverter @@ -14,8 +17,10 @@ class EpubBook(BookSolver): Function Steps ---------- - 1. Converts .epub to .html - 2. Parses from line structure to nested structure + 1. Gets data from preset structure + 2. Add preset to html preprocessor + 3. Converts .epub to .html + 4. Parses from line structure to nested structure Returns ---------- @@ -23,7 +28,12 @@ class EpubBook(BookSolver): json for LiveCarta platform """ + preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\ + .get_preset_json() + css_preprocessor = CSSPreprocessor(logger=self.logger_object) + html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) json_converter = EpubConverter( - self.file_path, access=self.access, logger=self.logger_object) + self.file_path, access=self.access, logger=self.logger_object, + css_preprocessor=css_preprocessor, html_processor=html_preprocessor) content_dict = json_converter.convert_to_dict() return content_dict diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 3f762b4..3ddc532 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -1,419 +1,398 @@ import re +from bs4 import BeautifulSoup, NavigableString, Comment, Tag -from bs4 import BeautifulSoup, NavigableString, Tag, Comment - -from src.livecarta_config import LiveCartaConfig +from src.util.helpers import BookLogger -def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): - """ - Function adds span with id from tag_to_be_removed - because this tag will be removed(unwrapped/extract) - Parameters - ---------- - tag_to_be_removed: Soup object - chapter_tag: BeautifulSoup +class HtmlEpubPreprocessor: + def __init__(self, preset, logger=None): + self.preset = preset + self.logger: BookLogger = logger + self.name2function = { + "table_wrapper": self._wrap_tags_with_table, + "replacer": self._tags_to_correspond_livecarta_tag, + "unwrapper": self._unwrap_tags, + "inserter": self._insert_tags_into_correspond_tags + } - Returns - ------- - None - updated body tag + @staticmethod + def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): + """ + Function adds span with id from tag_to_be_removed + because this tag will be removed(unwrapped/extract) + Parameters + ---------- + tag_to_be_removed: Soup object + chapter_tag: BeautifulSoup - """ - def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list): - """Function inserts span before tag aren't supported by LiveCarta""" - new_tag = chapter_tag.new_tag("span") - new_tag.attrs["id"] = id_ or "" - new_tag.attrs["class"] = class_ or "" - new_tag.string = "\xa0" - tag_to_be_removed.insert_before(new_tag) + Returns + ------- + None + updated body tag - if tag_to_be_removed.attrs.get("id"): - _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed, - id_=tag_to_be_removed.attrs["id"], - class_=tag_to_be_removed.attrs.get("class")) + """ + def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, + class_: list): + """Function inserts span before tag aren't supported by LiveCarta""" + new_tag = chapter_tag.new_tag("span") + new_tag.attrs["id"] = id_ or "" + new_tag.attrs["class"] = class_ or "" + new_tag.string = "\xa0" + tag_to_be_removed.insert_before(new_tag) -def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: - """ - After processing on a first_id that corresponds to current chapter, - from initial html_soup all tags from current chapter are extracted - Parameters - ---------- - first_id: str - Id that point where a chapter starts. A Tag with class: "converter-chapter-mark" - href: str - Name of current chapters file - html_soup: Tag - Soup object of current file + if tag_to_be_removed.attrs.get("id"): + _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed, + id_=tag_to_be_removed.attrs["id"], + class_=tag_to_be_removed.attrs.get("class")) - Returns - ------- - tags: list [Tag, NavigableString] - Chapter's tags + @staticmethod + def prepare_title(title_of_chapter: str) -> str: + """ + Function finalise processing/cleaning title + Parameters + ---------- + title_of_chapter: str - """ - marked_tags = html_soup.find( - attrs={"id": first_id, "class": "converter-chapter-mark"}) - if marked_tags: - next_tag = marked_tags.next_sibling - tags = [] - while next_tag: - if not isinstance(next_tag, NavigableString) and \ - (next_tag.attrs.get("class") == "converter-chapter-mark"): - break - tags.append(next_tag) - next_tag = next_tag.next_sibling + Returns + ------- + title: str + cleaned title - # remove tags between first_id and next found id - # save them in list for next steps - tags = [tag.extract() for tag in tags] - html_soup.smooth() + """ + title = BeautifulSoup(title_of_chapter, features="lxml").string + # clean extra whitespace characters ([\r\n\t\f\v ]) + title = re.sub(r"[\s\xa0]", " ", title).strip() + return title - else: - assert 0, f"Warning: no match for {first_id, href}" + @staticmethod + def _remove_comments(chapter_tag): + """ + Function remove comments + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag - return tags + Returns + ------- + None + Chapter Tag without comments + """ + for tag in chapter_tag.find_all(): + for element in tag(text=lambda text: isinstance(text, Comment)): + element.extract() -def prepare_title(title_of_chapter: str) -> str: - """ - Function finalise processing/cleaning title - Parameters - ---------- - title_of_chapter: str + @staticmethod + def _wrap_strings_with_p(chapter_tag): + """ + Function converts headings that aren't supported by LiveCarta with

+ Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag - Returns - ------- - title: str - cleaned title + Returns + ------- + None + Chapter Tag with wrapped NavigableStrings - """ - title = BeautifulSoup(title_of_chapter, features="lxml").string - # clean extra whitespace characters ([\r\n\t\f\v ]) - title = re.sub(r"[\s\xa0]", " ", title).strip() - return title + """ + for node in chapter_tag: + if isinstance(node, NavigableString): + content = str(node) + content = re.sub(r"([\s\xa0])", " ", content).strip() + if content: + p_tag = chapter_tag.new_tag("p") + p_tag.append(str(node)) + node.replace_with(p_tag) + def _wrap_tags_with_table(self, chapter_tag, rules: list): + """ + Function wraps with + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag -def _remove_comments(chapter_tag): - """ - Function remove comments - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with wrapped certain tags with
- Returns - ------- - None - Chapter Tag without comments + """ - """ - for tag in chapter_tag.find_all(): - for element in tag(text=lambda text: isinstance(text, Comment)): - element.extract() + def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_be_wrapped.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + def process_tag_using_table(tag_to_wrap): + _wrap_tag_with_table( + chapter_tag, + tag_to_be_wrapped=tag_to_wrap, + width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", + border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, + bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) + self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) + tag_to_wrap.unwrap() -def _wrap_strings_with_p(chapter_tag): - """ - Function converts headings that aren't supported by LiveCarta with

- Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with wrapped NavigableStrings - - """ - for node in chapter_tag: - if isinstance(node, NavigableString): - content = str(node) - content = re.sub(r"([\s\xa0])", " ", content).strip() - if content: - p_tag = chapter_tag.new_tag("p") - p_tag.append(str(node)) - node.replace_with(p_tag) - - -def _wrap_tags_with_table(chapter_tag): - """ - Function wraps with

- Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with wrapped certain tags with
- - """ - def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): - table = chapter_tag.new_tag("table") - table.attrs["border"], table.attrs["align"], table.attrs["style"] \ - = border, "center", f"width:{width}%;" - tbody, tr, td = \ - chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") - td.attrs["bgcolor"] = bg_color - tag_to_be_wrapped.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - return table - - def process_tag_using_table(tag_to_wrap): - _wrap_tag_with_table( - chapter_tag, - tag_to_be_wrapped=tag_to_wrap, - width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", - border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, - bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) - _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) - tag_to_wrap.unwrap() - - for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items(): - if isinstance(attrs, tuple): - attr, val = attrs[0], attrs[1] - for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}): - process_tag_using_table(tag_to_wrap) - else: - for tag_to_wrap in chapter_tag.find_all(tags_to_wrap): - if any(attr_name in attrs for attr_name in tag_to_wrap.attrs): + for rule in rules: + tags = rule["tags"] + for attr in rule["attrs"]: + for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): process_tag_using_table(tag_to_wrap) + @staticmethod + def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list): + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag -def _tags_to_correspond_livecarta_tag(chapter_tag): - """ - Function to replace all tags to correspond LiveCarta tags - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags - Returns - ------- - None - Chapter Tag with all tags replaced with LiveCarta tags - - """ - for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_keys: - if isinstance(key, tuple): - replace = key[0] - parent, child = key[1], key[2] - for parent_tag in chapter_tag.select(parent): - if replace == "parent": - parent_tag.name = to_replace_value - elif replace == "child": - for child_tag in parent_tag.select(child): - child_tag.name = to_replace_value - if not child_tag.attrs.get("style"): - child_tag.attrs["style"] =\ - "font-size: 14px; font-family: courier new,courier,monospace;" - else: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - # todo can cause appearance of \n

...

->

\n

...

\n

(section) - tag.name = to_replace_value - - -def _unwrap_tags(chapter_tag): - """ - Function unwrap tags and moves id to span - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with unwrapped certain tags - - """ - for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP: - for tag in chapter_tag.select(tag_name): - # if tag is a subtag - if ">" in tag_name: - tag.parent.attrs.update(tag.attrs) - _add_span_to_save_ids_for_links(tag, chapter_tag) - tag.unwrap() - - -def _remove_headings_content(content_tag, title_of_chapter: str): - """ - Function - - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content - - adds span with id in order to - Parameters - ---------- - content_tag: soup object - Tag of the page - title_of_chapter: str - Chapter title - - Returns - ------- - None - clean/remove headings & add span with id - - """ - title_of_chapter = title_of_chapter.lower() - for tag in content_tag.contents: - text = tag if isinstance(tag, NavigableString) else tag.text - if re.sub(r"[\s\xa0]", "", text): - text = re.sub(r"[\s\xa0]", " ", text).lower() - text = text.strip() # delete extra spaces - if title_of_chapter == text or \ - (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or content_tag.name)): - _add_span_to_save_ids_for_links(tag, content_tag) - tag.extract() - return - elif not isinstance(tag, NavigableString): - if not _remove_headings_content(tag, title_of_chapter): - break - - -def _process_table(chapter_tag: BeautifulSoup): - """ - Function preprocesses tables and tags(td|th|tr) - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with processed tables - - """ - tables = chapter_tag.find_all("table") - for table in tables: - for t_tag in table.find_all(re.compile("td|th|tr")): - width = "" - if t_tag.get("style"): - width_match = re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) - if width_match: - size = width_match.group(1) - width = size + "px" - - t_tag.attrs["width"] = t_tag.get("width") or width - - if t_tag.attrs.get("style"): - t_tag.attrs["style"] = t_tag.attrs["style"].replace( - "border:0;", "") - if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": - del t_tag.attrs["style"] - - if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: - table.attrs["border"] = "1" - - -def _insert_tags_in_parents(chapter_tag): - """ - Function inserts tags into correspond tags - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with inserted tags - - """ - parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()} - for parent_tag_name, condition in parent_tag2condition.items(): - for parent_tag in chapter_tag.select(parent_tag_name): - if parent_tag.select(condition): - continue + """ + for rule in rules: + tags = rule["tags"] + tag_to_replace = rule["tag_to_replace"] + if rule["condition"]: + for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): + if condition_on_tag[0] == 'parent_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if tag.parent.select(condition_on_tag[1]): + tag.name = tag_to_replace + elif condition_on_tag[0] == 'child_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): + tag.name = tag_to_replace + elif condition_on_tag[0] == "attrs": + for attr in rule["condition"]["attrs"]: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + tag.name = tag_to_replace else: - tag_to_insert = chapter_tag.new_tag( - LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)]) - # insert all items that was in pre to code and remove from pre - for content in reversed(parent_tag.contents): - tag_to_insert.insert(0, content.extract()) - # wrap code with items - parent_tag.append(tag_to_insert) + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + # todo can cause appearance of \n

...

->

\n

...

\n

(section) + tag.name = tag_to_replace + def _unwrap_tags(self, chapter_tag, rules: dict): + """ + Function unwrap tags and moves id to span + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag -def _class_removing(chapter_tag): - """ - Function removes classes that aren't created by converter - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with unwrapped certain tags - Returns - ------- - None - Chapter Tag without original classes of the book + """ + for tag_name in rules["tags"]: + for tag in chapter_tag.select(tag_name): + # if tag is a subtag + if ">" in tag_name: + tag.parent.attrs.update(tag.attrs) + self._add_span_to_save_ids_for_links(tag, chapter_tag) + tag.unwrap() - """ - for tag in chapter_tag.find_all(recursive=True): - if tag.attrs.get("class") \ - and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): - del tag.attrs["class"] + @staticmethod + def _insert_tags_into_correspond_tags(chapter_tag, rules: list): + """ + Function inserts tags into correspond tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with inserted tags -def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: - """ - Function finalise processing/cleaning content - Parameters - ---------- - title_str: str + """ - content_tag: Tag, soup object + def insert(tag, tag_to_insert): + # insert all items that was in tag to subtag and remove from tag + for content in reversed(tag.contents): + tag_to_insert.insert(0, content.extract()) + # wrap subtag with items + tag.append(tag_to_insert) - remove_title_from_chapter: bool + for rule in rules: + tags = rule["tags"] + tag_to_insert = \ + chapter_tag.new_tag(rule["tag_to_insert"]) + if rule["condition"]: + for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): + if condition_on_tag[0] == 'parent_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if tag.parent.select(condition_on_tag[1]): + insert(tag, tag_to_insert) + elif condition_on_tag[0] == 'child_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): + insert(tag, tag_to_insert) + elif condition_on_tag[0] == "attrs": + for attr in rule["condition"]["attrs"]: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + insert(tag, tag_to_insert) + else: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + insert(tag, tag_to_insert) - Steps - ---------- - 1. comments removal - 2. wrap NavigableString with tag

- 3. wrap tags with

- 4. replace tags with correspond LiveCarta tags - 5. unwrap tags - 6. heading removal - 7. process_table - 8. insert tags into correspond tags - 9. class removal + def _remove_headings_content(self, content_tag, title_of_chapter: str): + """ + Function + - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content + - adds span with id in order to + Parameters + ---------- + content_tag: soup object + Tag of the page + title_of_chapter: str + Chapter title - Returns - ------- - content_tag: str - prepared content + Returns + ------- + None + clean/remove headings & add span with id - """ - # 1. remove comments - _remove_comments(content_tag) + """ + title_of_chapter = title_of_chapter.lower() + for tag in content_tag.contents: + text = tag if isinstance(tag, NavigableString) else tag.text + if re.sub(r"[\s\xa0]", "", text): + text = re.sub(r"[\s\xa0]", " ", text).lower() + text = text.strip() # delete extra spaces + if title_of_chapter == text or \ + (title_of_chapter in text and + re.findall(r"^h[1-3]$", tag.name or content_tag.name)): + self._add_span_to_save_ids_for_links(tag, content_tag) + tag.extract() + return + elif not isinstance(tag, NavigableString): + if not self._remove_headings_content(tag, title_of_chapter): + break - # 2. - _wrap_strings_with_p(content_tag) - # 3. - _wrap_tags_with_table(content_tag) - # 4. - _tags_to_correspond_livecarta_tag(content_tag) - # 5. - _unwrap_tags(content_tag) - # 6. - if remove_title_from_chapter: - _remove_headings_content(content_tag, title_str) - # 7. - _process_table(content_tag) - # 8. - _insert_tags_in_parents(content_tag) + @staticmethod + def _process_tables(chapter_tag: BeautifulSoup): + """ + Function preprocesses tables and tags(td|th|tr) + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag - # 9. remove classes that weren't created by converter - _class_removing(content_tag) - return str(content_tag) + Returns + ------- + None + Chapter Tag with processed tables + + """ + tables = chapter_tag.find_all("table") + for table in tables: + for t_tag in table.find_all(re.compile("td|th|tr")): + width = "" + if t_tag.get("style"): + width_match = re.search( + r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) + if width_match: + size = width_match.group(1) + width = size + "px" + + t_tag.attrs["width"] = t_tag.get("width") or width + + if t_tag.attrs.get("style"): + t_tag.attrs["style"] = t_tag.attrs["style"].replace( + "border:0;", "") + if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": + del t_tag.attrs["style"] + + if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: + table.attrs["border"] = "1" + + @staticmethod + def _class_removing(chapter_tag): + """ + Function removes classes that aren't created by converter + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without original classes of the book + + """ + for tag in chapter_tag.find_all(recursive=True): + if tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] + + def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: + """ + Function finalise processing/cleaning content + Parameters + ---------- + title_str: str + + content_tag: Tag, soup object + + remove_title_from_chapter: bool + + Steps + ---------- + 1. comments removal + 2. wrap NavigableString with tag

+ 3-6. wrap tags with

+ replace tags with correspond LiveCarta tags + unwrap tags + insert tags into correspond tags + 7. heading removal + 8. process_tables + 9. class removal + + Returns + ------- + content_tag: str + prepared content + + """ + # 1. remove comments + self._remove_comments(content_tag) + # 2. + self._wrap_strings_with_p(content_tag) + # 3-6. + for dict in self.preset: + func = self.name2function[dict["preset_name"]] + func(content_tag, dict['rules']) + # 7. + if remove_title_from_chapter: + self._remove_headings_content(content_tag, title_str) + # 8. + self._process_tables(content_tag) + # 9. remove classes that weren't created by converter + self._class_removing(content_tag) + return str(content_tag) diff --git a/src/preset_processor.py b/src/preset_processor.py new file mode 100644 index 0000000..a1cbb93 --- /dev/null +++ b/src/preset_processor.py @@ -0,0 +1,15 @@ +import json + + +from src.util.helpers import BookLogger + + +class PresetProcessor: + def __init__(self, preset_path="config/presets.json", logger=None): + self.preset_path = preset_path + self.logger: BookLogger = logger + + def get_preset_json(self): + f = open(self.preset_path) + data = json.load(f) + return data