diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index f0b7826..92a174b 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -5,6 +5,7 @@ from threading import Event from src.book_solver import BookSolver from src.util.helpers import BookLogger +from src.html_preprocessor import HtmlPreprocessor from src.style_preprocessor import StylePreprocessor from src.docx_converter.docx2libre_html import Docx2LibreHTML from src.docx_converter.html_docx_processor import HTMLDocxProcessor @@ -48,10 +49,14 @@ class DocxBook(BookSolver): # 2. Parses and cleans html, gets list of tags, gets footnotes try: - style_processor = StylePreprocessor() - parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, - logger=self.logger_object, style_processor=style_processor) - bs_tags, footnotes, top_level_headers = parser.process_html( + html_preprocessor = HtmlPreprocessor( + logger=self.logger_object, preset_path="presets/docx_presets.json") + style_preprocessor = StylePreprocessor() + html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, + logger=self.logger_object, + html_preprocessor=html_preprocessor, + style_preprocessor=style_preprocessor) + bs_tags, footnotes, top_level_headers = html_processor.process_html( self.access, html_converter.html_path, self.book_id) except Exception as exc: self.logger_object.log( @@ -84,10 +89,12 @@ if __name__ == "__main__": html_converter = Docx2LibreHTML(file_path=docx_file_path, logger=logger_object, libre_locker=locker) - css_processor = StylePreprocessor() - parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object, - style_processor=css_processor, preset_path="../../presets/docx_presets.json") - content, footnotes, top_level_headers = parser.process_html( + html_preprocessor = HtmlPreprocessor( + logger=logger_object, preset_path="../../presets/docx_presets.json") + style_preprocessor = StylePreprocessor() + html_processor = 
HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object, + html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor) + content, footnotes, top_level_headers = html_processor.process_html( html_path=html_converter.html_path, book_id=html_converter.book_id) json_converter = LibreHTML2JSONConverter( diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py index b515a37..945ab1b 100644 --- a/src/docx_converter/html_docx_processor.py +++ b/src/docx_converter/html_docx_processor.py @@ -1,32 +1,23 @@ import re -import json import pathlib from typing import List, Tuple, Dict, Union from bs4 import BeautifulSoup, Tag, NavigableString from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig +from src.html_preprocessor import _preprocess_html from src.docx_converter.image_processing import process_images from src.docx_converter.footnotes_processing import process_footnotes from src.tag_inline_style_processor import modify_html_soup_with_css_styles class HTMLDocxProcessor: - - def __init__(self, html_soup: BeautifulSoup, logger: BookLogger, - style_processor, preset_path: str = "presets/docx_presets.json"): - self.html_soup = html_soup - self.body_tag = html_soup.body + def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor): self.logger = logger - self.preset = json.load(open(preset_path)) - self.style_processor = style_processor - self.name2action = { - "wrapper": self._wrap_tag, - "decomposer": self._decompose_tag, - "replacer": self._replace_tag, - "attr_replacer": self._replace_attr, - "unwrapper": self._unwrap_tag - } + self.html_soup = html_soup + self.body_tag = self.html_soup.body + self.html_preprocessor = html_preprocessor + self.style_preprocessor = style_preprocessor def _process_toc_links(self): """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" @@ -59,84 
+50,6 @@ class HTMLDocxProcessor: f"Check the structure of the file." f"Tag name: {tag.name}") - def _wrap_tag(self, **kwargs): - kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"])) - - @staticmethod - def _decompose_tag(**kwargs): - kwargs["tag"].decompose() - - @staticmethod - def _replace_tag(**kwargs): - tag_to_replace: str = kwargs["rule"]["tag_to_replace"] - kwargs["tag"].name = tag_to_replace - - @staticmethod - def _replace_attr(**kwargs): - attr, attr_value =\ - kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"] - attr_to_replace, attr_value_to_replace =\ - kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] - if attr_to_replace: - kwargs["tag"][attr_to_replace] = kwargs["tag"][attr] - if attr_value_to_replace: - kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace - del kwargs["tag"][attr] - elif attr_value_to_replace: - kwargs["tag"].attrs[attr] = attr_value_to_replace - - @staticmethod - def _unwrap_tag(**kwargs): - kwargs["tag"].unwrap() - - @staticmethod - def _process_tags(body_tag: Tag, - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], - action): - """ - Function do action with tags - Parameters - ---------- - body_tag: Tag - Tag & contents of the chapter tag - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] - list of conditions when fire function - action: function - action what to do with tag - Returns - ------- - NoReturn - Body Tag with processed certain tags - - """ - for rule in rules: - tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"] - if rule["condition"]: - for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): - if condition_on_tag[0] == "parent_tags": - for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag) - for tag in tags])): - tag.parent.attrs.update(tag.attrs) - 
action(body_tag=body_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "child_tags": - for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1] - for tag in tags])): - action(body_tag=body_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "attrs": - for attr in rule["condition"]["attrs"]: - for tag in body_tag.find_all([re.compile(tag) for tag in tags], - {attr["name"]: re.compile(fr"{attr['value']}")}): - action(body_tag=body_tag, tag=tag, rule=rule) - # attr replacer - elif condition_on_tag[0] == "tags": - attr = rule["attr"] - for tag in body_tag.find_all([re.compile(tag) for tag in tags], - {attr['name']: re.compile(fr"{attr['value']}")}): - action(body_tag=body_tag, tag=tag, rule=rule) - else: - for tag in body_tag.find_all([re.compile(tag) for tag in tags]): - action(body_tag=body_tag, tag=tag, rule=rule) - def _process_quotes(self): """ Function to process block quotes. @@ -175,14 +88,6 @@ class HTMLDocxProcessor: table.replaceWith(new_div) - @staticmethod - def convert_pt_to_px(value: float) -> float: - value = float(value) - if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE: - return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE - else: - return value - def _process_tables(self): """Function to process tables. 
Set "border" attribute.""" tables = self.body_tag.find_all("table") @@ -197,7 +102,10 @@ class HTMLDocxProcessor: size = match.group(1) units = match.group(2) if units == "pt": - size = self.convert_pt_to_px(size) + value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\ + if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\ + else float(size) + size = value sizes.append(float(size)) width = td.get("width") td.attrs = {} @@ -392,14 +300,13 @@ class HTMLDocxProcessor: self.logger.log(f"Processing TOC and headers.") self._process_toc_links() - for rule in self.preset: - self.logger.log(rule["preset_name"].title() + " process.") - action = self.name2action[rule["preset_name"]] - self._process_tags(self.body_tag, rule["rules"], action) + _preprocess_html(html_preprocessor=self.html_preprocessor, + html_soup=self.html_soup) # CSS after html processing cause of that aren't supported by html self.logger.log("CSS inline style preprocessing.") - self.style_processor.process_inline_styles_in_html_soup(self.body_tag) + self.style_preprocessor.process_inline_styles_in_html_soup( + self.body_tag) self.logger.log("CSS inline style processing.") modify_html_soup_with_css_styles(self.body_tag) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 3ec04e2..70e36a8 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -13,7 +13,7 @@ from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.style_preprocessor import StylePreprocessor -from src.epub_converter.html_epub_processor import HtmlEpubProcessor +from src.epub_converter.html_epub_processor import HTMLEpubProcessor from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.footnotes_processing import preprocess_footnotes from src.tag_inline_style_processor import modify_html_soup_with_css_styles @@ -21,7 +21,7 @@ from 
src.tag_inline_style_processor import modify_html_soup_with_css_styles class EpubConverter: def __init__(self, book_path, access=None, logger: BookLogger = None, - style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None): + style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None): self.book_path = book_path self.access = access self.logger: BookLogger = logger diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 5aa13a0..c348dba 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -30,13 +30,16 @@ class EpubBook(BookSolver): json for LiveCarta platform """ - style_processor = StylePreprocessor() - html_processor = HtmlEpubProcessor( - logger=self.logger_object) + html_preprocessor = HtmlPreprocessor( + logger=self.logger_object, preset_path="presets/epub_presets.json") + style_preprocessor = StylePreprocessor() + html_processor = HTMLEpubProcessor(logger=self.logger_object, + html_preprocessor=html_preprocessor) json_converter = EpubConverter( self.book_path, access=self.access, logger=self.logger_object, - style_processor=style_processor, html_processor=html_processor) + style_processor=style_preprocessor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() + return content_dict diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index 7f87c59..c600cd7 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -1,58 +1,16 @@ import re -import json -from typing import List, Dict, Union +from typing import Union from bs4.element import PageElement from bs4 import BeautifulSoup, Tag, NavigableString, Comment from src.util.helpers import BookLogger +from src.html_preprocessor import _preprocess_html -class HtmlEpubProcessor: - def __init__(self, preset_path: str = "presets/epub_presets.json", logger: BookLogger = None): - 
self.preset = json.load(open(preset_path)) +class HTMLEpubProcessor: + def __init__(self, logger: BookLogger = None, html_preprocessor=None): self.logger = logger - self.name2action = { - "table_wrapper": self._process_tag_using_table, - "replacer": self._replace_tag, - "attr_replacer": self._replace_attr, - "unwrapper": self._unwrap_tag, - "inserter": self._insert_tag - } - - @staticmethod - def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup], - chapter_tag: BeautifulSoup): - """ - Function adds span with id from tag_to_be_removed - because this tag will be removed(unwrapped/extract) - Parameters - ---------- - tag_to_be_removed: Union[PageElement, BeautifulSoup] - - chapter_tag: BeautifulSoup - - Returns - ------- - NoReturn - updated body tag - - """ - def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, - tag_to_be_removed: Tag, - id_: str, - class_: Union[List[str], str]): - """Function inserts span before tag aren't supported by LiveCarta""" - new_tag: Tag = chapter_tag.new_tag("span") - new_tag.attrs["id"] = id_ or "" - new_tag.attrs["class"] = class_ or "" - new_tag.string = "\xa0" - tag_to_be_removed.insert_before(new_tag) - - if tag_to_be_removed.attrs.get("id"): - _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, - tag_to_be_removed=tag_to_be_removed, - id_=tag_to_be_removed.attrs["id"], - class_=tag_to_be_removed.attrs.get("class")) + self.html_preprocessor = html_preprocessor @staticmethod def prepare_title(title_of_chapter: str) -> str: @@ -116,111 +74,6 @@ class HtmlEpubProcessor: p_tag.append(str(node)) node.replace_with(p_tag) - def _process_tag_using_table(self, **kwargs): - def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag: - table = kwargs["chapter_tag"].new_tag("table") - table.attrs["border"], table.attrs["align"], table.attrs["style"] \ - = border, "center", f"width:{width}%;" - tbody, tr, td = \ - kwargs["chapter_tag"].new_tag("tbody"), 
kwargs["chapter_tag"].new_tag( - "tr"), kwargs["chapter_tag"].new_tag("td") - td.attrs["bgcolor"] = bg_color - kwargs["tag"].wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - return table - _wrap_tag_with_table( - width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get( - "width") else "100", - border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get( - "border") else None, - bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None) - self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"]) - kwargs["tag"].unwrap() - - @staticmethod - def _replace_tag(**kwargs): - tag_to_replace: str = kwargs["rule"]["tag_to_replace"] - kwargs["tag"].name = tag_to_replace - - @staticmethod - def _replace_attr(**kwargs): - attr, attr_value =\ - kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"] - attr_to_replace, attr_value_to_replace =\ - kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] - if attr_to_replace: - kwargs["tag"][attr_to_replace] = kwargs["tag"][attr] - if attr_value_to_replace: - kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace - del kwargs["tag"][attr] - elif attr_value_to_replace: - kwargs["tag"].attrs[attr] = attr_value_to_replace - - @staticmethod - def _unwrap_tag(**kwargs): - kwargs["tag"].unwrap() - - @staticmethod - def _insert_tag(**kwargs): - tag_to_insert = \ - kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"]) - # insert all items that was in tag to subtag and remove from tag - for content in reversed(kwargs["tag"].contents): - tag_to_insert.insert(0, content.extract()) - # wrap subtag with items - kwargs["tag"].append(tag_to_insert) - - @staticmethod - def _process_tags(chapter_tag: BeautifulSoup, - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], - action): - """ - Function do action with tags - 
Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] - list of conditions when fire function - action: function - action what to do with tag - Returns - ------- - NoReturn - Body Tag with processed certain tags - - """ - for rule in rules: - tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"] - if rule["condition"]: - for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): - if condition_on_tag[0] == "parent_tags": - for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag) - for tag in tags])): - tag.parent.attrs.update(tag.attrs) - action(chapter_tag=chapter_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "child_tags": - for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1] - for tag in tags])): - action(chapter_tag=chapter_tag, tag=tag, rule=rule) - elif condition_on_tag[0] == "attrs": - for attr in rule["condition"]["attrs"]: - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], - {attr["name"]: re.compile(fr"{attr['value']}")}): - action(chapter_tag=chapter_tag, tag=tag, rule=rule) - # attr replacer - elif condition_on_tag[0] == "tags": - attr = rule["attr"] - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], - {attr['name']: re.compile(fr"{attr['value']}")}): - action(chapter_tag=chapter_tag, tag=tag, rule=rule) - else: - for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - action(chapter_tag=chapter_tag, tag=tag, rule=rule) - def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str): """ Function @@ -250,7 +103,8 @@ class HtmlEpubProcessor: if title_of_chapter == text or \ (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): - self._add_span_to_save_ids_for_links(tag, 
"""Preset-driven HTML clean-up shared by the DOCX and EPUB processors."""
import json
import re
from typing import TYPE_CHECKING, Callable, Dict, List, Union

if TYPE_CHECKING:
    # These names are used in annotations only (written as strings below).
    from bs4 import BeautifulSoup, Tag
    from bs4.element import PageElement

    from src.util.helpers import BookLogger


class HtmlPreprocessor:
    """
    Applies preset rules (wrap, unwrap, replace, ...) to a parsed HTML tree.

    The preset file is a JSON list. Every entry has a "preset_name", which
    selects an action from ``name2action``, and a "rules" list describing the
    tags that action is applied to.
    """

    def __init__(self, logger: "BookLogger", preset_path):
        """
        Parameters
        ----------
        logger: BookLogger
            logger kept on the instance as ``self.logger``
        preset_path
            path of the JSON preset file to load

        """
        # The context manager closes the preset file as soon as it is parsed.
        with open(preset_path, encoding="utf-8") as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        self.name2action = {
            "wrapper": self._wrap_tag,
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag
        }

    @staticmethod
    def _wrap_tag(**kwargs):
        """Function wraps tag with a new tag named by rule["tag_to_wrap"]."""
        wrapper = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_wrap"])
        kwargs["tag"].wrap(wrapper)

    @staticmethod
    def _decompose_tag(**kwargs):
        """Function removes tag together with its contents."""
        kwargs["tag"].decompose()

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union["PageElement", "BeautifulSoup"],
                                        chapter_tag: "BeautifulSoup"):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose id has to stay reachable for links
        chapter_tag: BeautifulSoup
            soup used to create the new span

        Returns
        -------
        NoReturn
            span is inserted before tag_to_be_removed (only if it has an id)

        """
        tag_id = tag_to_be_removed.attrs.get("id")
        if not tag_id:
            return
        anchor = chapter_tag.new_tag("span")
        anchor.attrs["id"] = tag_id
        anchor.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        anchor.string = "\xa0"
        tag_to_be_removed.insert_before(anchor)

    def _process_tag_using_table(self, **kwargs):
        """
        Function wraps tag with table > tbody > tr > td, puts <br> after the
        table, saves id of the tag in a span and then unwraps the tag itself.
        """
        # bs4 is needed at run time only here (for the detached <br> tag).
        from bs4 import BeautifulSoup

        soup, tag = kwargs["body_tag"], kwargs["tag"]
        width = tag.attrs.get("width") or "100"
        # Missing/empty border and bgcolor are stored as None, as before.
        border = tag.attrs.get("border") or None
        bg_color = tag.attrs.get("bgcolor") or None

        table = soup.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody = soup.new_tag("tbody")
        tr = soup.new_tag("tr")
        td = soup.new_tag("td")
        td.attrs["bgcolor"] = bg_color

        tag.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))

        self._add_span_to_save_ids_for_links(tag, soup)
        tag.unwrap()

    @staticmethod
    def _replace_tag(**kwargs):
        """Function renames tag to rule["tag_to_replace"]."""
        kwargs["tag"].name = kwargs["rule"]["tag_to_replace"]

    @staticmethod
    def _replace_attr(**kwargs):
        """
        Function renames attribute and/or overrides its value.

        rule["attr"]["name"] is the attribute to change,
        rule["attr_to_replace"] holds the new name and the new value;
        an empty new name keeps the attribute and only replaces the value.
        """
        rule, tag = kwargs["rule"], kwargs["tag"]
        attr = rule["attr"]["name"]
        new_name = rule["attr_to_replace"]["name"]
        new_value = rule["attr_to_replace"]["value"]
        if new_name:
            tag[new_name] = tag[attr]
            if new_value:
                tag.attrs[new_name] = new_value
            del tag[attr]
        elif new_value:
            tag.attrs[attr] = new_value

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Function replaces tag with its own contents."""
        kwargs["tag"].unwrap()

    @staticmethod
    def _insert_tag(**kwargs):
        """
        Function moves everything inside tag into a new subtag
        named by rule["tag_to_insert"] and appends the subtag to tag.
        """
        # _process_tags passes the soup as "body_tag"; "chapter_tag" is the
        # keyword used before the processors were merged and is still accepted.
        soup = kwargs["body_tag"] if "body_tag" in kwargs else kwargs["chapter_tag"]
        tag = kwargs["tag"]
        tag_to_insert = soup.new_tag(kwargs["rule"]["tag_to_insert"])
        # iterate over a copy: extract() removes the item from tag.contents
        for content in reversed(list(tag.contents)):
            tag_to_insert.insert(0, content.extract())
        tag.append(tag_to_insert)

    @staticmethod
    def _process_tags(body_tag: "BeautifulSoup",
                      rules: List[Dict],
                      action: Callable[..., None]):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
        rules: List[Dict]
            list of conditions when fire function
        action: function
            action what to do with tag; called as
            action(body_tag=..., tag=..., rule=...)
        Returns
        -------
        NoReturn
            Body Tag with processed certain tags

        """
        for rule in rules:
            # a rule without "condition" key is treated as unconditional
            condition = rule.get("condition")
            tags: List[str] = rule["tags"] if rule.get("tags") else condition["tags"]
            # compiled once per rule instead of once per condition
            name_patterns = [re.compile(name) for name in tags]

            if not condition:
                for tag in body_tag.find_all(name_patterns):
                    action(body_tag=body_tag, tag=tag, rule=rule)
                continue

            # CSS selectors can't contain regex anchors used in tag names
            css_names = [re.sub(r"[\^$]", "", name) for name in tags]
            for key, value in condition.items():
                if not value:
                    continue
                if key == "parent_tags":
                    selector = ', '.join([value + " > " + name for name in css_names])
                    for tag in body_tag.select(selector):
                        tag.parent.attrs.update(tag.attrs)
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif key == "child_tags":
                    selector = ', '.join([name + value for name in css_names])
                    for tag in body_tag.select(selector):
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif key == "attrs":
                    for attr in condition["attrs"]:
                        for tag in body_tag.find_all(name_patterns,
                                                     {attr["name"]: re.compile(fr"{attr['value']}")}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                # attr replacer
                elif key == "tags":
                    attr = rule["attr"]
                    for tag in body_tag.find_all(name_patterns,
                                                 {attr["name"]: re.compile(fr"{attr['value']}")}):
                        action(body_tag=body_tag, tag=tag, rule=rule)


def _preprocess_html(html_preprocessor: HtmlPreprocessor, html_soup: "BeautifulSoup"):
    """Function runs every preset of html_preprocessor on html_soup (in place)."""
    for preset in html_preprocessor.preset:
        action = html_preprocessor.name2action[preset["preset_name"]]
        html_preprocessor._process_tags(html_soup, preset["rules"], action)