diff --git a/presets/docx_presets.json b/presets/docx_presets.json index fed2d62..96f861b 100644 --- a/presets/docx_presets.json +++ b/presets/docx_presets.json @@ -12,9 +12,13 @@ "name": "id", "value": "^Table of Contents\\d+" } - ] + ], + "text": null }, - "tag_to_wrap": "TOC" + "tag_to_wrap": { + "name": "TOC", + "attrs": [] + } } ] }, @@ -35,7 +39,8 @@ "name": "id", "value": "^Table of Contents\\d+" } - ] + ], + "text": null } } ] @@ -46,7 +51,10 @@ { "tags": ["^h[6-9]$"], "condition": null, - "tag_to_replace": "p" + "tag_to_replace": { + "name": "p", + "attrs": null + } }, { "tags": ["^div$"], @@ -58,9 +66,13 @@ "name": "style", "value": "column-count: 2" } - ] + ], + "text": null }, - "tag_to_replace": "p" + "tag_to_replace": { + "name": "p", + "attrs": null + } } ] }, @@ -68,12 +80,14 @@ "preset_name": "attr_replacer", "rules": [ { - "attr": { - "name": "style", - "value": "column-count: 2" - }, + "tags": ["^p$"], "condition": { - "tags": ["^p$"] + "attrs": [ + { + "name": "style", + "value": "column-count: 2" + } + ] }, "attr_to_replace": { "name": "class", @@ -103,7 +117,8 @@ "name": "face", "value": "^Times New Roman[\\w, ]+$" } - ] + ], + "text": null } }, { @@ -111,7 +126,8 @@ "condition": { "parent_tags": ":is(li)", "child_tags": null, - "attrs": null + "attrs": null, + "text": null } }, { @@ -124,7 +140,8 @@ "name": "name", "value": "_GoBack" } - ] + ], + "text": null } }, { @@ -132,7 +149,8 @@ "condition": { "parent_tags": ":is(a)", "child_tags": ":is(a)", - "attrs": null + "attrs": null, + "text": null } }, { @@ -140,7 +158,8 @@ "condition": { "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)", "child_tags": null, - "attrs": null + "attrs": null, + "text": null } }, { @@ -148,5 +167,40 @@ "condition": null } ] + }, + { + "preset_name": "inserter", + "rules": [ + { + "tags": ["^p$"], + "condition": { + "parent_tags": null, + "child_tags": null, + "attrs": null, + "text": "\\$\\$[\\s\\S]*?\\$\\$" + }, + "tag_to_insert": { + "name": "span", + "attrs": [ + { + "name": "class", + "value": "math-tex" + } + ] + } + } + ] + }, + { + "preset_name": "text_replacer", + "rules": [ + { + "tags": ["^p$"], + "condition": { + "text": "(\\\\nonumber\\\\\\\\\\\\noalign{\\\\pagebreak}[\\s\\S]*?)\\\\" + }, + "text_to_replace": "\\\\" + } + ] } ] diff --git a/presets/epub_presets.json b/presets/epub_presets.json index 07e191c..7c8f672 100644 --- a/presets/epub_presets.json +++ b/presets/epub_presets.json @@ -48,17 +48,23 @@ { "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"], "condition": null, - "tag_to_replace": "p" + "tag_to_replace": { + "name": "p" + } }, { "tags": ["^aside$"], "condition": null, - "tag_to_replace": "div" + "tag_to_replace": { + "name": "div" + } }, { "tags": ["^header$", "^footer$"], "condition": null, - "tag_to_replace": "span" + "tag_to_replace": { + "name": "span" + } }, { "tags": ["^code$", "^kbd$", "^var$"], @@ -67,22 +73,30 @@ "child_tags": null, "attrs": null }, - "tag_to_replace": "span" + "tag_to_replace": { + "name": "span" + } }, { "tags": ["^em$"], "condition": null, - "tag_to_replace": "i" + "tag_to_replace": { + "name": "i" + } }, { "tags": ["^b$"], "condition": null, - "tag_to_replace": "strong" + "tag_to_replace": { + "name": "strong" + } }, { "tags": ["^image$"], "condition": null, - "tag_to_replace": "img" + "tag_to_replace": { + "name": "img" + } } ] }, @@ -90,12 +104,14 @@ "preset_name": "attr_replacer", "rules": [ { - "attr": { - "name": "xlink:href", - "value": ".*" - }, + "tags": ["^img$"], "condition": { - "tags": ["^img$"] + "attrs": [ + { + "name": "xlink:href", + "value": ".*" + } + ] }, "attr_to_replace": { "name": "src", @@ -140,12 +156,18 @@ "child_tags": ":not(:has(code, kbd, var))", "attrs": null }, - "tag_to_insert": "code" + "tag_to_insert": { + "name": "code", + "attrs": [] + } }, { "tags": ["^h[1-5]$"], "condition": null, - "tag_to_insert": "strong" + "tag_to_insert": { + "name":"strong", + "attrs": [] + } } ] } diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 3cd324d..abb13e3 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -80,7 +80,8 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = "../../books/docx/AmericanGovernment3e-WEB.docx" + + docx_file_path = "../../books/docx/output.docx" logger_object = BookLogger( name="docx", book_id=docx_file_path.split("/")[-1]) locker = Event() @@ -88,7 +89,6 @@ if __name__ == "__main__": html_converter = Docx2LibreHtml(file_path=docx_file_path, logger=logger_object, libre_locker=locker) - html_preprocessor = HtmlPresetsProcessor( logger=logger_object, preset_path="../../presets/docx_presets.json") style_preprocessor = StyleReader() diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 90c3b95..5a6a9b5 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -43,7 +43,7 @@ class EpubBook(BookSolver): if __name__ == "__main__": - epub_file_path = "../../books/epub/9780763774134.epub" + epub_file_path = "../../books/epub/9781634259804.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) diff --git a/src/html_presets_processor.py b/src/html_presets_processor.py index cfffe7b..c908ccb 100644 --- a/src/html_presets_processor.py +++ b/src/html_presets_processor.py @@ -18,13 +18,14 @@ class HtmlPresetsProcessor: "replacer": self._replace_tag, "attr_replacer": self._replace_attr, "unwrapper": self._unwrap_tag, - "inserter": self._insert_tag + "inserter": self._insert_tag, + "text_replacer": self._replace_text } @staticmethod def _wrap_tag(**kwargs): kwargs["tag"].wrap(kwargs["body_tag"].new_tag( - kwargs["rule"]["tag_to_wrap"])) + kwargs["rule"]["tag_to_wrap"]["name"])) @staticmethod def set_attrs_to_parent(tag, parent_tag): @@ -98,24 +99,31 @@ class HtmlPresetsProcessor: @staticmethod def _replace_tag(**kwargs): - tag_to_replace: str = kwargs["rule"]["tag_to_replace"] + tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"] kwargs["tag"].name = tag_to_replace + if kwargs["rule"]["tag_to_replace"].get("attrs"): + dict_attributes = {attr["name"]: attr["value"] + for attr in kwargs["rule"]["tag_to_replace"]["attrs"]} + kwargs["tag"].attrs = dict_attributes @staticmethod def _replace_attr(**kwargs): - attr, attr_value =\ - kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"] + attr = kwargs["rule"]["condition"]["attrs"][0] + attr_name, attr_value =\ + attr["name"], attr["value"] attr_to_replace, attr_value_to_replace =\ kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] if attr_to_replace: - kwargs["tag"][attr_to_replace] = kwargs["tag"][attr] + kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \ + if kwargs["tag"].get(attr_name)\ + else "" if attr_value_to_replace: kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace - del kwargs["tag"][attr] + del kwargs["tag"][attr_name] elif attr_value_to_replace: - kwargs["tag"].attrs[attr] = attr_value_to_replace - elif attr: - del kwargs["tag"][attr] + kwargs["tag"].attrs[attr_name] = attr_value_to_replace + elif attr_name: + del kwargs["tag"][attr_name] def _unwrap_tag(self, **kwargs): if kwargs["tag"].parent: @@ -124,14 +132,24 @@ class HtmlPresetsProcessor: @staticmethod def _insert_tag(**kwargs): + dict_attributes = {attr["name"]: attr["value"] + for attr in kwargs["rule"]["tag_to_insert"]["attrs"]} tag_to_insert = \ - kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_insert"]) + kwargs["body_tag"].new_tag( + kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes) # insert all items that was in tag to subtag and remove from tag for content in reversed(kwargs["tag"].contents): tag_to_insert.insert(0, content.extract()) # wrap subtag with items kwargs["tag"].append(tag_to_insert) + @staticmethod + def _replace_text(**kwargs): + if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string): + new_text = re.sub(re.compile( + kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string) + kwargs["tag"].string.replace_with(new_text) + @staticmethod def _process_tags(body_tag: BeautifulSoup, rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], @@ -171,11 +189,12 @@ class HtmlPresetsProcessor: for tag in body_tag.find_all([re.compile(tag) for tag in tags], {attr["name"]: re.compile(fr"{attr['value']}")}): action(body_tag=body_tag, tag=tag, rule=rule) - # attr replacer - elif condition_on_tag[0] == "tags": - attr = rule["attr"] - for tag in body_tag.find_all([re.compile(tag) for tag in tags], - {attr['name']: re.compile(fr"{attr['value']}")}): + elif condition_on_tag[0] == "text": + # find all tags that are in List of tags and tags that contains required text + for tag in body_tag.find_all( + lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))", + t.name) and re.search(re.compile(rule["condition"]["text"]), + t.text)): action(body_tag=body_tag, tag=tag, rule=rule) else: for tag in body_tag.find_all([re.compile(tag) for tag in tags]):