forked from LiveCarta/BookConverter
Merge pull request #307 from Teqniksoft/kiryl/converter_fix
Kiryl/converter fix
This commit is contained in:
@@ -12,9 +12,13 @@
|
|||||||
"name": "id",
|
"name": "id",
|
||||||
"value": "^Table of Contents\\d+"
|
"value": "^Table of Contents\\d+"
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"text": null
|
||||||
},
|
},
|
||||||
"tag_to_wrap": "TOC"
|
"tag_to_wrap": {
|
||||||
|
"name": "TOC",
|
||||||
|
"attrs": []
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -35,7 +39,8 @@
|
|||||||
"name": "id",
|
"name": "id",
|
||||||
"value": "^Table of Contents\\d+"
|
"value": "^Table of Contents\\d+"
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"text": null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -46,7 +51,10 @@
|
|||||||
{
|
{
|
||||||
"tags": ["^h[6-9]$"],
|
"tags": ["^h[6-9]$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "p"
|
"tag_to_replace": {
|
||||||
|
"name": "p",
|
||||||
|
"attrs": null
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^div$"],
|
"tags": ["^div$"],
|
||||||
@@ -58,9 +66,13 @@
|
|||||||
"name": "style",
|
"name": "style",
|
||||||
"value": "column-count: 2"
|
"value": "column-count: 2"
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"text": null
|
||||||
},
|
},
|
||||||
"tag_to_replace": "p"
|
"tag_to_replace": {
|
||||||
|
"name": "p",
|
||||||
|
"attrs": null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -68,12 +80,14 @@
|
|||||||
"preset_name": "attr_replacer",
|
"preset_name": "attr_replacer",
|
||||||
"rules": [
|
"rules": [
|
||||||
{
|
{
|
||||||
"attr": {
|
"tags": ["^p$"],
|
||||||
"name": "style",
|
|
||||||
"value": "column-count: 2"
|
|
||||||
},
|
|
||||||
"condition": {
|
"condition": {
|
||||||
"tags": ["^p$"]
|
"attrs": [
|
||||||
|
{
|
||||||
|
"name": "style",
|
||||||
|
"value": "column-count: 2"
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"attr_to_replace": {
|
"attr_to_replace": {
|
||||||
"name": "class",
|
"name": "class",
|
||||||
@@ -103,7 +117,8 @@
|
|||||||
"name": "face",
|
"name": "face",
|
||||||
"value": "^Times New Roman[\\w, ]+$"
|
"value": "^Times New Roman[\\w, ]+$"
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"text": null
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -111,7 +126,8 @@
|
|||||||
"condition": {
|
"condition": {
|
||||||
"parent_tags": ":is(li)",
|
"parent_tags": ":is(li)",
|
||||||
"child_tags": null,
|
"child_tags": null,
|
||||||
"attrs": null
|
"attrs": null,
|
||||||
|
"text": null
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -124,7 +140,8 @@
|
|||||||
"name": "name",
|
"name": "name",
|
||||||
"value": "_GoBack"
|
"value": "_GoBack"
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"text": null
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -132,7 +149,8 @@
|
|||||||
"condition": {
|
"condition": {
|
||||||
"parent_tags": ":is(a)",
|
"parent_tags": ":is(a)",
|
||||||
"child_tags": ":is(a)",
|
"child_tags": ":is(a)",
|
||||||
"attrs": null
|
"attrs": null,
|
||||||
|
"text": null
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -140,7 +158,8 @@
|
|||||||
"condition": {
|
"condition": {
|
||||||
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
|
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
|
||||||
"child_tags": null,
|
"child_tags": null,
|
||||||
"attrs": null
|
"attrs": null,
|
||||||
|
"text": null
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -148,5 +167,40 @@
|
|||||||
"condition": null
|
"condition": null
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"preset_name": "inserter",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"tags": ["^p$"],
|
||||||
|
"condition": {
|
||||||
|
"parent_tags": null,
|
||||||
|
"child_tags": null,
|
||||||
|
"attrs": null,
|
||||||
|
"text": "\\$\\$[\\s\\S]*?\\$\\$"
|
||||||
|
},
|
||||||
|
"tag_to_insert": {
|
||||||
|
"name": "span",
|
||||||
|
"attrs": [
|
||||||
|
{
|
||||||
|
"name": "class",
|
||||||
|
"value": "math-tex"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"preset_name": "text_replacer",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"tags": ["^p$"],
|
||||||
|
"condition": {
|
||||||
|
"text": "(\\\\nonumber\\\\\\\\\\\\noalign{\\\\pagebreak}[\\s\\S]*?)\\\\"
|
||||||
|
},
|
||||||
|
"text_to_replace": "\\\\"
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -48,17 +48,23 @@
|
|||||||
{
|
{
|
||||||
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
|
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "p"
|
"tag_to_replace": {
|
||||||
|
"name": "p"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^aside$"],
|
"tags": ["^aside$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "div"
|
"tag_to_replace": {
|
||||||
|
"name": "div"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^header$", "^footer$"],
|
"tags": ["^header$", "^footer$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "span"
|
"tag_to_replace": {
|
||||||
|
"name": "span"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^code$", "^kbd$", "^var$"],
|
"tags": ["^code$", "^kbd$", "^var$"],
|
||||||
@@ -67,22 +73,30 @@
|
|||||||
"child_tags": null,
|
"child_tags": null,
|
||||||
"attrs": null
|
"attrs": null
|
||||||
},
|
},
|
||||||
"tag_to_replace": "span"
|
"tag_to_replace": {
|
||||||
|
"name": "span"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^em$"],
|
"tags": ["^em$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "i"
|
"tag_to_replace": {
|
||||||
|
"name": "i"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^b$"],
|
"tags": ["^b$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "strong"
|
"tag_to_replace": {
|
||||||
|
"name": "strong"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^image$"],
|
"tags": ["^image$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_replace": "img"
|
"tag_to_replace": {
|
||||||
|
"name": "img"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -90,12 +104,14 @@
|
|||||||
"preset_name": "attr_replacer",
|
"preset_name": "attr_replacer",
|
||||||
"rules": [
|
"rules": [
|
||||||
{
|
{
|
||||||
"attr": {
|
"tags": ["^img$"],
|
||||||
"name": "xlink:href",
|
|
||||||
"value": ".*"
|
|
||||||
},
|
|
||||||
"condition": {
|
"condition": {
|
||||||
"tags": ["^img$"]
|
"attrs": [
|
||||||
|
{
|
||||||
|
"name": "xlink:href",
|
||||||
|
"value": ".*"
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"attr_to_replace": {
|
"attr_to_replace": {
|
||||||
"name": "src",
|
"name": "src",
|
||||||
@@ -140,12 +156,18 @@
|
|||||||
"child_tags": ":not(:has(code, kbd, var))",
|
"child_tags": ":not(:has(code, kbd, var))",
|
||||||
"attrs": null
|
"attrs": null
|
||||||
},
|
},
|
||||||
"tag_to_insert": "code"
|
"tag_to_insert": {
|
||||||
|
"name": "code",
|
||||||
|
"attrs": []
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"tags": ["^h[1-5]$"],
|
"tags": ["^h[1-5]$"],
|
||||||
"condition": null,
|
"condition": null,
|
||||||
"tag_to_insert": "strong"
|
"tag_to_insert": {
|
||||||
|
"name":"strong",
|
||||||
|
"attrs": []
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -80,7 +80,8 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
docx_file_path = "../../books/docx/AmericanGovernment3e-WEB.docx"
|
|
||||||
|
docx_file_path = "../../books/docx/output.docx"
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="docx", book_id=docx_file_path.split("/")[-1])
|
name="docx", book_id=docx_file_path.split("/")[-1])
|
||||||
locker = Event()
|
locker = Event()
|
||||||
@@ -88,7 +89,6 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
html_converter = Docx2LibreHtml(file_path=docx_file_path,
|
html_converter = Docx2LibreHtml(file_path=docx_file_path,
|
||||||
logger=logger_object, libre_locker=locker)
|
logger=logger_object, libre_locker=locker)
|
||||||
|
|
||||||
html_preprocessor = HtmlPresetsProcessor(
|
html_preprocessor = HtmlPresetsProcessor(
|
||||||
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
||||||
style_preprocessor = StyleReader()
|
style_preprocessor = StyleReader()
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ class EpubBook(BookSolver):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
epub_file_path = "../../books/epub/9780763774134.epub"
|
epub_file_path = "../../books/epub/9781634259804.epub"
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||||
|
|
||||||
|
|||||||
@@ -18,13 +18,14 @@ class HtmlPresetsProcessor:
|
|||||||
"replacer": self._replace_tag,
|
"replacer": self._replace_tag,
|
||||||
"attr_replacer": self._replace_attr,
|
"attr_replacer": self._replace_attr,
|
||||||
"unwrapper": self._unwrap_tag,
|
"unwrapper": self._unwrap_tag,
|
||||||
"inserter": self._insert_tag
|
"inserter": self._insert_tag,
|
||||||
|
"text_replacer": self._replace_text
|
||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _wrap_tag(**kwargs):
|
def _wrap_tag(**kwargs):
|
||||||
kwargs["tag"].wrap(kwargs["body_tag"].new_tag(
|
kwargs["tag"].wrap(kwargs["body_tag"].new_tag(
|
||||||
kwargs["rule"]["tag_to_wrap"]))
|
kwargs["rule"]["tag_to_wrap"]["name"]))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def set_attrs_to_parent(tag, parent_tag):
|
def set_attrs_to_parent(tag, parent_tag):
|
||||||
@@ -98,24 +99,31 @@ class HtmlPresetsProcessor:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _replace_tag(**kwargs):
|
def _replace_tag(**kwargs):
|
||||||
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
|
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"]
|
||||||
kwargs["tag"].name = tag_to_replace
|
kwargs["tag"].name = tag_to_replace
|
||||||
|
if kwargs["rule"]["tag_to_replace"].get("attrs"):
|
||||||
|
dict_attributes = {attr["name"]: attr["value"]
|
||||||
|
for attr in kwargs["rule"]["tag_to_replace"]["attrs"]}
|
||||||
|
kwargs["tag"].attrs = dict_attributes
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _replace_attr(**kwargs):
|
def _replace_attr(**kwargs):
|
||||||
attr, attr_value =\
|
attr = kwargs["rule"]["condition"]["attrs"][0]
|
||||||
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
|
attr_name, attr_value =\
|
||||||
|
attr["name"], attr["value"]
|
||||||
attr_to_replace, attr_value_to_replace =\
|
attr_to_replace, attr_value_to_replace =\
|
||||||
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
|
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
|
||||||
if attr_to_replace:
|
if attr_to_replace:
|
||||||
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
|
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \
|
||||||
|
if kwargs["tag"].get(attr_name)\
|
||||||
|
else ""
|
||||||
if attr_value_to_replace:
|
if attr_value_to_replace:
|
||||||
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
|
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
|
||||||
del kwargs["tag"][attr]
|
del kwargs["tag"][attr_name]
|
||||||
elif attr_value_to_replace:
|
elif attr_value_to_replace:
|
||||||
kwargs["tag"].attrs[attr] = attr_value_to_replace
|
kwargs["tag"].attrs[attr_name] = attr_value_to_replace
|
||||||
elif attr:
|
elif attr_name:
|
||||||
del kwargs["tag"][attr]
|
del kwargs["tag"][attr_name]
|
||||||
|
|
||||||
def _unwrap_tag(self, **kwargs):
|
def _unwrap_tag(self, **kwargs):
|
||||||
if kwargs["tag"].parent:
|
if kwargs["tag"].parent:
|
||||||
@@ -124,14 +132,24 @@ class HtmlPresetsProcessor:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _insert_tag(**kwargs):
|
def _insert_tag(**kwargs):
|
||||||
|
dict_attributes = {attr["name"]: attr["value"]
|
||||||
|
for attr in kwargs["rule"]["tag_to_insert"]["attrs"]}
|
||||||
tag_to_insert = \
|
tag_to_insert = \
|
||||||
kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
|
kwargs["body_tag"].new_tag(
|
||||||
|
kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes)
|
||||||
# insert all items that was in tag to subtag and remove from tag
|
# insert all items that was in tag to subtag and remove from tag
|
||||||
for content in reversed(kwargs["tag"].contents):
|
for content in reversed(kwargs["tag"].contents):
|
||||||
tag_to_insert.insert(0, content.extract())
|
tag_to_insert.insert(0, content.extract())
|
||||||
# wrap subtag with items
|
# wrap subtag with items
|
||||||
kwargs["tag"].append(tag_to_insert)
|
kwargs["tag"].append(tag_to_insert)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _replace_text(**kwargs):
|
||||||
|
if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string):
|
||||||
|
new_text = re.sub(re.compile(
|
||||||
|
kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string)
|
||||||
|
kwargs["tag"].string.replace_with(new_text)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _process_tags(body_tag: BeautifulSoup,
|
def _process_tags(body_tag: BeautifulSoup,
|
||||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
|
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
|
||||||
@@ -171,11 +189,12 @@ class HtmlPresetsProcessor:
|
|||||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
|
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
|
||||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
action(body_tag=body_tag, tag=tag, rule=rule)
|
||||||
# attr replacer
|
elif condition_on_tag[0] == "text":
|
||||||
elif condition_on_tag[0] == "tags":
|
# find all tags that are in List of tags and tags that contains required text
|
||||||
attr = rule["attr"]
|
for tag in body_tag.find_all(
|
||||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
|
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))",
|
||||||
{attr['name']: re.compile(fr"{attr['value']}")}):
|
t.name) and re.search(re.compile(rule["condition"]["text"]),
|
||||||
|
t.text)):
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
action(body_tag=body_tag, tag=tag, rule=rule)
|
||||||
else:
|
else:
|
||||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
|
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
|
||||||
|
|||||||
Reference in New Issue
Block a user