Update livecarta_config.py with processing changes

This commit is contained in:
Kiryl
2022-06-27 19:16:17 +03:00
parent 9b4ecfd63c
commit eab4f0130a
2 changed files with 11 additions and 4 deletions

View File

@@ -198,7 +198,7 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
text = tag if isinstance(tag, NavigableString) else tag.text
if re.sub(r"[\s\xa0]", "", text):
text = re.sub(r"[\s\xa0]", " ", text).lower()
text = text.strip() # delete extra spaces
text = text.strip() # delete extra spaces
if title_of_chapter == text or \
(title_of_chapter in text and
re.findall(r"^h[1-3]$", tag.name or content_tag.name)):

View File

@@ -120,13 +120,20 @@ class LiveCartaConfig:
("section", "blockquote",) : ("class", r"feature[1234]"),
}
REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = {
"""('what to replace', 'parent tag', 'child tag')"""
REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS = {
(r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p",
("^aside$",): "blockquote",
("^header$", "^footer$"): "span",
("^header$", "^footer$", ("child", ":not(pre)", "code, kbd, var")): "span",
("^b$",): "strong",
# (("parent", ":not(pre)", "code")): "p",
}
""" > == in (p in li)"""
TAGS_TO_UNWRAP = [
"section", "article", "figcaption", "main", "body", "html",
"section", "article", "figcaption", "main", "body", "html", "li > p",
]
INSERT_TAG_IN_PARENT_TAG = {
("pre", "code, kbd, var"): "code",
}