Create preset for wrapping tags with tables

This commit is contained in:
Kiryl
2022-06-24 17:12:21 +03:00
parent d91f6aba4a
commit f690412f5c
2 changed files with 71 additions and 97 deletions

View File

@@ -103,6 +103,61 @@ def _wrap_strings_with_p(chapter_tag):
node.replace_with(p_tag)
def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps <tag> with <table>

    Iterates over LiveCartaConfig.WRAP_TAGS_WITH_TABLE (tag names -> rule):
    - rule is a tuple (attribute, regex): tags whose attribute matches the regex are wrapped
    - otherwise rule is a collection of attribute names: tags that carry
      at least one of these attributes are wrapped
    Every selected tag is put into a one-cell table, passed to
    _add_span_to_save_ids_for_links and then unwrapped, so only its children
    stay inside the cell.
    """

    def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
        """Function builds <table><tbody><tr><td> around the tag, adds <br> after the table"""
        # width is written as a percentage: drop a "%" the attribute may already
        # carry, otherwise the style becomes the invalid "width:50%%;"
        width = str(width or "").rstrip("%") or "100"
        table = chapter_tag.new_tag("table")
        # None is treated like the default "" so no valueless attribute is emitted
        table.attrs["border"] = "" if border is None else border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody, tr, td = \
            chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
        if bg_color is not None:
            # a None value would be serialized as a bare "bgcolor" attribute
            td.attrs["bgcolor"] = bg_color
        tag_to_be_wrapped.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def process_tag_using_table(tag_to_wrap):
        """Function wraps one tag, keeping its width/border/bgcolor, then unwraps it"""
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=tag_to_wrap,
            width=tag_to_wrap.attrs.get("width") or "100",
            border=tag_to_wrap.attrs.get("border") or "",
            bg_color=tag_to_wrap.attrs.get("bgcolor") or None)
        _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
        tag_to_wrap.unwrap()

    for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if isinstance(attrs, tuple):
            # (attribute, regex) rule
            attr, val = attrs[0], attrs[1]
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(val)}):
                process_tag_using_table(tag_to_wrap)
        else:
            # rule is a collection of attribute names, one match is enough
            for tag_to_wrap in chapter_tag.find_all(tags_to_wrap):
                if any(attr_name in attrs for attr_name in tag_to_wrap.attrs):
                    process_tag_using_table(tag_to_wrap)
def _tags_to_correspond_livecarta_tag(chapter_tag):
    """
    Function renames tags to their livecarta counterparts

    Each key of LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
    is a group of tag-name regexes, the value is the new tag name.
    """
    replacements = LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
    for name_patterns, livecarta_name in replacements.items():
        for name_pattern in name_patterns:
            for found_tag in chapter_tag.find_all(re.compile(name_pattern)):
                # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                found_tag.name = livecarta_name
def _unwrap_tags(chapter_tag):
    """
    Function unwraps every tag listed in LiveCartaConfig.TAGS_TO_UNWRAP;
    before unwrapping, the tag is passed to _add_span_to_save_ids_for_links
    (moves its id to a span)
    """
    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
        found_tags = chapter_tag.find_all(tag_name)
        for found_tag in found_tags:
            _add_span_to_save_ids_for_links(found_tag, chapter_tag)
            found_tag.unwrap()
def _remove_headings_content(content_tag, title_of_chapter: str):
"""
Function
@@ -138,23 +193,6 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
break
def _tags_to_correspond_livecarta_tag(chapter_tag):
    """
    Function renames tags to their livecarta counterparts

    Each key of LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
    is a group of tag-name regexes, the value is the new tag name.
    """
    replacements = LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
    for name_patterns, livecarta_name in replacements.items():
        for name_pattern in name_patterns:
            for found_tag in chapter_tag.find_all(re.compile(name_pattern)):
                # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                found_tag.name = livecarta_name
def _unwrap_tags(chapter_tag):
    """
    Function unwraps every tag listed in LiveCartaConfig.TAGS_TO_UNWRAP;
    before unwrapping, the tag is passed to _add_span_to_save_ids_for_links
    (moves its id to a span)
    """
    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
        found_tags = chapter_tag.find_all(tag_name)
        for found_tag in found_tags:
            _add_span_to_save_ids_for_links(found_tag, chapter_tag)
            found_tag.unwrap()
# todo remove
def _process_lists(chapter_tag: BeautifulSoup):
"""
@@ -181,13 +219,11 @@ def _preprocess_table(chapter_tag: BeautifulSoup):
"""Function to preprocess tables and tags(td|th|tr): style"""
tables = chapter_tag.find_all("table")
for table in tables:
t_tags = table.find_all(re.compile("td|th|tr"))
for t_tag in t_tags:
style = t_tag.get("style")
for t_tag in table.find_all(re.compile("td|th|tr")):
width = ""
if style:
if t_tag.get("style"):
width_match = re.search(
r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
if width_match:
size = width_match.group(1)
width = size + "px"
@@ -197,9 +233,8 @@ def _preprocess_table(chapter_tag: BeautifulSoup):
if t_tag.attrs.get("style"):
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
"border:0;", "")
elif t_tag.attrs.get("style") == "":
del t_tag.attrs["style"]
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
del t_tag.attrs["style"]
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
table.attrs["border"] = "1"
@@ -254,72 +289,6 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
pre.append(code)
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
    """
    Function wraps <tag> with <table>

    Builds <table><tbody><tr><td> around tag_to_be_wrapped, inserts <br>
    right after the table and returns the table.
    - width: written into the table style as a percentage
    - border: value of the table border attribute, None is treated like ""
    - bg_color: bgcolor of the cell, the attribute is set only when a value is given
    """
    # drop a "%" the value may already carry, otherwise the style
    # becomes the invalid "width:50%%;"
    width = str(width or "").rstrip("%") or "100"
    table = chapter_tag.new_tag("table")
    # None is treated like the default "" so no valueless attribute is emitted
    table.attrs["border"] = "" if border is None else border
    table.attrs["align"] = "center"
    table.attrs["style"] = f"width:{width}%;"
    tbody, tr, td = \
        chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
    if bg_color is not None:
        # a None value would be serialized as a bare "bgcolor" attribute
        td.attrs["bgcolor"] = bg_color
    tag_to_be_wrapped.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
    return table
def _preprocess_div_tags(chapter_tag):
    """
    Function processes every <div>:
    - <div> with a width, border or bgcolor attribute is wrapped with <table>
      and then unwrapped
    - any other <div> is renamed to <p>
    """
    table_attrs = ("width", "border", "bgcolor")
    for div_tag in chapter_tag.find_all("div"):
        if not any(attr_name in table_attrs for attr_name in div_tag.attrs):
            # nothing table-like on this div: it becomes a plain paragraph
            div_tag.name = "p"
            continue

        div_attrs = div_tag.attrs
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=div_tag,
            width=div_attrs.get("width") or "100",
            border=div_attrs.get("border") or None,
            bg_color=div_attrs.get("bgcolor") or None)
        _add_span_to_save_ids_for_links(div_tag, chapter_tag)
        div_tag.unwrap()
def _clean_wiley_block(block):
    """
    Function cleans a block:
    - extracts <p> tags whose class matches ".+ hr"
    - renames the first tag found by "h[1-9]" to <p> and puts <br> before it
    """
    for hr_paragraph in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
        hr_paragraph.extract()

    heading = block.find(re.compile("h[1-9]"))
    if not heading:
        return
    heading.name = "p"
    heading.insert_before(BeautifulSoup(features="lxml").new_tag("br"))
def _preprocess_block_tags(chapter_tag: Tag):
    """
    Function preprocessing <block> tags

    <blockquote> and <p> tags whose class matches "feature[1234]" are cleaned
    with _clean_wiley_block and wrapped with <table>; the cell is coloured for
    feature1 / feature2. A <blockquote> additionally gets a <br> after it and
    is unwrapped.
    """
    feature_class = re.compile("feature[1234]")

    for block in chapter_tag.find_all("blockquote", attrs={"class": feature_class}):
        _clean_wiley_block(block)
        _wrap_tag_with_table(chapter_tag, block, bg_color=_feature_block_color(block))
        block.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        block.unwrap()

    for future_block in chapter_tag.find_all("p", attrs={"class": feature_class}):
        _clean_wiley_block(future_block)
        _wrap_tag_with_table(chapter_tag, future_block, bg_color=_feature_block_color(future_block))


def _feature_block_color(tag):
    """Function returns background colour of a feature block or None"""
    # HTML parsers of BeautifulSoup store "class" as a list of values, so
    # comparing the attribute with a plain string never matches
    classes = tag.attrs.get("class") or []
    if isinstance(classes, str):
        # the attribute may still be one string (e.g. XML parser)
        classes = classes.split()
    if "feature2" in classes:
        return "#EEEEEE"
    if "feature1" in classes:
        return "#DDDDDD"
    return None
def _class_removing(chapter_tag):
for tag in chapter_tag.find_all(recursive=True):
if tag.attrs.get("class") \
@@ -356,6 +325,8 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
# 2. wrap NavigableString with tag <p>
_wrap_strings_with_p(content_tag)
_wrap_tags_with_table(content_tag)
_tags_to_correspond_livecarta_tag(content_tag)
_unwrap_tags(content_tag)
@@ -365,12 +336,10 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
_remove_headings_content(content_tag, title_str)
# 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
_process_lists(content_tag)
_process_lists(content_tag) # todo regex
_preprocess_table(content_tag)
_preprocess_code_tags(content_tag)
_preprocess_pre_tags(content_tag)
_preprocess_div_tags(content_tag)
_preprocess_block_tags(content_tag)
_preprocess_code_tags(content_tag) # todo regex
_preprocess_pre_tags(content_tag) # todo regex
# 5. remove classes that weren't created by converter
_class_removing(content_tag)

View File

@@ -115,8 +115,13 @@ class LiveCartaConfig:
r"(^h[1-9]$)": ["list-style-type"]
}
# Preset of tags that are wrapped with <table>: tag names -> rule.
# A list rule holds attribute names (a tag with any of them is wrapped),
# a tuple rule is (attribute, regex the attribute value has to match).
WRAP_TAGS_WITH_TABLE = {
    ("div",): ["width", "border", "bgcolor"],
    ("section", "blockquote"): ("class", r"feature[1234]"),
}
REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = {
(r"^h[6-9]$", "figure$", "section$"): "p",
(r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p",
("^aside$",): "blockquote",
("^header$", "^footer$"): "span",
("^b$",): "strong",