forked from LiveCarta/BookConverter
Create preset for wrapping tags with tables
This commit is contained in:
@@ -103,6 +103,61 @@ def _wrap_strings_with_p(chapter_tag):
|
|||||||
node.replace_with(p_tag)
|
node.replace_with(p_tag)
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_tags_with_table(chapter_tag):
    """Wrap the tags selected by LiveCartaConfig.WRAP_TAGS_WITH_TABLE in a <table>.

    Each config entry maps a group of tag names to a rule:
      * a tuple rule is read as (attribute name, regex); tags are looked up with
        that attribute matching the compiled regex;
      * any other rule is treated as a collection of attribute names; a tag is
        selected when at least one of its own attributes is in that collection.

    Every selected tag is wrapped in table/tbody/tr/td, has its ids preserved via
    _add_span_to_save_ids_for_links and is then unwrapped, so only its children
    remain inside the new <td>.
    """
    def _enclose_in_table(soup, target, width="100", border="", bg_color=None):
        # Builds <table><tbody><tr><td>target</td></tr></tbody></table> in place
        # and puts a <br> right after the table.
        table = soup.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"

        tbody = soup.new_tag("tbody")
        tr = soup.new_tag("tr")
        td = soup.new_tag("td")
        td.attrs["bgcolor"] = bg_color

        # Wrap from the inside out: target -> td -> tr -> tbody -> table.
        inner = target
        for outer in (td, tr, tbody, table):
            inner.wrap(outer)
            inner = outer

        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def _tableize(candidate):
        # Carry width/border/bgcolor of the tag over to the table, falling back
        # to "100" for width and None for the other two when missing or empty.
        own_attrs = candidate.attrs
        _enclose_in_table(
            chapter_tag,
            target=candidate,
            width=own_attrs.get("width") or "100",
            border=own_attrs.get("border") or None,
            bg_color=own_attrs.get("bgcolor") or None,
        )
        _add_span_to_save_ids_for_links(candidate, chapter_tag)
        candidate.unwrap()

    for tag_names, rule in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if not isinstance(rule, tuple):
            for candidate in chapter_tag.find_all(tag_names):
                if any(attr_name in rule for attr_name in candidate.attrs):
                    _tableize(candidate)
            continue

        attr, val = rule[0], rule[1]
        for candidate in chapter_tag.find_all(tag_names, {attr: re.compile(fr"{val}")}):
            _tableize(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def _tags_to_correspond_livecarta_tag(chapter_tag):
    """Rename tags to the tag names LiveCarta expects.

    Every key of LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS is
    iterated as a group of regex patterns; each tag returned by
    find_all(re.compile(pattern)) gets its name replaced by the mapped value.
    """
    replacements = LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
    for patterns, new_name in replacements.items():
        for pattern in patterns:
            # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
            for matched in chapter_tag.find_all(re.compile(pattern)):
                matched.name = new_name
|
||||||
|
|
||||||
|
|
||||||
|
def _unwrap_tags(chapter_tag):
    """Unwrap every tag listed in LiveCartaConfig.TAGS_TO_UNWRAP.

    _add_span_to_save_ids_for_links is called on each tag before it is
    unwrapped, so its id can be kept on a span for links.
    """
    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
        for found in chapter_tag.find_all(tag_name):
            _add_span_to_save_ids_for_links(found, chapter_tag)
            found.unwrap()
|
||||||
|
|
||||||
|
|
||||||
def _remove_headings_content(content_tag, title_of_chapter: str):
|
def _remove_headings_content(content_tag, title_of_chapter: str):
|
||||||
"""
|
"""
|
||||||
Function
|
Function
|
||||||
@@ -138,23 +193,6 @@ def _remove_headings_content(content_tag, title_of_chapter: str):
|
|||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def _tags_to_correspond_livecarta_tag(chapter_tag):
    """Rename tags to the tag names LiveCarta expects.

    Every key of LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS is
    iterated as a group of regex patterns; each tag returned by
    find_all(re.compile(pattern)) gets its name replaced by the mapped value.
    """
    replacements = LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS
    for patterns, new_name in replacements.items():
        for pattern in patterns:
            # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
            for matched in chapter_tag.find_all(re.compile(pattern)):
                matched.name = new_name
|
|
||||||
|
|
||||||
def _unwrap_tags(chapter_tag):
    """Unwrap every tag listed in LiveCartaConfig.TAGS_TO_UNWRAP.

    _add_span_to_save_ids_for_links is called on each tag before it is
    unwrapped, so its id can be kept on a span for links.
    """
    for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP:
        for found in chapter_tag.find_all(tag_name):
            _add_span_to_save_ids_for_links(found, chapter_tag)
            found.unwrap()
|
|
||||||
|
|
||||||
|
|
||||||
# todo remove
|
# todo remove
|
||||||
def _process_lists(chapter_tag: BeautifulSoup):
|
def _process_lists(chapter_tag: BeautifulSoup):
|
||||||
"""
|
"""
|
||||||
@@ -181,13 +219,11 @@ def _preprocess_table(chapter_tag: BeautifulSoup):
|
|||||||
"""Function to preprocess tables and tags(td|th|tr): style"""
|
"""Function to preprocess tables and tags(td|th|tr): style"""
|
||||||
tables = chapter_tag.find_all("table")
|
tables = chapter_tag.find_all("table")
|
||||||
for table in tables:
|
for table in tables:
|
||||||
t_tags = table.find_all(re.compile("td|th|tr"))
|
for t_tag in table.find_all(re.compile("td|th|tr")):
|
||||||
for t_tag in t_tags:
|
|
||||||
style = t_tag.get("style")
|
|
||||||
width = ""
|
width = ""
|
||||||
if style:
|
if t_tag.get("style"):
|
||||||
width_match = re.search(
|
width_match = re.search(
|
||||||
r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
|
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
|
||||||
if width_match:
|
if width_match:
|
||||||
size = width_match.group(1)
|
size = width_match.group(1)
|
||||||
width = size + "px"
|
width = size + "px"
|
||||||
@@ -197,8 +233,7 @@ def _preprocess_table(chapter_tag: BeautifulSoup):
|
|||||||
if t_tag.attrs.get("style"):
|
if t_tag.attrs.get("style"):
|
||||||
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
|
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
|
||||||
"border:0;", "")
|
"border:0;", "")
|
||||||
|
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
|
||||||
elif t_tag.attrs.get("style") == "":
|
|
||||||
del t_tag.attrs["style"]
|
del t_tag.attrs["style"]
|
||||||
|
|
||||||
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
|
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
|
||||||
@@ -254,72 +289,6 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
|||||||
pre.append(code)
|
pre.append(code)
|
||||||
|
|
||||||
|
|
||||||
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
    """Wrap a tag in <table><tbody><tr><td>...</td></tr></tbody></table>.

    The table gets the given border, align="center" and a percentage width
    style; the cell gets the given bgcolor. A <br> is inserted right after the
    table. Returns the new table tag.
    """
    table = chapter_tag.new_tag("table")
    table.attrs["border"] = border
    table.attrs["align"] = "center"
    table.attrs["style"] = f"width:{width}%;"

    tbody = chapter_tag.new_tag("tbody")
    tr = chapter_tag.new_tag("tr")
    td = chapter_tag.new_tag("td")
    td.attrs["bgcolor"] = bg_color

    # Wrap from the inside out: tag -> td -> tr -> tbody -> table.
    inner = tag_to_be_wrapped
    for outer in (td, tr, tbody, table):
        inner.wrap(outer)
        inner = outer

    table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
    return table
|
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_div_tags(chapter_tag):
    """
    Replace every <div>:
    a div carrying a width, border or bgcolor attribute is wrapped in a table
    (those attributes are passed on to the table) and then unwrapped;
    any other div is simply renamed to <p>.
    """
    table_attrs = ["width", "border", "bgcolor"]
    for div in chapter_tag.find_all("div"):
        if not any(attr in table_attrs for attr in div.attrs):
            div.name = "p"
            continue

        own_attrs = div.attrs
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=div,
            width=own_attrs.get("width") or "100",
            border=own_attrs.get("border") or None,
            bg_color=own_attrs.get("bgcolor") or None,
        )
        # Keep the div's id reachable for links before the div itself goes away.
        _add_span_to_save_ids_for_links(div, chapter_tag)
        div.unwrap()
|
|
||||||
|
|
||||||
|
|
||||||
def _clean_wiley_block(block):
    """Clean a block in place.

    Removes every <p> whose class matches ".+ hr", then turns the first tag
    found by the "h[1-9]" name pattern into a <p> preceded by a <br>.
    """
    for rule_paragraph in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
        rule_paragraph.extract()

    heading = block.find(re.compile("h[1-9]"))
    if not heading:
        return
    heading.name = "p"
    heading.insert_before(BeautifulSoup(features="lxml").new_tag("br"))
|
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_block_tags(chapter_tag: Tag):
    """Function preprocessing <block> tags

    Every <blockquote> and every <p> whose class matches "feature[1234]" is
    cleaned with _clean_wiley_block and wrapped in a table. Blocks with class
    feature1 get a #DDDDDD cell background, feature2 gets #EEEEEE, others none.
    A <blockquote> is additionally followed by a <br> and unwrapped; a <p> is
    left in place inside the table cell.
    """
    feature_class = re.compile("feature[1234]")

    def _background_for(tag):
        # BeautifulSoup's HTML parsers expose "class" as a list of names, so a
        # plain `== "feature1"` comparison never matches; normalise to a list
        # (a plain string is still accepted) and test membership instead.
        classes = tag.attrs.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        # feature2 is checked first because it took precedence in the original
        # sequence of assignments.
        if "feature2" in classes:
            return "#EEEEEE"
        if "feature1" in classes:
            return "#DDDDDD"
        return None

    for block in chapter_tag.find_all("blockquote", attrs={"class": feature_class}):
        _clean_wiley_block(block)
        _wrap_tag_with_table(chapter_tag, block, bg_color=_background_for(block))
        block.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        block.unwrap()

    for future_block in chapter_tag.find_all("p", attrs={"class": feature_class}):
        _clean_wiley_block(future_block)
        _wrap_tag_with_table(chapter_tag, future_block, bg_color=_background_for(future_block))
|
|
||||||
|
|
||||||
|
|
||||||
def _class_removing(chapter_tag):
|
def _class_removing(chapter_tag):
|
||||||
for tag in chapter_tag.find_all(recursive=True):
|
for tag in chapter_tag.find_all(recursive=True):
|
||||||
if tag.attrs.get("class") \
|
if tag.attrs.get("class") \
|
||||||
@@ -356,6 +325,8 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
# 2. wrap NavigableString with tag <p>
|
# 2. wrap NavigableString with tag <p>
|
||||||
_wrap_strings_with_p(content_tag)
|
_wrap_strings_with_p(content_tag)
|
||||||
|
|
||||||
|
_wrap_tags_with_table(content_tag)
|
||||||
|
|
||||||
_tags_to_correspond_livecarta_tag(content_tag)
|
_tags_to_correspond_livecarta_tag(content_tag)
|
||||||
|
|
||||||
_unwrap_tags(content_tag)
|
_unwrap_tags(content_tag)
|
||||||
@@ -365,12 +336,10 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
_remove_headings_content(content_tag, title_str)
|
_remove_headings_content(content_tag, title_str)
|
||||||
|
|
||||||
# 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
|
# 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
|
||||||
_process_lists(content_tag)
|
_process_lists(content_tag) # todo regex
|
||||||
_preprocess_table(content_tag)
|
_preprocess_table(content_tag)
|
||||||
_preprocess_code_tags(content_tag)
|
_preprocess_code_tags(content_tag) # todo regex
|
||||||
_preprocess_pre_tags(content_tag)
|
_preprocess_pre_tags(content_tag) # todo regex
|
||||||
_preprocess_div_tags(content_tag)
|
|
||||||
_preprocess_block_tags(content_tag)
|
|
||||||
|
|
||||||
# 5. remove classes that weren't created by converter
|
# 5. remove classes that weren't created by converter
|
||||||
_class_removing(content_tag)
|
_class_removing(content_tag)
|
||||||
|
|||||||
@@ -115,8 +115,13 @@ class LiveCartaConfig:
|
|||||||
r"(^h[1-9]$)": ["list-style-type"]
|
r"(^h[1-9]$)": ["list-style-type"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
WRAP_TAGS_WITH_TABLE = {
|
||||||
|
("div",) :["width", "border", "bgcolor"],
|
||||||
|
("section", "blockquote",) : ("class", r"feature[1234]"),
|
||||||
|
}
|
||||||
|
|
||||||
REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = {
|
REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = {
|
||||||
(r"^h[6-9]$", "figure$", "section$"): "p",
|
(r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p",
|
||||||
("^aside$",): "blockquote",
|
("^aside$",): "blockquote",
|
||||||
("^header$", "^footer$"): "span",
|
("^header$", "^footer$"): "span",
|
||||||
("^b$",): "strong",
|
("^b$",): "strong",
|
||||||
|
|||||||
Reference in New Issue
Block a user