forked from LiveCarta/BookConverter
307 lines
11 KiB
Python
307 lines
11 KiB
Python
import re
|
|
|
|
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
|
|
|
from src.livecarta_config import LiveCartaConfig
|
|
|
|
|
|
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
    """
    Function keeps a link target alive for a tag that is about to be
    removed (unwrapped/extracted).

    When tag_to_be_removed has a non-empty id, a <span> carrying that id
    (and the tag's class, or "" when it has none) is inserted right before it.

    Parameters
    ----------
    tag_to_be_removed: Tag or NavigableString
        Element that is about to be removed. Elements without attributes
        (plain strings) are accepted and ignored
    chapter_tag: BeautifulSoup
        Object used to create the new span. If it is not a BeautifulSoup
        object, a throwaway soup is used as the factory instead

    Returns
    -------
    None
        the tree around tag_to_be_removed is updated in place

    """
    # Strings do not expose ``attrs``; there is no id to preserve for them.
    attrs = getattr(tag_to_be_removed, "attrs", None)
    if not attrs or not attrs.get("id"):
        return

    # new_tag() is a method of BeautifulSoup objects, not of plain tags, so
    # fall back to a standalone soup when a nested Tag was passed in.
    if isinstance(chapter_tag, BeautifulSoup):
        factory = chapter_tag
    else:
        factory = BeautifulSoup(features="lxml")

    anchor_span = factory.new_tag("span")
    anchor_span.attrs["id"] = attrs["id"]
    anchor_span.attrs["class"] = attrs.get("class") or ""
    # NOTE(review): the non-breaking space is presumably there so the span
    # is not treated as empty downstream - confirm.
    anchor_span.string = "\xa0"
    tag_to_be_removed.insert_before(anchor_span)
|
|
|
|
|
|
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    Function extracts from html_soup all nodes of the chapter that starts
    at the chapter mark with id first_id.

    Nodes are collected from the sibling right after the mark up to (not
    including) the next sibling that is itself a chapter mark, or to the
    last sibling. The collected nodes are removed from html_soup.

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags

    Raises
    ------
    AssertionError
        if html_soup has no chapter mark with id first_id

    """
    chapter_mark_class = "converter-chapter-mark"

    def _is_chapter_mark(node) -> bool:
        """Tell whether node is a tag carrying the chapter-mark class."""
        if isinstance(node, NavigableString):
            return False
        classes = node.attrs.get("class")
        # class is a list when it comes from the parser and a plain string
        # when it was assigned by hand; accept both forms.
        if isinstance(classes, str):
            classes = classes.split()
        return chapter_mark_class in (classes or [])

    start_mark = html_soup.find(
        attrs={"id": first_id, "class": chapter_mark_class})
    if start_mark is None:
        # Raised explicitly (not via ``assert``) so the check survives -O.
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    node = start_mark.next_sibling
    # ``is not None`` on purpose: an empty string node is falsy but is still
    # a sibling that must not end the walk.
    while node is not None and not _is_chapter_mark(node):
        tags.append(node)
        node = node.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()

    return tags
|
|
|
|
|
|
def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title

    The title is parsed as HTML, reduced to its text, every whitespace
    character (including non-breaking space) is replaced by a plain space
    and the result is stripped.

    Parameters
    ----------
    title_of_chapter: str
        Raw chapter title, may contain markup

    Returns
    -------
    title_str: str
        cleaned title, "" when the title holds no text

    """
    # get_text() instead of .string: .string is None as soon as the title is
    # empty or holds more than one node (e.g. inline markup), which made the
    # re.sub() below fail.
    title_str = BeautifulSoup(title_of_chapter, features="lxml").get_text()
    # clean extra whitespace characters ([\r\n\t\f\v ])
    title_str = re.sub(r"[\s\xa0]", " ", title_str).strip()
    return title_str
|
|
|
|
|
|
def _remove_comments(chapter_tag):
    """
    Function removes all HTML comments inside chapter_tag, in place

    Comments at any depth are removed, including the ones that are direct
    children of chapter_tag.
    """
    # One pass over all strings; find_all() returns a list, so extracting
    # while looping over it is safe.
    for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
|
|
|
|
|
|
def _wrap_strings_with_p(chapter_tag):
    """
    Function wraps strings that are direct children of chapter_tag with <p>

    Strings that consist only of whitespace / non-breaking spaces are left
    as they are. The new <p> receives the original, unmodified text.
    """
    for child in chapter_tag:
        if not isinstance(child, NavigableString):
            continue
        raw_text = str(child)
        # normalised copy is used only to decide whether the string is blank
        if not re.sub(r"([\s\xa0])", " ", raw_text).strip():
            continue
        paragraph = chapter_tag.new_tag("p")
        paragraph.append(raw_text)
        child.replace_with(paragraph)
|
|
|
|
|
|
def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps <tag> with <table>

    Tags are selected according to LiveCartaConfig.WRAP_TAGS_WITH_TABLE.
    Each selected tag is put into table > tbody > tr > td, a <br> is placed
    after the table, the tag's id is saved in a span and the tag itself is
    unwrapped.
    """

    def _build_table_around(soup, target, width="100", border="", bg_color=None):
        """Put target into a new centered table and return that table."""
        table = soup.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody = soup.new_tag("tbody")
        tr = soup.new_tag("tr")
        td = soup.new_tag("td")
        # NOTE(review): bg_color/border may be None here, in which case the
        # attribute is still set - confirm this is intended.
        td.attrs["bgcolor"] = bg_color
        # nest from the inside out: target -> td -> tr -> tbody -> table
        target.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def _tabulate(target):
        """Wrap target with a table, save its id in a span, then unwrap it."""
        source_attrs = target.attrs
        _build_table_around(
            chapter_tag,
            target,
            width=source_attrs.get("width") or "100",
            border=source_attrs.get("border") or None,
            bg_color=source_attrs.get("bgcolor") or None,
        )
        _add_span_to_save_ids_for_links(target, chapter_tag)
        target.unwrap()

    for tag_names, rule in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if not isinstance(rule, tuple):
            # rule is a collection of attribute names: wrap tags that have
            # at least one of them
            for candidate in chapter_tag.find_all(tag_names):
                if any(name in rule for name in candidate.attrs):
                    _tabulate(candidate)
            continue

        # rule is (attribute name, regex for the attribute value)
        attr_name, attr_pattern = rule[0], rule[1]
        attr_filter = {attr_name: re.compile(fr"{attr_pattern}")}
        for candidate in chapter_tag.find_all(tag_names, attr_filter):
            _tabulate(candidate)
|
|
|
|
|
|
def _tags_to_correspond_livecarta_tag(chapter_tag):
    """
    Function to replace all tags to correspond livecarta tags

    Driven by LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS,
    whose items map a group of keys to the new tag name. A key is either
    a regex for tag names, or a tuple (mode, parent selector, child selector)
    where mode "parent" renames the selected parent and mode "child" renames
    the children selected inside it.
    """
    default_child_style = "font-size: 14px; font-family: courier new,courier,monospace;"
    replacements = LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS

    for source_keys, livecarta_name in replacements.items():
        for source_key in source_keys:
            if not isinstance(source_key, tuple):
                for matched in chapter_tag.find_all(re.compile(source_key)):
                    # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                    matched.name = livecarta_name
                continue

            mode = source_key[0]
            parent_selector, child_selector = source_key[1], source_key[2]
            for parent_tag in chapter_tag.select(parent_selector):
                if mode == "parent":
                    parent_tag.name = livecarta_name
                elif mode == "child":
                    for child_tag in parent_tag.select(child_selector):
                        child_tag.name = livecarta_name
                        # renamed children without own style get the default one
                        if not child_tag.attrs.get("style"):
                            child_tag.attrs["style"] = default_child_style
|
|
|
|
|
|
def _unwrap_tags(chapter_tag):
    """
    Function unwrap tags and move id to span

    Every tag matched by a selector from LiveCartaConfig.TAGS_TO_UNWRAP is
    unwrapped; before that its id is saved in a span. For selectors that
    contain ">" the tag's attributes are first copied to its parent.
    """
    for selector in LiveCartaConfig.TAGS_TO_UNWRAP:
        is_subtag_selector = ">" in selector
        for target in chapter_tag.select(selector):
            if is_subtag_selector:
                # tag is a subtag: keep its attributes on the parent
                target.parent.attrs.update(target.attrs)
            _add_span_to_save_ids_for_links(target, chapter_tag)
            target.unwrap()
|
|
|
|
|
|
def _remove_headings_content(content_tag, title_of_chapter: str):
    """
    Function clean/remove headings from chapter in order to avoid
    duplication of chapter titles in the content.

    Children of content_tag are checked in order, blank ones are skipped.
    A child is treated as the title when its normalised, lower-cased text
    equals the title, or contains the title while the element is h1-h3.
    The title element is extracted (for tags, its id is first saved in
    a span) and the search stops. A non-matching tag is searched recursively
    and the loop stops after it; a non-matching string is skipped.

    Parameters
    ----------
    content_tag: soup object
        Tag of the page
    title_of_chapter: str
        Chapter title

    Returns
    -------
    None
        clean/remove headings & add span with id

    """
    title_of_chapter = title_of_chapter.lower()
    for tag in content_tag.contents:
        is_string = isinstance(tag, NavigableString)
        text = tag if is_string else tag.text
        if not re.sub(r"[\s\xa0]", "", text):
            continue  # nothing but whitespace in this node

        text = re.sub(r"[\s\xa0]", " ", text).lower()
        text = text.strip()  # delete extra spaces

        # A string has no tag name of its own: judge it by the tag holding it.
        if is_string:
            element_name = content_tag.name
        else:
            element_name = tag.name or content_tag.name

        if title_of_chapter == text or \
                (title_of_chapter in text and
                 re.findall(r"^h[1-3]$", element_name or "")):
            if not is_string:
                # strings have no attributes, so only tags can own an id
                _add_span_to_save_ids_for_links(tag, content_tag)
            tag.extract()
            return

        if not is_string:
            # The function always returns None, so the former
            # ``if not <recursive call>: break`` always broke out; written
            # explicitly here.
            _remove_headings_content(tag, title_of_chapter)
            break
|
|
|
|
|
|
def _preprocess_table(chapter_tag: BeautifulSoup):
    """
    Function to preprocess tables and tags(td|th|tr): style

    For every td/th/tr inside a table:
      - the width attribute is kept if present, otherwise it is taken from
        a "width: <number>pt|px" declaration in style, otherwise it is "";
      - "border:0;" is removed from style and a style left blank is deleted.
    Tables whose border is missing, "0" or "0px" get border="1".

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Chapter content, updated in place

    Returns
    -------
    None

    """
    for table in chapter_tag.find_all("table"):
        # A list of names is matched exactly. A regex is applied to the tag
        # name as a search, so "td|th|tr" also caught <strong>, <thead>, ...
        for t_tag in table.find_all(["td", "th", "tr"]):
            style = t_tag.attrs.get("style")

            width = ""
            if style:
                # Look-behind (instead of consuming a character) lets the
                # declaration match at the very start of the style string
                # while still rejecting min-width / max-width.
                width_match = re.search(
                    r"(?<![-\w])width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    # NOTE(review): the number is reused as px even when the
                    # unit is pt - confirm that no conversion is wanted.
                    width = width_match.group(1) + "px"

            t_tag.attrs["width"] = t_tag.get("width") or width

            if style:
                style = style.replace("border:0;", "")
                if re.sub(r"[\s\xa0]", "", style) == "":
                    del t_tag.attrs["style"]
                else:
                    t_tag.attrs["style"] = style

        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
|
|
|
|
|
|
def _insert_tags_in_parents(chapter_tag):
    """
    Function inserts a tag into parent tags that do not contain it yet

    LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG maps
    (parent selector, condition selector) to the name of the tag to insert.
    For each selected parent where the condition selector matches nothing,
    all contents of the parent are moved into a new tag of that name, which
    becomes the parent's only child.
    """
    insert_config = LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG

    # parent selector -> condition selector (a repeated parent selector keeps
    # the condition of its last key)
    selector_to_condition = {}
    for config_key in insert_config.keys():
        selector_to_condition[config_key[0]] = config_key[1]

    for parent_selector, condition in selector_to_condition.items():
        for parent_tag in chapter_tag.select(parent_selector):
            if parent_tag.select(condition):
                continue
            wrapper = chapter_tag.new_tag(insert_config[(parent_selector, condition)])
            # move all items that were in parent to the new tag, keeping order
            for child in list(parent_tag.contents):
                wrapper.append(child.extract())
            # parent now holds only the new tag with the items
            parent_tag.append(wrapper)
|
|
|
|
|
|
def _class_removing(chapter_tag):
    """
    Function removes class attribute from all tags inside chapter_tag,
    except for tags that carry the class "link-anchor" or "footnote-element"

    The class value may be a plain string or a list of class names; both
    forms are recognised. Tags with a missing or empty class are untouched.
    """
    preserved_classes = {"link-anchor", "footnote-element"}
    for tag in chapter_tag.find_all(recursive=True):
        class_value = tag.attrs.get("class")
        if not class_value:
            continue
        # Parsed markup stores class as a list, hand-made tags may store
        # a string; comparing the raw value to strings missed the list form.
        if isinstance(class_value, str):
            class_names = class_value.split()
        else:
            class_names = class_value
        if preserved_classes.isdisjoint(class_names):
            del tag.attrs["class"]
|
|
|
|
|
|
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content

    Parameters
    ----------
    title_str: str
        Chapter title, used to find the heading to remove
    content_tag: Tag, soup object
        Chapter content, processed in place
    remove_title_from_chapter: bool
        Whether the heading that repeats the title is removed

    Steps
    ----------
    1. remove comments
    2. wrap strings with <p>, wrap tags with tables, rename tags to
       livecarta tags, unwrap tags
    3. heading removal (only if remove_title_from_chapter)
    4. processing tables, inserting tags in parents
    5. class removal

    Returns
    -------
    content_tag: str
        prepared content

    """
    # steps 1-2: run before the heading is looked up
    steps_before_heading_removal = (
        _remove_comments,
        _wrap_strings_with_p,
        _wrap_tags_with_table,
        _tags_to_correspond_livecarta_tag,
        _unwrap_tags,
    )
    for step in steps_before_heading_removal:
        step(content_tag)

    # step 3
    if remove_title_from_chapter:
        _remove_headings_content(content_tag, title_str)

    # steps 4-5: classes that weren't created by converter are removed last
    steps_after_heading_removal = (
        _preprocess_table,
        _insert_tags_in_parents,
        _class_removing,
    )
    for step in steps_after_heading_removal:
        step(content_tag)

    return str(content_tag)
|