This repository has been archived on 2026-04-06. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BookConverter/src/epub_converter/html_epub_preprocessor.py

307 lines
11 KiB
Python

import re
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.livecarta_config import LiveCartaConfig
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
    """
    Function adds span with id from tag_to_be_removed
    because this tag will be removed(unwrapped/extract)

    The span is inserted right before tag_to_be_removed, carries the same
    ``id`` (and ``class`` if any, otherwise an empty string) and contains a
    single non-breaking space, so links that point to that id keep a target.

    Parameters
    ----------
    tag_to_be_removed: Soup object
        Element that is about to be unwrapped/extracted.
        Elements without attributes (text nodes) or without an id are ignored.
    chapter_tag: BeautifulSoup
        Object used to create the new span. If it is not a BeautifulSoup
        instance, a temporary empty soup is used as the tag factory instead.

    Returns
    -------
    None
        updated body tag

    """
    # Text nodes (NavigableString) have no ``attrs``: there is no id to save.
    attrs = getattr(tag_to_be_removed, "attrs", None)
    if not attrs or not attrs.get("id"):
        return

    # ``new_tag`` is defined on BeautifulSoup only; on a plain Tag the attribute
    # lookup is treated as a child-tag search and is not callable.
    if isinstance(chapter_tag, BeautifulSoup):
        tag_factory = chapter_tag
    else:
        tag_factory = BeautifulSoup(features="lxml")

    new_tag = tag_factory.new_tag("span")
    new_tag.attrs["id"] = attrs["id"] or ""
    new_tag.attrs["class"] = attrs.get("class") or ""
    new_tag.string = "\xa0"
    tag_to_be_removed.insert_before(new_tag)
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """
    After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted

    Parameters
    ----------
    first_id: str
        Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
    href: str
        Name of current chapters file (used only in the error message)
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags: all siblings that follow the chapter mark up to
        (not including) the next chapter mark. They are removed from html_soup.

    Raises
    ------
    AssertionError
        If no chapter mark with the given id exists in html_soup.

    """
    chapter_mark_class = "converter-chapter-mark"

    def _is_chapter_mark(node) -> bool:
        """Return True when node is a tag carrying the chapter-mark class."""
        if isinstance(node, NavigableString):
            return False
        class_value = node.attrs.get("class")
        # class may be stored as a plain string or as a list of class names
        return class_value == chapter_mark_class or \
            (isinstance(class_value, list) and chapter_mark_class in class_value)

    marked_tag = html_soup.find(
        attrs={"id": first_id, "class": chapter_mark_class})
    if marked_tag is None:
        # raised explicitly: an ``assert`` statement is removed under ``python -O``
        raise AssertionError(f"Warning: no match for {first_id, href}")

    tags = []
    next_tag = marked_tag.next_sibling
    # compare with None: an empty text node is falsy but is not the end of siblings
    while next_tag is not None:
        if _is_chapter_mark(next_tag):
            break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling

    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def prepare_title(title_of_chapter: str) -> str:
    """
    Function finalise processing/cleaning title

    Parameters
    ----------
    title_of_chapter: str
        Raw chapter title, may contain html markup

    Returns
    -------
    title_str: str
        Title text where every whitespace/non-breaking-space character is
        replaced with a regular space and leading/trailing spaces are stripped

    """
    title_soup = BeautifulSoup(title_of_chapter, features="lxml")
    title_str = title_soup.string
    if title_str is None:
        # ``.string`` is None for an empty title or a title made of several
        # nodes (e.g. "Part <i>One</i>"); fall back to the concatenated text
        title_str = title_soup.get_text()
    # replace whitespace characters ([\r\n\t\f\v ]) and nbsp with a plain space
    title_str = re.sub(r"[\s\xa0]", " ", title_str).strip()
    return title_str
def _remove_comments(chapter_tag):
    """
    Function removes every html comment from chapter_tag

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        chapter_tag without comments

    """
    # A single search from the root covers comments at any depth, including
    # the ones that are direct children of chapter_tag itself.
    for comment in chapter_tag.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
def _wrap_strings_with_p(chapter_tag):
    """
    Function wraps direct text children of chapter_tag with <p>

    Text nodes that contain only whitespace / non-breaking spaces are left
    untouched; any other text node is replaced by a new <p> holding its text.

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    for child in chapter_tag:
        if not isinstance(child, NavigableString):
            continue
        # only used to decide whether the node has visible text
        visible_text = re.sub(r"([\s\xa0])", " ", str(child)).strip()
        if not visible_text:
            continue
        paragraph = chapter_tag.new_tag("p")
        # the original (unstripped) text goes into the paragraph
        paragraph.append(str(child))
        child.replace_with(paragraph)
def _wrap_tags_with_table(chapter_tag):
    """
    Function wraps <tag> with <table>

    Rules are read from LiveCartaConfig.WRAP_TAGS_WITH_TABLE (tag names -> rule).
    A tuple rule is (attribute name, value pattern): tags whose attribute
    matches the pattern are processed. Any other rule is treated as a container
    of attribute names: tags having at least one of them are processed.
    A processed tag ends up inside table > tbody > tr > td, followed by <br>,
    and the tag itself is unwrapped (its id is saved in a span).

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    def _build_table_around(target, width="100", border="", bg_color=None):
        """Nest target into a new table > tbody > tr > td and return the table"""
        table = chapter_tag.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody = chapter_tag.new_tag("tbody")
        tr = chapter_tag.new_tag("tr")
        td = chapter_tag.new_tag("td")
        td.attrs["bgcolor"] = bg_color
        # wrap from the inside out: target -> td -> tr -> tbody -> table
        inner = target
        for outer in (td, tr, tbody, table):
            inner.wrap(outer)
            inner = outer
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        return table

    def _replace_with_table(tag):
        """Put tag into a table built from its own attributes, then unwrap it"""
        tag_attrs = tag.attrs
        _build_table_around(
            tag,
            width=tag_attrs.get("width") or "100",
            border=tag_attrs.get("border") or None,
            bg_color=tag_attrs.get("bgcolor") or None,
        )
        _add_span_to_save_ids_for_links(tag, chapter_tag)
        tag.unwrap()

    for tag_names, rule in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items():
        if isinstance(rule, tuple):
            attr_name, value_pattern = rule[0], rule[1]
            candidates = chapter_tag.find_all(
                tag_names, {attr_name: re.compile(fr"{value_pattern}")})
        else:
            # filtered lazily, right before each tag is processed
            candidates = (
                found for found in chapter_tag.find_all(tag_names)
                if any(name in rule for name in found.attrs)
            )
        for candidate in candidates:
            _replace_with_table(candidate)
def _tags_to_correspond_livecarta_tag(chapter_tag):
    """
    Function to replace all tags to correspond livecarta tags

    Rules are read from LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS
    (iterable of keys -> new tag name). A tuple key is
    (what to rename: "parent" | "child", parent selector, child selector);
    any other key is used as a regular expression over tag names.

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    # style given to renamed child tags that have no style of their own
    fallback_child_style = "font-size: 14px; font-family: courier new,courier,monospace;"
    rules = LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS

    for keys, new_name in rules.items():
        for key in keys:
            if not isinstance(key, tuple):
                for tag in chapter_tag.find_all(re.compile(key)):
                    # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                    tag.name = new_name
                continue

            target, parent_selector, child_selector = key[0], key[1], key[2]
            for parent_tag in chapter_tag.select(parent_selector):
                if target == "parent":
                    parent_tag.name = new_name
                elif target == "child":
                    for child_tag in parent_tag.select(child_selector):
                        child_tag.name = new_name
                        if not child_tag.attrs.get("style"):
                            child_tag.attrs["style"] = fallback_child_style
def _unwrap_tags(chapter_tag):
    """
    Function unwrap tags and move id to span

    Selectors are read from LiveCartaConfig.TAGS_TO_UNWRAP. For a selector
    that contains ">" the attributes of the matched tag are first copied to
    its parent. Every matched tag gets its id saved in a span and is unwrapped.

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    for selector in LiveCartaConfig.TAGS_TO_UNWRAP:
        # selector describes a subtag ("parent > child")
        targets_subtag = ">" in selector
        for matched in chapter_tag.select(selector):
            if targets_subtag:
                matched.parent.attrs.update(matched.attrs)
            _add_span_to_save_ids_for_links(matched, chapter_tag)
            matched.unwrap()
def _remove_headings_content(content_tag, title_of_chapter: str):
    """
    Function
    clean/remove headings from chapter in order to avoid duplication of chapter titles in the content
    add span with id in order to keep a target for links to the removed heading

    Children of content_tag are scanned in order, blank ones are skipped.
    A child is removed when its text equals the title (case-insensitive), or
    contains the title while the child (or content_tag, for text nodes) is h1-h3.
    The first non-blank child that is a tag and is not the title is searched
    recursively, after which the scan stops. A non-matching text node does not
    stop the scan.

    Parameters
    ----------
    content_tag: soup object
        Tag of the page
    title_of_chapter: str
        Chapter title

    Returns
    -------
    None
        clean/remove headings & add span with id

    """
    title_of_chapter = title_of_chapter.lower()
    for node in content_tag.contents:
        is_text_node = isinstance(node, NavigableString)
        raw_text = node if is_text_node else node.text
        if not re.sub(r"[\s\xa0]", "", raw_text):
            # nothing visible in this node, look at the next one
            continue

        normalized = re.sub(r"[\s\xa0]", " ", raw_text).lower().strip()
        is_title = title_of_chapter == normalized or (
            title_of_chapter in normalized
            and re.findall(r"^h[1-3]$", node.name or content_tag.name)
        )
        if is_title:
            _add_span_to_save_ids_for_links(node, content_tag)
            node.extract()
            return

        if not is_text_node:
            # this function never returns a value, so the scan always stops
            # after the first non-blank tag has been searched
            _remove_headings_content(node, title_of_chapter)
            break
def _preprocess_table(chapter_tag: BeautifulSoup):
    """
    Function to preprocess tables and tags(td|th|tr): style

    For every td/th/tr inside a table:
    - ``width`` attribute is kept if present, otherwise taken from a
      ``width: <number>pt|px`` declaration in style (written as "<number>px"),
      otherwise set to an empty string;
    - "border:0;" is removed from style, and style is dropped if it becomes blank.
    For every table: a missing, "0" or "0px" border is replaced with "1".

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    for table in chapter_tag.find_all("table"):
        # exact names only: a regex filter is applied with search() and would
        # also hit tags such as <thead> or <strong>
        for t_tag in table.find_all(["td", "th", "tr"]):
            width = ""
            style = t_tag.get("style")
            if style:
                # lookbehind skips "max-width"/"border-width" but, unlike a
                # consuming [^-], still matches "width" at the very start
                width_match = re.search(
                    r"(?<!-)width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    width = width_match.group(1) + "px"
            t_tag.attrs["width"] = t_tag.get("width") or width
            if style:
                style = style.replace("border:0;", "")
                if re.sub(r"[\s\xa0]", "", style) == "":
                    del t_tag.attrs["style"]
                else:
                    t_tag.attrs["style"] = style
        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
def _insert_tags_in_parents(chapter_tag):
    """
    Function inserts a wrapper tag into parent tags that lack it

    Rules are read from LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG:
    (parent selector, condition selector) -> name of tag to insert.
    For every parent that has no descendant matching the condition, all of its
    contents are moved into a new tag which becomes the parent's only child.

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    rules = LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG
    # one condition per parent selector: if several rules share a parent
    # selector, the condition of the last one is the one that is kept
    condition_by_parent = {}
    for rule_key in rules.keys():
        condition_by_parent[rule_key[0]] = rule_key[1]

    for parent_selector, condition in condition_by_parent.items():
        for parent_tag in chapter_tag.select(parent_selector):
            if parent_tag.select(condition):
                continue
            wrapper = chapter_tag.new_tag(rules[(parent_selector, condition)])
            # move everything the parent holds into the wrapper, order preserved
            for child in list(parent_tag.contents):
                wrapper.append(child.extract())
            # the wrapper is now the only content of the parent
            parent_tag.append(wrapper)
def _class_removing(chapter_tag):
    """
    Function removes class attribute from tags of chapter_tag

    The attribute is kept only when its whole value is equal to
    "link-anchor" or "footnote-element"; empty values are left as they are.

    Parameters
    ----------
    chapter_tag: soup object
        Tag of the page

    Returns
    -------
    None
        updated chapter_tag

    """
    # kept as a list: the attribute value is compared as a whole by equality
    preserved_values = ["link-anchor", "footnote-element"]
    for tag in chapter_tag.find_all(recursive=True):
        class_value = tag.attrs.get("class")
        if not class_value:
            continue
        if class_value in preserved_values:
            continue
        del tag.attrs["class"]
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content

    Parameters
    ----------
    title_str: str
        Chapter title, used to find the duplicated heading in the content
    content_tag: Tag, soup object
        Chapter content, modified in place
    remove_title_from_chapter: bool
        Whether the heading that repeats the title has to be removed

    Steps
    ----------
    1. comments removal
    2. wrapping of strings with <p>, wrapping of tags with tables,
       replacing of tags with livecarta tags, unwrapping of tags
    3. heading removal (optional)
    4. processing of tables and insertion of tags in parents
    5. class removal

    Returns
    -------
    content_tag: str
        prepared content

    """
    # 1-2. clean up and normalise tags; the order of steps matters
    normalisation_steps = (
        _remove_comments,
        _wrap_strings_with_p,
        _wrap_tags_with_table,
        _tags_to_correspond_livecarta_tag,
        _unwrap_tags,
    )
    for step in normalisation_steps:
        step(content_tag)

    # 3. heading removal
    if remove_title_from_chapter:
        _remove_headings_content(content_tag, title_str)

    # 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
    # 5. remove classes that weren't created by converter
    finishing_steps = (
        _preprocess_table,
        _insert_tags_in_parents,
        _class_removing,
    )
    for step in finishing_steps:
        step(content_tag)

    return str(content_tag)