Updates to presets

This commit is contained in:
Kiryl
2022-06-21 11:47:26 +03:00
parent 73513e63b5
commit c62192d028
9 changed files with 668 additions and 739 deletions

View File

@@ -222,7 +222,6 @@ class HTMLDocxPreprocessor:
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")

View File

@@ -11,13 +11,13 @@ from src.livecarta_config import LiveCartaConfig
def get_text_color(x):
color = str2hex(x)
color = color if color not in ['#000000', '#000', 'black'] else ''
color = color if color not in ["#000000", "#000", "black"] else ""
return color
def get_bg_color(x):
color = str2hex(x)
color = color if color not in ['#ffffff', '#fff', 'white'] else ''
color = color if color not in ["#ffffff", "#fff", "white"] else ""
return color
@@ -43,25 +43,25 @@ def convert_tag_style_values(size_value: str) -> str:
return LiveCartaConfig.sizes_px[last_possible_size_index]
font_size_regexp = re.compile(
r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)")
has_style_attrs = re.search(font_size_regexp, size_value)
if has_style_attrs:
if has_style_attrs.group(1):
size_value = float(size_value.replace('%', '')) / 100.0
size_value = float(size_value.replace("%", "")) / 100.0
return find_closest_size(size_value)
elif has_style_attrs.group(3):
size_value = float(size_value.replace('em', ''))
size_value = float(size_value.replace("em", ""))
return find_closest_size(size_value)
elif has_style_attrs.group(5):
return size_value.replace('pt', 'px')
return size_value.replace("pt", "px")
else:
return ''
return ""
return size_value
def convert_indents_tag_values(size_value: str) -> str:
"""
Function converts values of ['text-indent', 'margin-left', 'margin']
Function converts values of ["text-indent", "margin-left", "margin"]
Parameters
----------
size_value: str
@@ -71,12 +71,12 @@ def convert_indents_tag_values(size_value: str) -> str:
size_value: str
"""
if len(size_value.split(' ')) == 3:
if len(size_value.split(" ")) == 3:
size_value = convert_tag_style_values(size_value.split(
' ')[-2]) # returns middle value
" ")[-2]) # returns middle value
else:
size_value = convert_tag_style_values(size_value.split(
' ')[-1]) # returns last value
" ")[-1]) # returns last value
return size_value
@@ -87,35 +87,35 @@ If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
'text-indent': [],
'font-variant': ['small-caps'],
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
'align': [],
'font': [],
'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys()
"text-indent": [],
"font-variant": ["small-caps"],
"text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
"align": [],
"font": [],
"font-family": [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys()
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
'font-size': [],
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
'font-style': ['italic'], # <i>
'text-decoration': ['underline', 'line-through'], # <u> , <s>
'text-decoration-line': ['underline', 'line-through'], # <u> , <s>
'vertical-align': ['super'], # <sup>
'color': [],
'background-color': [],
'background': [],
'width': [],
'border': [],
'border-top-width': [],
'border-right-width': [],
'border-left-width': [],
'border-bottom-width': [],
'border-top': [],
'border-bottom': [],
'list-style-type': [],
'list-style-image': [],
'margin-left': [],
'margin-top': [],
'margin': [],
"font-size": [],
"font-weight": ["bold", "600", "700", "800", "900"], # <strong>
"font-style": ["italic"], # <i>
"text-decoration": ["underline", "line-through"], # <u> , <s>
"text-decoration-line": ["underline", "line-through"], # <u> , <s>
"vertical-align": ["super"], # <sup>
"color": [],
"background-color": [],
"background": [],
"width": [],
"border": [],
"border-top-width": [],
"border-right-width": [],
"border-left-width": [],
"border-bottom-width": [],
"border-top": [],
"border-bottom": [],
"list-style-type": [],
"list-style-image": [],
"margin-left": [],
"margin-top": [],
"margin": [],
}
"""
@@ -125,28 +125,28 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou
to suit livecarta style convention.
"""
LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents_tag_values,
'font-variant': lambda x: x,
'text-align': lambda x: x,
'font': lambda x: '',
'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title()))
"text-indent": convert_indents_tag_values,
"font-variant": lambda x: x,
"text-align": lambda x: x,
"font": lambda x: "",
"font-family": lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title()))
or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)),
'font-size': convert_tag_style_values,
'color': get_text_color,
'background-color': get_bg_color,
'background': get_bg_color,
'border': lambda x: x if x != '0' else '',
'border-top-width': lambda x: x if x != '0' else '',
'border-right-width': lambda x: x if x != '0' else '',
'border-left-width': lambda x: x if x != '0' else '',
'border-bottom-width': lambda x: x if x != '0' else '',
'border-top': lambda x: x if x != '0' else '',
'border-bottom': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc',
'list-style-image': lambda x: 'disc',
'margin-left': convert_indents_tag_values,
'margin-top': convert_tag_style_values,
'margin': convert_indents_tag_values
"font-size": convert_tag_style_values,
"color": get_text_color,
"background-color": get_bg_color,
"background": get_bg_color,
"border": lambda x: x if x != "0" else "",
"border-top-width": lambda x: x if x != "0" else "",
"border-right-width": lambda x: x if x != "0" else "",
"border-left-width": lambda x: x if x != "0" else "",
"border-bottom-width": lambda x: x if x != "0" else "",
"border-top": lambda x: x if x != "0" else "",
"border-bottom": lambda x: x if x != "0" else "",
"list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
"list-style-image": lambda x: "disc",
"margin-left": convert_indents_tag_values,
"margin-top": convert_tag_style_values,
"margin": convert_indents_tag_values
}
@@ -155,17 +155,17 @@ def update_inline_styles_to_livecarta_convention(split_style: list):
style_name, style_value = style.split(":")
if style_name not in LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
split_style[i] = ''
split_style[i] = ""
return split_style
cleaned_value = style_value.replace('\"', '').split()[-1]
cleaned_value = style_value.replace("\"", "").split()[-1]
constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
style_name)
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
style_name]
if constraints_on_value and value_not_in_possible_values_list:
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
split_style[i] = ''
split_style[i] = ""
else:
if style_name in LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data
@@ -177,14 +177,14 @@ def update_inline_styles_to_livecarta_convention(split_style: list):
def build_inline_style_content(style: str) -> str:
"""Build inline style with livecarta convention"""
# replace all spaces between '; & letter' to ';'
# replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", style)
# when we split style by ';', last element of the list is '' - None
# when we split style by ";", last element of the list is "" - None
# remove it
split_style: list = list(filter(None, style.split(';')))
# replace all spaces between ': & letter' to ':'
split_style: list = list(filter(None, style.split(";")))
# replace all spaces between ": & letter" to ":"
split_style = [el.replace(
re.search(r'(:\s*)', el).group(1), ':') for el in split_style]
re.search(r"(:\s*)", el).group(1), ":") for el in split_style]
split_style = update_inline_styles_to_livecarta_convention(split_style)
style = "; ".join(split_style)
@@ -195,17 +195,17 @@ def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRul
style_type: cssutils.css.property.Property):
if style_type.name not in LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = ''
css_rule.style[style_type.name] = ""
return
cleaned_value = style_type.value.replace('\"', '')
cleaned_value = style_type.value.replace("\"", "")
constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
style_type.name)
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
style_type.name]
if constraints_on_value and value_not_in_possible_values_list:
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = ''
css_rule.style[style_type.name] = ""
else:
if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data
@@ -227,12 +227,12 @@ def build_css_file_content(css_content: str) -> str:
return css_text
if __name__ == '__main__':
file = '../../epub/9781627222174.epub'
if __name__ == "__main__":
file = "../../epub/9781627222174.epub"
ebooklib_book = epub.read_epub(file)
css_ = ebooklib_book.get_item_with_href('css/epub.css')
css_ = ebooklib_book.get_item_with_href("css/epub.css")
css_ = css_.get_content().decode()
css_cleaned = build_css_file_content(css_)
html_ = ebooklib_book.get_item_with_href(
'pr01s05.xhtml').get_body_content().decode()
html_soup = BeautifulSoup(html_, features='lxml')
"pr01s05.xhtml").get_body_content().decode()
html_soup = BeautifulSoup(html_, features="lxml")

View File

@@ -17,10 +17,12 @@ from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
from src.epub_converter.html_epub_preprocessor import process_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content
class EpubConverter:
@@ -57,26 +59,27 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log('Image processing.')
self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
self.logger.log('HTML files reading.')
self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
# TODO Presets
self.logger.log('Process CSS inline styles.')
self.logger.log("Process CSS inline styles.")
self.process_inline_styles_in_html_soup()
self.logger.log('CSS files processing.')
self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.')
self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup()
self.logger.log('Footnotes processing.')
# todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup)
@@ -85,27 +88,28 @@ class EpubConverter:
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs['data-id'] = i + 1
noteref.attrs['id'] = f'footnote-{i + 1}'
footnote.attrs['href'] = f'#footnote-{i + 1}'
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
self.logger.log('TOC processing.')
self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
if self.is_toc_empty():
self.build_adjacency_list_from_spine()
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.')
self.label_chapters_ids_with_tmp_id()
self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_lc_id()
# used only after parsed toc, ids from toc needed
self.process_html_soup_structure_to_line()
self.process_internal_links()
self.logger.log(f'Building chapters content.')
self.logger.log(f"Define chapters content.")
self.define_chapters_content()
self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
@@ -115,7 +119,7 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content()
# html.parser closes tags if needed
soup = BeautifulSoup(html_body_text, features='html.parser')
soup = BeautifulSoup(html_body_text, features="html.parser")
nodes[item.file_name] = soup
return nodes
@@ -123,15 +127,15 @@ class EpubConverter:
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
join(html_folder, path_to_css_from_html)).replace("\\", "/")
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
# if in css file we import another css
if "@import" in str(css_obj.content):
path_to_css_from_root = "css/" + \
re.search('"(.*)"', str(css_obj.content)).group(1)
re.search("'(.*)'", str(css_obj.content)).group(1)
css_obj = self.ebooklib_book.get_item_with_href(
path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.'
assert css_obj, f"Css style {css_href} was not in manifest."
css_content: str = css_obj.get_content().decode()
return css_content
@@ -140,11 +144,11 @@ class EpubConverter:
for html_href in self.html_href2html_body_soup:
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')})
attrs={"style": re.compile(".*")})
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs['style']
tag_initial_inline_style.attrs['style'] = \
inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]:
@@ -167,23 +171,23 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
html_href = item.file_name
soup_html_content = BeautifulSoup(html_content, features='lxml')
soup_html_content = BeautifulSoup(html_content, features="lxml")
# check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language)
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]):
continue
css_href = tag.attrs.get('href')
css_href = tag.attrs.get("href")
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all('style')):
for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
html_href2css_href[html_href].append(f'href{i}')
css_href2css_content[f'href{i}'] = build_css_file_content(
html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f"href{i}"] = build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
@@ -195,7 +199,7 @@ class EpubConverter:
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
css = ''
css = ""
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
@@ -243,7 +247,7 @@ class EpubConverter:
sub_nodes = []
for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
if ("section" in first.title.lower() or "part" in first.title.lower()) and lvl == 1:
self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl))
else:
@@ -267,7 +271,7 @@ class EpubConverter:
self.adjacency_list[-1] = nodes
else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}"
def is_toc_empty(self) -> bool:
"""Function checks is toc empty"""
@@ -297,36 +301,36 @@ class EpubConverter:
"""Function add files that not added to adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
Section(f"To check #{i}, filename: {file}", file))
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_tmp_id(self):
def label_chapters_ids_with_lc_id(self):
for html_href in self.html_href2html_body_soup:
ids = self.html_href2subchapter_ids[html_href]
for i in ids:
soup = self.html_href2html_body_soup[html_href]
tag = soup.find(id=i)
new_h = soup.new_tag('tmp')
new_h.attrs['class'] = 'converter-chapter-mark'
new_h.attrs['id'] = i
new_h = soup.new_tag("tmp")
new_h.attrs["class"] = "converter-chapter-mark"
new_h.attrs["id"] = i
tag.insert_before(new_h)
def process_html_soup_structure_to_line(self):
# go to line structure
for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
self.html_href2html_body_soup[html_href] = process_structural_tags(soup)
@staticmethod
def create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
@staticmethod
def create_new_anchor_span(soup, id_):
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs['id'] = id_
new_anchor_span.attrs['class'] = 'link-anchor'
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
new_anchor_span.string = "\xa0"
return new_anchor_span
@@ -353,18 +357,18 @@ class EpubConverter:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(
dir_name, href_in_link)).replace('\\', '/')
dir_name, href_in_link)).replace("\\", "/")
full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.')
internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
f"While processing href in {internal_link_tag}.")
internal_link_tag.attrs["converter-mark"] = "bad-link"
return None
if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.')
self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}"
f" while {internal_link_tag} processing. The first one will be chosen.")
return full_path[0]
@@ -387,30 +391,30 @@ class EpubConverter:
"""
# 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
if tag.attrs.get('class') == 'converter-chapter-mark':
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get("class") == "converter-chapter-mark":
continue
if tag.attrs.get('class') == 'footnote-element':
if tag.attrs.get("class") == "footnote-element":
continue
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id
new_id = self.create_unique_id(toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
# 2a. process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
r"(^(?!https?://).+\.(htm|html|xhtml)$)")
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href']
for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}):
a_tag_href = internal_link_tag.attrs["href"]
# find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
@@ -418,22 +422,22 @@ class EpubConverter:
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href']
del internal_link_tag.attrs["href"]
# 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
'#')
for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
"#")
# find full path
if a_tag_href:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag)
else:
a_tag_href_matched_to_toc = os.path.normpath(
toc_href).replace('\\', '/')
toc_href).replace("\\", "/")
if not a_tag_href_matched_to_toc:
continue
@@ -442,45 +446,45 @@ class EpubConverter:
a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
anchor_tags = anchor_soup.find_all(attrs={"id": new_id, })
anchor_tags = anchor_tags or anchor_soup.find_all(
attrs={'id': a_tag_id}) # if link is a footnote
attrs={"id": a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
f'{anchor_tags}\n'
f' While processing {internal_link_tag}')
self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n"
f"{anchor_tags}\n"
f" While processing {internal_link_tag}")
anchor_tag = anchor_tags[0]
assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
# if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors:
if anchor_tag.attrs["id"] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs['id']
del internal_link_tag.attrs['href']
del anchor_tag.attrs["id"]
del internal_link_tag.attrs["href"]
else:
internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}')
internal_link_tag.attrs["converter-mark"] = "bad-link"
self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found."
f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
def build_one_chapter(self, nav_point: NavPoint):
def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases:
id wraps all chapter content,
id wraps chapter's content + subchapters' content
id wraps chapter"s content + subchapters" content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id
and id of the next chapter/subchapter
Parameters
----------
@@ -496,7 +500,7 @@ class EpubConverter:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser')
new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags:
new_tree.append(tag)
self.href_chapter_id2soup_html[(
@@ -504,16 +508,30 @@ class EpubConverter:
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
self.build_one_chapter(sub_node)
self.detect_one_chapter(sub_node)
def define_chapters_content(self):
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
self.build_one_chapter(point)
self.detect_one_chapter(point)
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
Function prepare style, tags to json structure
Parameters
----------
nav_point: NavPoint
lvl: int
level of chapter
Returns
-------
ChapterItem
built chapter
"""
title = nav_point.title
if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(
@@ -526,7 +544,7 @@ class EpubConverter:
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file_path.stem
if hasattr(self.file_path, 'stem') else 'book_id')
if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
@@ -534,15 +552,16 @@ class EpubConverter:
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node_to_livecarta_chapter_item(
sub_chapter_item = self.html_node_to_livecarta_chapter_item(
sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item)
if self.logger:
indent = ' ' * lvl
self.logger.log(f'{indent}Chapter: {title} is prepared.')
indent = " " * lvl
self.logger.log(f"{indent}Chapter: {title} is prepared.")
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self) -> dict:
@@ -550,12 +569,13 @@ class EpubConverter:
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
for nav_point in top_level_nav_points:
chapter = self.node_to_livecarta_chapter_item(nav_point)
# loop through to level chapters
for tl_nav_point in top_level_nav_points:
chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point)
top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
self.logger.log('End conversion.')
self.logger.log(f"Anchors found: {len(self.internal_anchors)}.")
self.logger.log("End conversion.")
return {
"content": top_level_dict_chapters,
@@ -564,12 +584,12 @@ class EpubConverter:
if __name__ == "__main__":
epub_file_path = '../../epub/9781614382264.epub'
epub_file_path = "../../epub/9781614382264.epub"
logger_object = BookLogger(
name='epub', book_id=epub_file_path.split('/')[-1])
name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object)
content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json:
with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
json.dump(content_dict, f_json, ensure_ascii=False)

View File

@@ -7,7 +7,7 @@ class EpubBook(BookSolver):
def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger)
self.book_type = 'epub'
self.book_type = "epub"
def get_converted_book(self):
"""

View File

@@ -0,0 +1,87 @@
from typing import Tuple
from bs4 import BeautifulSoup, Tag
def _replace_with_livecarta_anchor_tag(anchor, i):
"""Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
new_tag["class"] = "footnote-element"
new_tag["data-id"] = i + 1
new_tag["id"] = f"footnote-{i + 1}"
new_tag.string = "*"
if anchor.parent.name == "sup":
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \
        -> Tuple[list, list, list]:
    """
    This function preprocesses footnotes.
    This function should run earlier than adding fonts in the pipeline.

    Example of the markup it handles:
    <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
    <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>

    Parameters
    ----------
    source_html_tag: Tag
        Tag that is searched for noteref anchors.
    href2soup_html: dict
        file name -> parsed soup; consulted when a noteref's href points
        into another file. NOTE(review): assumed non-None whenever a href
        carries a file part — confirm with callers.
    noteref_attr_name: str
        Attribute marking an anchor as a noteref (default "epub:type").

    Returns
    -------
    Tuple[list, list, list]
        (footnote text contents, new noteref marker tags, footnote tags)
    """
    footnotes = []
    noterefs_tags = source_html_tag.find_all(
        attrs={noteref_attr_name: "noteref"})
    # A noteref without an href cannot be resolved — such tags are removed.
    bad_noterefs_tags = set(
        [tag for tag in noterefs_tags if not tag.attrs.get("href")])
    noterefs_tags = [
        tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
    new_noterefs_tags = []
    new_footnotes_tags = []
    [tag.decompose() for tag in bad_noterefs_tags]

    def parse_a_tag_href(s: str) -> Tuple[str, str]:
        """Returns name of file & id of an anchor"""
        assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id."
        f, id_ = s.split("#")
        return f, id_

    def verify_footnote_tag(tags: list):
        """Function verifies is tag - footnote"""
        # NOTE: closure — reads href / element_id / target_html_tag /
        # noteref_tag from the current iteration of the loop below.
        assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}"
        if len(tags) == 0:
            # Fallback: accept any element with this id even without a footnote mark.
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if len(anchored_tags):
                print(
                    f"Warning. Href for tag is detected as footnote:\n{noteref_tag}")
                return anchored_tags
            else:
                assert 0, f"Error, No element with id: {href} found."
        return tags

    for i, noteref_tag in enumerate(noterefs_tags):
        href = noteref_tag.attrs["href"]
        file, element_id = parse_a_tag_href(href)
        if not file:
            # Same-file reference ("#id"): search this soup.
            target_html_tag = source_html_tag
        else:
            target_html_tag = href2soup_html.get(file)
            if not target_html_tag:
                print(
                    f"Error while footnotes processing. For {noteref_tag} invalid path: {file}.")
                continue
        possible_footnote = "note|footnote|endnote|rearenote"
        expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
                                                               attrs={"epub:type": re.compile(possible_footnote)}))
        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
        footnote_tag = expected_footnote_tags[0]
        # Presumably an EPUB endnote wrapper (role="doc-endnote") holds the
        # real footnote body — promote to the parent in that case.
        if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
            footnote_tag = footnote_tag.parent
        new_noterefs_tags.append(
            _replace_with_livecarta_anchor_tag(noteref_tag, i))
        content = footnote_tag.text
        # footnote_tag.decompose()
        footnotes.append(content)
        footnote_tag = footnote_tag.find(
            attrs={"role": "doc-backlink"}) or footnote_tag
        new_footnotes_tags.append(footnote_tag)
    return footnotes, new_noterefs_tags, new_footnotes_tags

View File

@@ -1,305 +1,107 @@
import os
import re
import pathlib
from typing import Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.access import Access
from src.livecarta_config import LiveCartaConfig
def _replace_with_livecarta_anchor_tag(anchor, i):
"""Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
new_tag['id'] = f'footnote-{i + 1}'
new_tag.string = '*'
if anchor.parent.name == 'sup':
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]:
def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
"""
This function preprocessing footnotes
This function should be earlier that adding fonts in pipeline.
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
chapter_tag: BeautifulSoup
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
Returns
-------
None
updated body tag
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(
attrs={noteref_attr_name: 'noteref'})
bad_noterefs_tags = set(
[tag for tag in noterefs_tags if not tag.attrs.get('href')])
noterefs_tags = [
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
new_noterefs_tags = []
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
"""
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
    """Insert a placeholder <span> (carrying id/class) in front of a tag
    that livecarta does not support, so internal links keep working."""
    placeholder = chapter_tag.new_tag("span")
    placeholder.attrs["id"] = id_ if id_ else ""
    placeholder.attrs["class"] = class_ if class_ else ""
    # Non-breaking space keeps the otherwise-empty span from being collapsed.
    placeholder.string = "\xa0"
    tag_to_be_removed.insert_before(placeholder)
def parse_a_tag_href(s: str) -> Tuple[str, str]:
"""Returns name of file & id of an anchor"""
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
f, id_ = s.split('#')
return f, id_
def verify_footnote_tag(tags: list):
"""Function verifies is tag - footnote"""
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(
f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
return anchored_tags
else:
assert 0, f'Error, No element with id: {href} found.'
return tags
for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs['href']
file, element_id = parse_a_tag_href(href)
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(
f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
continue
possible_footnote = 'note|footnote|endnote|rearenote'
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
attrs={'epub:type': re.compile(possible_footnote)}))
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(
_replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(
attrs={'role': 'doc-backlink'}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
return footnotes, new_noterefs_tags, new_footnotes_tags
if tag_to_be_removed.attrs.get("id"):
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
id_=tag_to_be_removed.attrs["id"],
class_=tag_to_be_removed.attrs.get("class"))
def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup:
"""
Main function that works with structure of html. Make changes inplace.
Parameters
----------
body_tag: Tag, soup object
chapter_tag: Tag, soup object
Steps
----------
1. Extracts tags that are not needed
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
This tag must have a body_tag as a parent.
Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
This tag must have a chapter_tag as a parent.
Otherwise, it is wrapped with some tags. Like:
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
<p> <span id="123", class="converter-chapter-mark"> </span> </p>
3. Headings that are not supported by livecarta converts to <p>
4. Wrapping NavigableString
Returns
-------
body_tag: Tag, BeautifulSoup
adjusted body_tag
chapter_tag: Tag, BeautifulSoup
adjusted chapter_tag
"""
def _preserve_class_in_aside_tag(tag_):
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0]
if tag_.parent.name == 'aside':
if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class
def _tags_to_correspond_livecarta_tag(chapter_tag):
"""Function to replace all tags to correspond livecarta tags"""
for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items():
for key in reg_key:
# text = tag if isinstance(tag, NavigableString) else tag.text
tags = chapter_tag.find_all(re.compile(key))
for tag in tags:
tag.name = to_replace_value
def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
"""
Function saves css style inherited from class, copies class to child <p>
returns True, if <section> could be unwrapped
Parameters
----------
tag_: Tag, soup object
def _unwrap_tags(chapter_tag):
"""Function unwrap tags and move id to span"""
for tag in LiveCartaConfig. TAGS_TO_UNWRAP:
for s in chapter_tag.find_all(tag):
_add_span_to_save_ids_for_links(s, chapter_tag)
s.unwrap()
Returns
-------
bool
def _mark_parent_is_body(chapter_tag):
# check marks for chapter starting are on the same level - 1st
marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})
"""
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0]
if 'feature' not in tag_class:
return True
child_p_tags = tag_.find_all("p")
if len(child_p_tags) == 1:
child_p_tag = child_p_tags[0]
if not child_p_tag.attrs.get('class'):
child_p_tag.attrs['class'] = tag_class
return True
# fix marks to be on 1 level
for mark in marks:
while mark.parent != chapter_tag:
mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
elif len(child_p_tags) > 1:
tag_.name = 'p'
return False
else:
return True
_tags_to_correspond_livecarta_tag(chapter_tag)
def _add_span_to_save_ids_for_links(tag_to_be_removed):
if tag_to_be_removed.attrs.get('id'):
_insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
id_=tag_to_be_removed.attrs['id'],
class_=tag_to_be_removed.attrs.get('class'))
_unwrap_tags(chapter_tag)
def _replace_div_tag_with_table():
"""
Function replace <div> with <table>:
1. Convert div with certain classes to tables
2. Add background color to div with background-color
_mark_parent_is_body(chapter_tag)
"""
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(
div.attrs['class'], list) else div.attrs['class'][0]
if div_class in ['C409', 'C409a']:
_wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
elif div_class in ['C441', 'C816']:
_wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
if div.attrs.get('style'):
if 'background-color' in div.attrs['style']:
end_index = div.attrs['style'].find(
'background-color') + len('background-color')
start_index_of_color = end_index + 2
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
_wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
elif div.attrs.get('style') == '':
del div.attrs['style']
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p'
]
if div.contents:
is_not_struct_tag = [
child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
_add_span_to_save_ids_for_links(div)
div.unwrap()
def _heading_tag_to_p_tag(body_tag):
"""Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'
# comments removal
for tag in body_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
_replace_div_tag_with_table()
for s in body_tag.find_all("section"):
could_be_unwrapped = True
if s.attrs.get('class'):
could_be_unwrapped = _preserve_class_in_section_tag(s)
_add_span_to_save_ids_for_links(s)
if could_be_unwrapped:
s.unwrap()
for s in body_tag.find_all("article"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("figure"):
s.name = 'p'
# to center image inside this tag
s.attrs['style'] = "text-align: center;"
for s in body_tag.find_all("figcaption"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("body"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("html"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# check marks for chapter starting are on the same 1 level
marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
parents_marks_are_body = [x.parent == body_tag for x in marks]
# fix marks to be on 1 level
if not all(parents_marks_are_body):
for x in marks:
while x.parent != body_tag:
x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(
parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
_heading_tag_to_p_tag(body_tag)
# wrap NavigableString with <p>
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r'([\n\t\xa0])', ' ', content)
content = content.strip()
if content:
tag = body_tag.new_tag('p')
tag.append(str(node))
node.replace_with(tag)
return body_tag
return chapter_tag
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""After processing on a first_id that corresponds to current chapter,
"""
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
Parameters
----------
first_id:
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
href:
Name of current chapter's file
first_id: str
Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
href: str
Name of current chapters file
html_soup: Tag
Soup object of current file
@@ -310,13 +112,13 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
"""
marked_tags = html_soup.find(
attrs={'id': first_id, 'class': 'converter-chapter-mark'})
attrs={"id": first_id, "class": "converter-chapter-mark"})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
if not isinstance(next_tag, NavigableString) and\
(next_tag.attrs.get('class') == 'converter-chapter-mark'):
if not isinstance(next_tag, NavigableString) and \
(next_tag.attrs.get("class") == "converter-chapter-mark"):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
@@ -327,182 +129,119 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
html_soup.smooth()
else:
assert 0, f'Warning: no match for {first_id, href}'
assert 0, f"Warning: no match for {first_id, href}"
return tags
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images to Amazon web service"""
link_path = access.send_image(
img_file_path, doc_id=book_id, img_content=img_content)
return link_path
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images locally"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'../json/img_{book_id}/'))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / os.path.basename(img_file_path)
f = open(new_img_path, 'wb+')
f.write(img_content)
f.close()
return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
href2img_content: dict,
path_to_html: str,
access=None,
path2aws_path: dict = None,
book_id: str = None) -> dict:
"""Function makes dictionary image_src_path -> Amazon web service_path"""
img_tags = body_tag.find_all('img')
for img in img_tags:
path_to_img_from_html = img.attrs.get('src')
html_folder = os.path.dirname(path_to_html)
path_to_img_from_root = os.path.normpath(os.path.join(
html_folder, path_to_img_from_html)).replace('\\', '/')
assert path_to_img_from_root in href2img_content, \
f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
img_content = href2img_content[path_to_img_from_root]
if access is not None:
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]
else:
new_folder = save_image_to_aws(
access, path_to_img_from_root, img_content, book_id)
path2aws_path[path_to_img_from_root] = new_folder
else:
new_folder = save_image_locally(
path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder)
if img.attrs.get('width'):
del img.attrs['width']
if img.attrs.get('height'):
del img.attrs['height']
if img.attrs.get('style'):
del img.attrs['style']
return path2aws_path
def _clean_title_from_numbering(title: str):
"""Function removes numbering from titles"""
title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
def prepare_title(title_of_chapter: str) -> str:
    """Finalise processing/cleaning of a chapter title.

    Parses the raw title markup, collapses \\n, \\t and \\xa0 into spaces,
    squeezes repeated spaces and trims surrounding whitespace.

    Parameters
    ----------
    title_of_chapter: str
        Raw (possibly HTML) chapter title.

    Returns
    -------
    str
        Cleaned title string.
    """
    # NOTE(review): the pre-merge variant first ran a numbering-cleanup pass,
    # but its result was immediately overwritten by the re-parse below, so
    # those dead statements are removed here (behavior unchanged).
    title_str = BeautifulSoup(title_of_chapter, features="lxml").string
    title_str = re.sub(r"([\n\t\xa0])", " ", title_str)
    title_str = re.sub(r" +", " ", title_str).rstrip()
    # clean whitespace characters ([\r\n\t\f\v ])
    title_str = re.sub(r"(^\s+)|(\s+$)", "", title_str)
    return title_str
def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Insert a placeholder <span> (with the given id/class) before a tag
    that is not supported by livecarta, preserving link anchors."""
    new_tag = main_tag.new_tag("span")
    new_tag.attrs['id'] = id_ or ''
    new_tag.attrs['class'] = class_ or ''
    # Non-breaking space keeps the otherwise-empty span from being collapsed.
    new_tag.string = "\xa0"
    tag.insert_before(new_tag)
def _remove_comments(chapter_tag):
    """Strip every HTML comment node from the chapter tree, in place."""
    is_comment = lambda text: isinstance(text, Comment)
    for sub_tag in chapter_tag.find_all():
        for comment_node in sub_tag(text=is_comment):
            comment_node.extract()
def _clean_headings_content(content: BeautifulSoup, title: str):
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
if tag_to_be_removed.attrs.get('id'):
_insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
id_=tag_to_be_removed.attrs.get(
'id'),
class_=tag_to_be_removed.attrs.get('class'))
def _wrap_strings_with_p(chapter_tag):
    """Wrap each bare top-level NavigableString of the chapter in a <p>."""
    for node in chapter_tag:
        if not isinstance(node, NavigableString):
            continue
        # Skip nodes that are pure whitespace once \n, \t and \xa0 are normalised.
        normalized = re.sub(r"([\n\t\xa0])", " ", str(node)).strip()
        if not normalized:
            continue
        paragraph = chapter_tag.new_tag("p")
        paragraph.append(str(node))
        node.replace_with(paragraph)
for sub_tag in tag_to_be_removed.find_all():
if sub_tag.attrs.get('id'):
_insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
id_=sub_tag.attrs['id'],
class_=sub_tag.attrs.get('class'))
title = title.lower()
for child in content.contents:
if isinstance(child, NavigableString):
text = child
else:
text = child.text
if text and re.sub(r'([\n\t\xa0])', '', text):
text = re.sub(r'([\n\t\xa0])', ' ', text)
text = re.sub(r' +', ' ', text).strip()
text = text.lower()
if title == text:
add_span_to_save_ids_for_links(child, content)
child.extract()
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
add_span_to_save_ids_for_links(child, content)
child.extract()
def _remove_headings_content(content_tag, title_of_chapter: str):
    """
    Remove the first child of the chapter content whose text repeats the
    chapter title (exact match, or an h1-h3 heading containing it), so the
    title is not shown twice; a <span> keeping the removed tag's id is
    inserted so internal links still resolve.

    Parameters
    ----------
    content_tag: soup object
        Tag of the page.
    title_of_chapter: str
        Chapter title.
    """
    wanted = title_of_chapter.lower()
    for child in content_tag.contents:
        raw = child if isinstance(child, NavigableString) else child.text
        if not raw:
            continue
        cleaned = re.sub(r"^[\s\xa0]+|[\s\xa0]+$", " ", raw).lower()
        if wanted == cleaned or \
                (wanted in cleaned and re.findall(r"^h[1-3]$", child.name)):
            _add_span_to_save_ids_for_links(child, content_tag)
            child.extract()
            break
def _process_lists(body_tag: BeautifulSoup):
# todo remove
def _process_lists(chapter_tag: BeautifulSoup):
    """
    Process <li> tags: when a list item wraps its content in a <p>, move the
    <p>'s attributes onto the <li> and unwrap the <p>.

    Parameters
    ----------
    chapter_tag: Tag, soup object

    Returns
    -------
    None
    """
    # NOTE: fixes a merge artifact — a stray duplicate line referenced the
    # undefined name `body_tag` and would have raised NameError at runtime.
    for li_tag in chapter_tag.find_all("li"):
        # Only the first <p> child is considered (li_tag.p).
        if li_tag.p:
            li_tag.attrs.update(li_tag.p.attrs)
            li_tag.p.unwrap()
def _preprocess_table(chapter_tag: BeautifulSoup):
    """Preprocess tables: normalise td/th/tr widths and borders.

    For each cell/row: derive a pixel width from an inline `width:` style
    when no explicit width attribute exists, strip `border:0;` from inline
    styles, drop empty style attributes, and force table border="1" when
    the table has no (or a zero) border.

    NOTE: fixes a merge artifact — the block contained two interleaved
    definitions mixing `body_tag` (undefined) and `chapter_tag`.
    """
    for table in chapter_tag.find_all("table"):
        for t_tag in table.find_all(re.compile("td|th|tr")):
            style = t_tag.get("style")
            width = ""
            if style:
                # [^-] avoids matching e.g. "max-width"/"min-width".
                width_match = re.search(
                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    size = width_match.group(1)
                    width = size + "px"
            t_tag.attrs["width"] = t_tag.get("width") or width
            if t_tag.attrs.get("style"):
                t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                    "border:0;", "")
            elif t_tag.attrs.get("style") == "":
                del t_tag.attrs["style"]
        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
def _preprocess_code_tags(chapter_tag: BeautifulSoup):
@@ -523,25 +262,15 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup):
if not code.parent.name == "pre":
code.name = "span"
continue
# if tag isn't in pre and doesn't have style
if not code.attrs.get('style'):
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
def _prepare_formatted(text: str) -> str:
"""Function replaces special symbols with their Unicode representation"""
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
text = text.replace(' ', "\xa0")
text = text.replace('𝑓', "\xf0\x9d\x91\x93")
return text
# if tag isn"t in pre and doesn"t have style
if not code.attrs.get("style"):
code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;"
def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
"""
Function preprocessing <pre> tags
Wrap string of the tag with <code> if it's necessary
Wrap string of the tag with <code> if its necessary
Parameters
----------
chapter_tag: Tag, soup object
@@ -564,6 +293,42 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
pre.append(code)
# todo replace
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
    """Wrap a tag in a centered <table><tbody><tr><td> scaffold.

    Parameters
    ----------
    chapter_tag: soup object used as the tag factory
    tag_to_be_wrapped: Tag to wrap
    width: table width in percent (string)
    border: value for the table's border attribute
    bg_color: optional background color for the <td>

    Returns
    -------
    Tag
        The new <table>; a <br> is inserted immediately after it.
    """
    table = chapter_tag.new_tag("table")
    table.attrs["border"] = border
    table.attrs["align"] = "center"
    table.attrs["style"] = f"width:{width}%;"
    tbody = chapter_tag.new_tag("tbody")
    tr = chapter_tag.new_tag("tr")
    td = chapter_tag.new_tag("td")
    # Fix: only set bgcolor when a color is supplied — the old code wrote
    # bgcolor=None unconditionally, which bs4 serialises as a bare attribute.
    # (The sibling _wrap_block_tag_with_table already guards this way.)
    if bg_color:
        td.attrs["bgcolor"] = bg_color
    # Build the scaffold innermost-out: td -> tr -> tbody -> table.
    tag_to_be_wrapped.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
    return table
def _preprocess_div_tags(chapter_tag):
    """
    Function replace <div> with <table>:
    Divs with an inline style are wrapped in a table (keeping width/border/
    bgcolor attributes) and then unwrapped, leaving their contents inside
    the table cell; styleless divs simply become <p> tags.
    """
    for div in chapter_tag.find_all("div"):
        if div.attrs.get('style'):
            _wrap_tag_with_table(
                chapter_tag,
                tag_to_be_wrapped=div,
                width=div.attrs['width'] if div.attrs.get('width') else '100',
                border=div.attrs['border'] if div.attrs.get('border') else None,
                bg_color=div.attrs['bgcolor'] if div.attrs.get('bgcolor') else None)
        else:
            # No inline style: demote to a paragraph and keep the tag.
            div.name = "p"
            continue
        # Styled divs only: preserve the id via a span, then dissolve the div
        # so its children sit directly inside the new table cell.
        _add_span_to_save_ids_for_links(div, chapter_tag)
        div.unwrap()
def _clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs:
@@ -571,48 +336,30 @@ def _clean_wiley_block(block):
h = block.find(re.compile("h[1-9]"))
if h:
h.name = "p"
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
h.insert_before(BeautifulSoup(features="lxml").new_tag("br"))
def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """Function wraps <block> with <table>

    Builds a <table align="center"><tbody><tr><td> scaffold around old_tag
    and inserts a <br> right after the table; returns the new table tag.
    """
    table = main_tag.new_tag("table")
    table.attrs['border'] = border
    table.attrs['align'] = 'center'
    table.attrs['style'] = f'width:{width}%;'
    tbody = main_tag.new_tag("tbody")
    tr = main_tag.new_tag("tr")
    td = main_tag.new_tag("td")
    # td.attrs['border-radius'] = '8px'
    # bgcolor is optional — only written when a color was actually supplied.
    if bg_color:
        td.attrs['bgcolor'] = bg_color
    # Build the scaffold innermost-out: td -> tr -> tbody -> table.
    old_tag.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
    return table
def _preprocess_block_tags(chapter_tag: Tag):
    """Preprocess Wiley "feature" blocks.

    <blockquote class="feature1..4"> and <p class="feature1..4"> blocks are
    cleaned and wrapped into a table; feature1/feature2 get a grey
    background. Blockquotes are additionally unwrapped afterwards.

    NOTE: fixes a merge artifact — the block interleaved duplicated old/new
    statement pairs (calls to both the removed and the current wrap helper).
    """
    for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
        _clean_wiley_block(block)
        # NOTE(review): attrs.get("class") is usually a list in bs4, so these
        # string comparisons may never match — confirm against real input.
        color = "#DDDDDD" if block.attrs.get(
            "class") == "feature1" else None
        color = "#EEEEEE" if block.attrs.get(
            "class") == "feature2" else color
        _wrap_tag_with_table(chapter_tag, block, bg_color=color)
        block.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        block.unwrap()
    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
        _clean_wiley_block(future_block)
        color = "#DDDDDD" if future_block.attrs.get(
            "class") == "feature1" else None
        color = "#EEEEEE" if future_block.attrs.get(
            "class") == "feature2" else color
        _wrap_tag_with_table(chapter_tag, future_block, bg_color=color)
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
@@ -628,10 +375,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
Steps
----------
1. find \n
2. heading removal
3. processing tags
4. class removal
1. heading removal
2. processing tags
3. class removal
Returns
-------
@@ -639,28 +385,27 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
prepared content
"""
# 1. find \n
to_remove = []
for child in content_tag.contents:
if isinstance(child, NavigableString):
s = re.sub(r'([\n\t])', '', child.string)
if s == '':
to_remove.append(child)
# 1. remove comments
_remove_comments(content_tag)
# 2. heading removal
# 2. wrap NavigableString with tag <p>
_wrap_strings_with_p(content_tag)
# 3. heading removal
if remove_title_from_chapter:
_clean_headings_content(content_tag, title_str)
_remove_headings_content(content_tag, title_str)
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
# 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
_process_lists(content_tag)
_preprocess_table(content_tag)
_preprocess_code_tags(content_tag)
_preprocess_pre_tags(content_tag)
_preprocess_div_tags(content_tag)
_preprocess_block_tags(content_tag)
# 4. class removal
# 5. remove classes that were created by converter
for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']):
del tag.attrs['class']
if hasattr(tag, "attrs") and tag.attrs.get("class") \
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
return str(content_tag)

View File

@@ -0,0 +1,67 @@
import os
import pathlib
from bs4 import BeautifulSoup
from src.access import Access
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """Upload one image through the Access client and return its AWS link."""
    return access.send_image(
        img_file_path, doc_id=book_id, img_content=img_content)
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """Save one image into the local json/img_<book_id>/ folder.

    The folder is resolved relative to this source file's grandparent
    directory. NOTE(review): assumes ../json already exists — mkdir is not
    called with parents=True, matching the original behavior.

    Parameters
    ----------
    img_file_path: str
        Source path; only its basename is used for the saved file.
    img_content: bytes
        Raw image bytes.
    book_id: str
        Used to name the per-book image folder.

    Returns
    -------
    pathlib.Path
        Path of the written image file.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f"../json/img_{book_id}/"))
    new_path.mkdir(exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # Fix: write via Path.write_bytes so the handle is closed even if the
    # write fails (the original used open()/close() without a context manager).
    new_img_path.write_bytes(img_content)
    return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
                            href2img_content: dict,
                            path_to_html: str,
                            access=None,
                            path2aws_path: dict = None,
                            book_id: str = None) -> dict:
    """Function makes dictionary image_src_path -> Amazon web service_path

    Rewrites every <img src> in body_tag: the image bytes (looked up in
    href2img_content by the path resolved relative to path_to_html) are
    uploaded to AWS when an access client is given, otherwise saved locally,
    and the tag's src is replaced with the new location.

    Parameters
    ----------
    body_tag: BeautifulSoup
        Tag whose <img> elements are rewritten in place.
    href2img_content: dict
        root-relative image path -> image bytes (from the manifest).
    path_to_html: str
        Path of the html file the img srcs are relative to.
    access:
        Optional AWS client; local saving is used when None.
    path2aws_path: dict
        Cache of already-uploaded images (mutated in place).
        NOTE(review): assumed non-None when access is given — confirm callers.
    book_id: str
        Passed through to the upload.

    Returns
    -------
    dict
        The (updated) path2aws_path cache.
    """
    img_tags = body_tag.find_all("img")
    for img in img_tags:
        path_to_img_from_html = img.attrs.get("src")
        html_folder = os.path.dirname(path_to_html)
        # Normalise to a root-relative, forward-slash path to match the manifest keys.
        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace("\\", "/")
        assert path_to_img_from_root in href2img_content, \
            f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            # Upload each distinct image once; reuse the cached AWS location after that.
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                new_folder = save_image_to_aws(
                    access, path_to_img_from_root, img_content, book_id)
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            # Local fallback uses the literal folder name "book_id".
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, "book_id")
        img.attrs["src"] = str(new_folder)
        # Drop explicit sizing/styling attributes from the tag.
        if img.attrs.get("width"):
            del img.attrs["width"]
        if img.attrs.get("height"):
            del img.attrs["height"]
        if img.attrs.get("style"):
            del img.attrs["style"]
    return path2aws_path

View File

@@ -21,33 +21,33 @@ class TagStyleConverter:
@staticmethod
def remove_white_if_no_bgcolor(style_, tag):
"""Function remove text white color if there is no bg color"""
if 'background' in style_:
if "background" in style_:
style_ = style_.replace(
'background:', 'background-color:')
"background:", "background-color:")
return style_
# if text color is white, check that we have bg-color
if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_):
if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_):
# if bg color is inherited, just return style as is
for parent_tag in tag.parents:
# white bg color not need to be checked as we do not write 'white bg color'
tag_with_bg = ['span', 'td', 'tr', 'p']
# white bg color not need to be checked as we do not write "white bg color"
tag_with_bg = ["span", "td", "tr", "p"]
tag_will_be_saved = parent_tag.name in tag_with_bg
has_bg = parent_tag.attrs.get('style') and (
'background' in parent_tag.attrs.get('style'))
has_bg = parent_tag.attrs.get("style") and (
"background" in parent_tag.attrs.get("style"))
if has_bg and tag_will_be_saved:
return style_
children = tag.find_all()
for child in children:
if child.attrs.get('style') and ('background' in child.attrs.get('style')):
tmp_style = child.attrs['style'] + '; color:#fff; '
child.attrs['style'] = tmp_style
if child.attrs.get("style") and ("background" in child.attrs.get("style")):
tmp_style = child.attrs["style"] + "; color:#fff; "
child.attrs["style"] = tmp_style
# for child with bg color we added white text color, so this tag don't need white color
style_ = style_.replace('color:#fff;', '')
style_ = style_.replace('color:#ffffff;', '')
style_ = style_.replace('color:white;', '')
# for child with bg color we added white text color, so this tag don"t need white color
style_ = style_.replace("color:#fff;", "")
style_ = style_.replace("color:#ffffff;", "")
style_ = style_.replace("color:white;", "")
return style_
@staticmethod
@@ -68,7 +68,7 @@ class TagStyleConverter:
Parameters
----------
split_style: list
list of styles split by ';'
list of styles split by ";"
Returns
----------
@@ -79,9 +79,9 @@ class TagStyleConverter:
processed_style = ";".join(split_style)
margin_left_regexp = re.compile(
r'((margin-left|margin): *(-*\w+);*)')
r"((margin-left|margin): *(-*\w+);*)")
text_indent_regexp = re.compile(
r'(text-indent: *(-*\w+);*)')
r"(text-indent: *(-*\w+);*)")
has_margin = re.search(margin_left_regexp, processed_style)
has_text_indent = re.search(text_indent_regexp, processed_style)
@@ -92,21 +92,21 @@ class TagStyleConverter:
if has_text_indent:
num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2))))))
processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_m - num_ti)) + 'px; ')
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(num_m - num_ti)) + "px; ")
processed_style = processed_style.replace(
has_margin.group(1), '')
has_margin.group(1), "")
return processed_style
processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' +
str(abs(num_m)) + 'px; ')
processed_style = processed_style.replace(has_margin.group(1), "text-indent: " +
str(abs(num_m)) + "px; ")
return processed_style
elif has_text_indent:
processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' +
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2)))))))
+ 'px; ')
+ "px; ")
return processed_style
return processed_style
@@ -126,18 +126,18 @@ class TagStyleConverter:
processed inline style
"""
inline_style = self.tag_inline_style.attrs.get('style') + ';'
# 1. Remove white color if tag doesn't have background color in style
inline_style = self.tag_inline_style.attrs.get("style") + ";"
# 1. Remove white color if tag doesn't have background color in style
inline_style = self.remove_white_if_no_bgcolor(
inline_style, self.tag_inline_style)
inline_style = inline_style.replace(
'list-style-image', 'list-style-type')
"list-style-image", "list-style-type")
# 2. Create list of styles from inline style
# replace all spaces between '; & letter' to ';'
# replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", inline_style)
# when we split style by ';', last element of the list is '' - None (remove it)
split_inline_style: list = list(filter(None, style.split(';')))
# when we split style by ";", last element of the list is "" - None (remove it)
split_inline_style: list = list(filter(None, style.split(";")))
# 3. Duplicate styles check - if the tag had duplicate styles
split_inline_style = self.duplicate_styles_check(split_inline_style)
@@ -164,7 +164,7 @@ class TagStyleConverter:
"""
styles_to_remove = []
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style:
if f"{k[0]}:{k[1]}" in style:
styles_to_remove.append(k)
return styles_to_remove
@@ -172,11 +172,11 @@ class TagStyleConverter:
# adds <strong>, <u>, <sup> instead of styles
styles_to_remove = self.check_style_to_be_tag(self.style)
for i, (attr, value) in enumerate(styles_to_remove):
self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\
.replace(f'{attr}:{value};', '').strip()
self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
.replace(f"{attr}:{value};", "").strip()
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)]
correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name)
correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
for content in reversed(self.tag_inline_style.contents):
correspond_tag.insert(0, content.extract())
self.tag_inline_style.append(correspond_tag)
@@ -184,34 +184,34 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag):
"""Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG))
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'):
dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG))
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
styles_can_be_in_tag = [style
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items()
if re.match(tag, initial_tag.name)
for style in styles]
styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag]
span_style = initial_tag.attrs['style']
span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same.
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
styles_to_be_saved_in_span = [((attr + ':') in span_style) & (
'-' + attr not in span_style) for attr in styles_cant_be_in_tag]
# Not "align" when we have "text-align", or "border" when we have "border-top"
styles_to_be_saved_in_span = [((attr + ":") in span_style) & (
"-" + attr not in span_style) for attr in styles_cant_be_in_tag]
if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in <tag.name> -> wrap them in span
tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}')
style = ''
possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag]
tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
style = ""
possible_attrs_regexp = [re.compile(fr"({style}: *(\w+);)") for style in styles_can_be_in_tag]
for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search(
possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1)
span_style = span_style.replace(
has_style_attrs.group(1), '')
tag.attrs['style'] = style
initial_tag.name = 'span'
initial_tag.attrs['style'] = span_style
has_style_attrs.group(1), "")
tag.attrs["style"] = style
initial_tag.name = "span"
initial_tag.attrs["style"] = span_style
initial_tag.wrap(tag)
def convert_initial_tag(self):
@@ -246,10 +246,10 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) ->
disable_validation=True,
)
# soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')})
attrs={"style": re.compile(".*")})
# go through the tags with inline style + style parsed from css file
for tag_inline_style in tags_with_inline_style:

View File

@@ -9,12 +9,12 @@ class LiveCartaConfig:
HEADERS_LEVELS = {"h1", "h2", "h3",
"h4", "h5", "h6", "h7", "h8", "h9"}
DEFAULT_ALIGN_STYLE = 'left'
DEFAULT_ALIGN_STYLE = "left"
ALIGN_STYLES = ['justify', 'right', 'center', 'left']
ALIGN_STYLES = ["justify", "right", "center", "left"]
# Main constant values
DEFAULT_FONT_NAME = 'Times New Roman'
DEFAULT_FONT_NAME = "Times New Roman"
WORD_DEFAULT_FONT_SIZE = 11
@@ -38,65 +38,65 @@ class LiveCartaConfig:
}
COLORS_MAP = {
'#ffff00': 'yellow',
'#00ff00': 'darkYellow',
'#00ffff': 'cyan',
'#ff00ff': 'magenta',
'#0000ff': 'blue',
'#ff0000': 'red',
'#000080': 'darkBlue',
'#008080': 'darkCyan',
'#008000': 'green',
'#800080': 'darkMagenta',
'#808000': 'darkGreen',
'#c0c0c0': 'lightGray',
'#ffffff': 'white',
'#800000': '#800000',
'#808080': '#808080'
"#ffff00": "yellow",
"#00ff00": "darkYellow",
"#00ffff": "cyan",
"#ff00ff": "magenta",
"#0000ff": "blue",
"#ff0000": "red",
"#000080": "darkBlue",
"#008080": "darkCyan",
"#008000": "green",
"#800080": "darkMagenta",
"#808000": "darkGreen",
"#c0c0c0": "lightGray",
"#ffffff": "white",
"#800000": "#800000",
"#808080": "#808080"
}
HTML42LIVECARTA_COLORS = {
'yellow': 'yellow',
'lime': 'green',
'aqua': 'cyan',
'fuchsia': 'magenta',
'blue': 'blue',
'red': 'red',
'navy': 'darkBlue',
'teal': 'darkCyan',
'green': 'darkGreen',
'purple': 'darkMagenta',
'olive': 'darkYellow',
'silver': 'lightGray',
'white': 'white',
'maroon': 'darkRed', # '#800000',
'gray': 'darkGray',
'grey': 'darkGray',
"yellow": "yellow",
"lime": "green",
"aqua": "cyan",
"fuchsia": "magenta",
"blue": "blue",
"red": "red",
"navy": "darkBlue",
"teal": "darkCyan",
"green": "darkGreen",
"purple": "darkMagenta",
"olive": "darkYellow",
"silver": "lightGray",
"white": "white",
"maroon": "darkRed", # "#800000",
"gray": "darkGray",
"grey": "darkGray",
}
INDENT = '30px'
INDENT = "30px"
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px',
'19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px',
'30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px',
'41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
sizes_px = ["0px", "10px", "10px", "11px", "12px", "13px", "14px", "15px", "16px", "17px", "18px",
"19px", "20px", "21px", "22px", "23px", "24px", "25px", "26px", "27px", "28px", "29px",
"30px", "31px", "32px", "33px", "34px", "35px", "36px", "37px", "38px", "39px", "40px",
"41px", "42px", "43px", "44px", "45px", "46px", "47px", "48px", "49px", "50px", "64px", "72px"]
list_types = ['circle', 'disc', 'armenian', 'decimal',
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
list_types = ["circle", "disc", "armenian", "decimal",
"decimal-leading-zero", "georgian", "lower-alpha", "lower-latin",
"lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside',
'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p'
"div", "section", "article", "main", "body", "html", "aside",
"canvas", "data", "figure", "footer", "iframe", "span", "p"
]
could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
"(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)")
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
@@ -104,23 +104,34 @@ class LiveCartaConfig:
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong',
('font-weight', '600'): 'strong',
('font-weight', '700'): 'strong',
('font-weight', '800'): 'strong',
('font-weight', '900'): 'strong',
('font-style', 'italic'): 'i',
('text-decoration', 'underline'): 'u',
('text-decoration', 'line-through'): 's',
('text-decoration-line', 'underline'): 'u',
('text-decoration-line', 'line-through'): 's',
('vertical-align', 'super'): 'sup'
("font-weight", "bold"): "strong",
("font-weight", "600"): "strong",
("font-weight", "700"): "strong",
("font-weight", "800"): "strong",
("font-weight", "900"): "strong",
("font-style", "italic"): "i",
("text-decoration", "underline"): "u",
("text-decoration", "line-through"): "s",
("text-decoration-line", "underline"): "u",
("text-decoration-line", "line-through"): "s",
("vertical-align", "super"): "sup"
}
LIVECARTA_STYLES_CANT_BE_IN_TAG = {
'p': ['text-align', 'text-indent', 'border-bottom', 'border-top'],
'li': ['text-align', 'list-style-type'],
'ul': ['list-style-type'],
'ol': ['list-style-type'],
'(^h[1-9]$)': ['list-style-type']
"p": ["text-align", "text-indent", "border-bottom", "border-top"],
"li": ["text-align", "list-style-type"],
"ul": ["list-style-type"],
"ol": ["list-style-type"],
r"(^h[1-9]$)": ["list-style-type"]
}
REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = {
(r"^h[6-9]$", "figure$", "section$"): "p",
("^aside$",): "blockquote",
("^header$",): "span",
("^b$",): "strong",
}
TAGS_TO_UNWRAP = [
"section", "article", "figcaption", "main", "body", "html",
]