Updates to presets

This commit is contained in:
Kiryl
2022-06-21 11:47:26 +03:00
parent 73513e63b5
commit c62192d028
9 changed files with 668 additions and 739 deletions

View File

@@ -222,7 +222,6 @@ class HTMLDocxPreprocessor:
def _process_tables(self): def _process_tables(self):
"""Function to process tables. Set "border" attribute.""" """Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table") tables = self.body_tag.find_all("table")
for table in tables: for table in tables:
tds = table.find_all("td") tds = table.find_all("td")

View File

@@ -11,13 +11,13 @@ from src.livecarta_config import LiveCartaConfig
def get_text_color(x): def get_text_color(x):
color = str2hex(x) color = str2hex(x)
color = color if color not in ['#000000', '#000', 'black'] else '' color = color if color not in ["#000000", "#000", "black"] else ""
return color return color
def get_bg_color(x): def get_bg_color(x):
color = str2hex(x) color = str2hex(x)
color = color if color not in ['#ffffff', '#fff', 'white'] else '' color = color if color not in ["#ffffff", "#fff", "white"] else ""
return color return color
@@ -43,25 +43,25 @@ def convert_tag_style_values(size_value: str) -> str:
return LiveCartaConfig.sizes_px[last_possible_size_index] return LiveCartaConfig.sizes_px[last_possible_size_index]
font_size_regexp = re.compile( font_size_regexp = re.compile(
r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)")
has_style_attrs = re.search(font_size_regexp, size_value) has_style_attrs = re.search(font_size_regexp, size_value)
if has_style_attrs: if has_style_attrs:
if has_style_attrs.group(1): if has_style_attrs.group(1):
size_value = float(size_value.replace('%', '')) / 100.0 size_value = float(size_value.replace("%", "")) / 100.0
return find_closest_size(size_value) return find_closest_size(size_value)
elif has_style_attrs.group(3): elif has_style_attrs.group(3):
size_value = float(size_value.replace('em', '')) size_value = float(size_value.replace("em", ""))
return find_closest_size(size_value) return find_closest_size(size_value)
elif has_style_attrs.group(5): elif has_style_attrs.group(5):
return size_value.replace('pt', 'px') return size_value.replace("pt", "px")
else: else:
return '' return ""
return size_value return size_value
def convert_indents_tag_values(size_value: str) -> str: def convert_indents_tag_values(size_value: str) -> str:
""" """
Function converts values of ['text-indent', 'margin-left', 'margin'] Function converts values of ["text-indent", "margin-left", "margin"]
Parameters Parameters
---------- ----------
size_value: str size_value: str
@@ -71,12 +71,12 @@ def convert_indents_tag_values(size_value: str) -> str:
size_value: str size_value: str
""" """
if len(size_value.split(' ')) == 3: if len(size_value.split(" ")) == 3:
size_value = convert_tag_style_values(size_value.split( size_value = convert_tag_style_values(size_value.split(
' ')[-2]) # returns middle value " ")[-2]) # returns middle value
else: else:
size_value = convert_tag_style_values(size_value.split( size_value = convert_tag_style_values(size_value.split(
' ')[-1]) # returns last value " ")[-1]) # returns last value
return size_value return size_value
@@ -87,35 +87,35 @@ If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed. If property has not empty list, it means that only certain property-value combinations can be transformed.
""" """
LIVECARTA_STYLE_ATTRS = { LIVECARTA_STYLE_ATTRS = {
'text-indent': [], "text-indent": [],
'font-variant': ['small-caps'], "font-variant": ["small-caps"],
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], "text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
'align': [], "align": [],
'font': [], "font": [],
'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() "font-family": [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys()
if x != LiveCartaConfig.DEFAULT_FONT_NAME], if x != LiveCartaConfig.DEFAULT_FONT_NAME],
'font-size': [], "font-size": [],
'font-weight': ['bold', '600', '700', '800', '900'], # <strong> "font-weight": ["bold", "600", "700", "800", "900"], # <strong>
'font-style': ['italic'], # <i> "font-style": ["italic"], # <i>
'text-decoration': ['underline', 'line-through'], # <u> , <s> "text-decoration": ["underline", "line-through"], # <u> , <s>
'text-decoration-line': ['underline', 'line-through'], # <u> , <s> "text-decoration-line": ["underline", "line-through"], # <u> , <s>
'vertical-align': ['super'], # <sup> "vertical-align": ["super"], # <sup>
'color': [], "color": [],
'background-color': [], "background-color": [],
'background': [], "background": [],
'width': [], "width": [],
'border': [], "border": [],
'border-top-width': [], "border-top-width": [],
'border-right-width': [], "border-right-width": [],
'border-left-width': [], "border-left-width": [],
'border-bottom-width': [], "border-bottom-width": [],
'border-top': [], "border-top": [],
'border-bottom': [], "border-bottom": [],
'list-style-type': [], "list-style-type": [],
'list-style-image': [], "list-style-image": [],
'margin-left': [], "margin-left": [],
'margin-top': [], "margin-top": [],
'margin': [], "margin": [],
} }
""" """
@@ -125,28 +125,28 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou
to suit livecarta style convention. to suit livecarta style convention.
""" """
LIVECARTA_STYLE_ATTRS_MAPPING = { LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents_tag_values, "text-indent": convert_indents_tag_values,
'font-variant': lambda x: x, "font-variant": lambda x: x,
'text-align': lambda x: x, "text-align": lambda x: x,
'font': lambda x: '', "font": lambda x: "",
'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) "font-family": lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title()))
or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)), or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)),
'font-size': convert_tag_style_values, "font-size": convert_tag_style_values,
'color': get_text_color, "color": get_text_color,
'background-color': get_bg_color, "background-color": get_bg_color,
'background': get_bg_color, "background": get_bg_color,
'border': lambda x: x if x != '0' else '', "border": lambda x: x if x != "0" else "",
'border-top-width': lambda x: x if x != '0' else '', "border-top-width": lambda x: x if x != "0" else "",
'border-right-width': lambda x: x if x != '0' else '', "border-right-width": lambda x: x if x != "0" else "",
'border-left-width': lambda x: x if x != '0' else '', "border-left-width": lambda x: x if x != "0" else "",
'border-bottom-width': lambda x: x if x != '0' else '', "border-bottom-width": lambda x: x if x != "0" else "",
'border-top': lambda x: x if x != '0' else '', "border-top": lambda x: x if x != "0" else "",
'border-bottom': lambda x: x if x != '0' else '', "border-bottom": lambda x: x if x != "0" else "",
'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc', "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
'list-style-image': lambda x: 'disc', "list-style-image": lambda x: "disc",
'margin-left': convert_indents_tag_values, "margin-left": convert_indents_tag_values,
'margin-top': convert_tag_style_values, "margin-top": convert_tag_style_values,
'margin': convert_indents_tag_values "margin": convert_indents_tag_values
} }
@@ -155,17 +155,17 @@ def update_inline_styles_to_livecarta_convention(split_style: list):
style_name, style_value = style.split(":") style_name, style_value = style.split(":")
if style_name not in LIVECARTA_STYLE_ATTRS: if style_name not in LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file # property not in LIVECARTA_STYLE_ATTRS, remove from css file
split_style[i] = '' split_style[i] = ""
return split_style return split_style
cleaned_value = style_value.replace('\"', '').split()[-1] cleaned_value = style_value.replace("\"", "").split()[-1]
constraints_on_value = LIVECARTA_STYLE_ATTRS.get( constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
style_name) style_name)
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
style_name] style_name]
if constraints_on_value and value_not_in_possible_values_list: if constraints_on_value and value_not_in_possible_values_list:
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
split_style[i] = '' split_style[i] = ""
else: else:
if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: if style_name in LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data # function that converts our data
@@ -177,14 +177,14 @@ def update_inline_styles_to_livecarta_convention(split_style: list):
def build_inline_style_content(style: str) -> str: def build_inline_style_content(style: str) -> str:
"""Build inline style with livecarta convention""" """Build inline style with livecarta convention"""
# replace all spaces between '; & letter' to ';' # replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", style) style = re.sub(r"; *", ";", style)
# when we split style by ';', last element of the list is '' - None # when we split style by ";", last element of the list is "" - None
# remove it # remove it
split_style: list = list(filter(None, style.split(';'))) split_style: list = list(filter(None, style.split(";")))
# replace all spaces between ': & letter' to ':' # replace all spaces between ": & letter" to ":"
split_style = [el.replace( split_style = [el.replace(
re.search(r'(:\s*)', el).group(1), ':') for el in split_style] re.search(r"(:\s*)", el).group(1), ":") for el in split_style]
split_style = update_inline_styles_to_livecarta_convention(split_style) split_style = update_inline_styles_to_livecarta_convention(split_style)
style = "; ".join(split_style) style = "; ".join(split_style)
@@ -195,17 +195,17 @@ def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRul
style_type: cssutils.css.property.Property): style_type: cssutils.css.property.Property):
if style_type.name not in LIVECARTA_STYLE_ATTRS: if style_type.name not in LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove from css file # property not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = '' css_rule.style[style_type.name] = ""
return return
cleaned_value = style_type.value.replace('\"', '') cleaned_value = style_type.value.replace("\"", "")
constraints_on_value = LIVECARTA_STYLE_ATTRS.get( constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
style_type.name) style_type.name)
value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
style_type.name] style_type.name]
if constraints_on_value and value_not_in_possible_values_list: if constraints_on_value and value_not_in_possible_values_list:
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
css_rule.style[style_type.name] = '' css_rule.style[style_type.name] = ""
else: else:
if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data # function that converts our data
@@ -227,12 +227,12 @@ def build_css_file_content(css_content: str) -> str:
return css_text return css_text
if __name__ == '__main__': if __name__ == "__main__":
file = '../../epub/9781627222174.epub' file = "../../epub/9781627222174.epub"
ebooklib_book = epub.read_epub(file) ebooklib_book = epub.read_epub(file)
css_ = ebooklib_book.get_item_with_href('css/epub.css') css_ = ebooklib_book.get_item_with_href("css/epub.css")
css_ = css_.get_content().decode() css_ = css_.get_content().decode()
css_cleaned = build_css_file_content(css_) css_cleaned = build_css_file_content(css_)
html_ = ebooklib_book.get_item_with_href( html_ = ebooklib_book.get_item_with_href(
'pr01s05.xhtml').get_body_content().decode() "pr01s05.xhtml").get_body_content().decode()
html_soup = BeautifulSoup(html_, features='lxml') html_soup = BeautifulSoup(html_, features="lxml")

View File

@@ -17,10 +17,12 @@ from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ from src.epub_converter.html_epub_preprocessor import process_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes prepare_title, prepare_content
class EpubConverter: class EpubConverter:
@@ -57,26 +59,27 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log('Image processing.') self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name file_name = x.file_name
content = x.content content = x.content
self.img_href2img_bytes[file_name] = content self.img_href2img_bytes[file_name] = content
self.logger.log('HTML files reading.') self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str, self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content() BeautifulSoup] = self.build_href2soup_content()
# TODO Presets
self.logger.log('Process CSS inline styles.') self.logger.log("Process CSS inline styles.")
self.process_inline_styles_in_html_soup() self.process_inline_styles_in_html_soup()
self.logger.log('CSS files processing.') self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.') self.logger.log("CSS styles adding.")
self.add_css_styles_to_html_soup() self.add_css_styles_to_html_soup()
self.logger.log('Footnotes processing.') # todo presets
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup: for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
self.html_href2html_body_soup) self.html_href2html_body_soup)
@@ -85,27 +88,28 @@ class EpubConverter:
self.footnotes.extend(footnotes_tags) self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
noteref.attrs['data-id'] = i + 1 noteref.attrs["data-id"] = i + 1
noteref.attrs['id'] = f'footnote-{i + 1}' noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs['href'] = f'#footnote-{i + 1}' footnote.attrs["href"] = f"#footnote-{i + 1}"
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.') self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.logger.log('TOC processing.') self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
if self.is_toc_empty(): if self.is_toc_empty():
self.build_adjacency_list_from_spine() self.build_adjacency_list_from_spine()
not_added = [ not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.') self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.add_not_added_files_to_adjacency_list(not_added) self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.') self.logger.log(f"Html internal links and structure processing.")
self.label_chapters_ids_with_tmp_id() self.label_chapters_ids_with_lc_id()
# used only after parsed toc, ids from toc needed # used only after parsed toc, ids from toc needed
self.process_html_soup_structure_to_line() self.process_html_soup_structure_to_line()
self.process_internal_links() self.process_internal_links()
self.logger.log(f'Building chapters content.') self.logger.log(f"Define chapters content.")
self.define_chapters_content() self.define_chapters_content()
self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements # using EpubElements
@@ -115,7 +119,7 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content() html_body_text = item.get_body_content()
# html.parser closes tags if needed # html.parser closes tags if needed
soup = BeautifulSoup(html_body_text, features='html.parser') soup = BeautifulSoup(html_body_text, features="html.parser")
nodes[item.file_name] = soup nodes[item.file_name] = soup
return nodes return nodes
@@ -123,15 +127,15 @@ class EpubConverter:
path_to_css_from_html = css_href path_to_css_from_html = css_href
html_folder = dirname(html_href) html_folder = dirname(html_href)
path_to_css_from_root = normpath( path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/') join(html_folder, path_to_css_from_html)).replace("\\", "/")
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
# if in css file we import another css # if in css file we import another css
if "@import" in str(css_obj.content): if "@import" in str(css_obj.content):
path_to_css_from_root = "css/" + \ path_to_css_from_root = "css/" + \
re.search('"(.*)"', str(css_obj.content)).group(1) re.search("'(.*)'", str(css_obj.content)).group(1)
css_obj = self.ebooklib_book.get_item_with_href( css_obj = self.ebooklib_book.get_item_with_href(
path_to_css_from_root) path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.' assert css_obj, f"Css style {css_href} was not in manifest."
css_content: str = css_obj.get_content().decode() css_content: str = css_obj.get_content().decode()
return css_content return css_content
@@ -140,11 +144,11 @@ class EpubConverter:
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')}) attrs={"style": re.compile(".*")})
for tag_initial_inline_style in tags_with_inline_style: for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs['style'] inline_style = tag_initial_inline_style.attrs["style"]
tag_initial_inline_style.attrs['style'] = \ tag_initial_inline_style.attrs["style"] = \
build_inline_style_content(inline_style) build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]: def build_html_and_css_relations(self) -> tuple[dict, dict]:
@@ -167,23 +171,23 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content html_content = item.content
html_href = item.file_name html_href = item.file_name
soup_html_content = BeautifulSoup(html_content, features='lxml') soup_html_content = BeautifulSoup(html_content, features="lxml")
# check if file links to css file # check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language) # alternate page of original page (e.g. another language)
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]):
continue continue
css_href = tag.attrs.get('href') css_href = tag.attrs.get("href")
html_href2css_href[html_href].append(css_href) html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content: if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict # css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content( css_href2css_content[css_href] = build_css_file_content(
self.get_css_content(css_href, html_href)) self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all('style')): for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string css_content = tag.string
html_href2css_href[html_href].append(f'href{i}') html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f'href{i}'] = build_css_file_content( css_href2css_content[f"href{i}"] = build_css_file_content(
css_content) css_content)
return html_href2css_href, css_href2css_content return html_href2css_href, css_href2css_content
@@ -195,7 +199,7 @@ class EpubConverter:
""" """
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href): if self.html_href2css_href.get(html_href):
css = '' css = ""
for css_href in self.html_href2css_href[html_href]: for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href] css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
@@ -243,7 +247,7 @@ class EpubConverter:
sub_nodes = [] sub_nodes = []
for elem in second: for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: if ("section" in first.title.lower() or "part" in first.title.lower()) and lvl == 1:
self.offset_sub_nodes.append( self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl)) self.build_adjacency_list_from_toc(elem, lvl))
else: else:
@@ -267,7 +271,7 @@ class EpubConverter:
self.adjacency_list[-1] = nodes self.adjacency_list[-1] = nodes
else: else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}"
def is_toc_empty(self) -> bool: def is_toc_empty(self) -> bool:
"""Function checks is toc empty""" """Function checks is toc empty"""
@@ -297,36 +301,36 @@ class EpubConverter:
"""Function add files that not added to adjacency list""" """Function add files that not added to adjacency list"""
for i, file in enumerate(not_added): for i, file in enumerate(not_added):
nav_point = NavPoint( nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file)) Section(f"To check #{i}, filename: {file}", file))
self.adjacency_list[-1].append(nav_point) self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(file) self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_tmp_id(self): def label_chapters_ids_with_lc_id(self):
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
ids = self.html_href2subchapter_ids[html_href] ids = self.html_href2subchapter_ids[html_href]
for i in ids: for i in ids:
soup = self.html_href2html_body_soup[html_href] soup = self.html_href2html_body_soup[html_href]
tag = soup.find(id=i) tag = soup.find(id=i)
new_h = soup.new_tag('tmp') new_h = soup.new_tag("tmp")
new_h.attrs['class'] = 'converter-chapter-mark' new_h.attrs["class"] = "converter-chapter-mark"
new_h.attrs['id'] = i new_h.attrs["id"] = i
tag.insert_before(new_h) tag.insert_before(new_h)
def process_html_soup_structure_to_line(self): def process_html_soup_structure_to_line(self):
# go to line structure # go to line structure
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href] soup = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) self.html_href2html_body_soup[html_href] = process_structural_tags(soup)
@staticmethod @staticmethod
def create_unique_id(href, id_): def create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
@staticmethod @staticmethod
def create_new_anchor_span(soup, id_): def create_new_anchor_span(soup, id_):
new_anchor_span = soup.new_tag("span") new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs['id'] = id_ new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs['class'] = 'link-anchor' new_anchor_span.attrs["class"] = "link-anchor"
new_anchor_span.string = "\xa0" new_anchor_span.string = "\xa0"
return new_anchor_span return new_anchor_span
@@ -353,18 +357,18 @@ class EpubConverter:
""" """
dir_name = os.path.dirname(cur_file_path) dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join( normed_path = os.path.normpath(os.path.join(
dir_name, href_in_link)).replace('\\', '/') dir_name, href_in_link)).replace("\\", "/")
full_path = [ full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path] path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path: if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
f'While processing href in {internal_link_tag}.') f"While processing href in {internal_link_tag}.")
internal_link_tag.attrs['converter-mark'] = 'bad-link' internal_link_tag.attrs["converter-mark"] = "bad-link"
return None return None
if len(full_path) > 1: if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}"
f' while {internal_link_tag} processing. The first one will be chosen.') f" while {internal_link_tag} processing. The first one will be chosen.")
return full_path[0] return full_path[0]
@@ -387,30 +391,30 @@ class EpubConverter:
""" """
# 1. rebuild ids to be unique in all documents # 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get('class') == 'converter-chapter-mark': if tag.attrs.get("class") == "converter-chapter-mark":
continue continue
if tag.attrs.get('class') == 'footnote-element': if tag.attrs.get("class") == "footnote-element":
continue continue
new_id = self.create_unique_id(toc_href, tag.attrs['id']) new_id = self.create_unique_id(toc_href, tag.attrs["id"])
tag.attrs['id'] = new_id tag.attrs["id"] = new_id
# 2a. process anchor which is a whole xhtml file # 2a. process anchor which is a whole xhtml file
internal_link_reg1 = re.compile( internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)') r"(^(?!https?://).+\.(htm|html|xhtml)$)")
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href] soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href'] a_tag_href = internal_link_tag.attrs["href"]
# find full path # find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag) toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc: if not a_tag_href_matched_to_toc:
continue continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '') new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors: if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id) new_anchor_span = self.create_new_anchor_span(soup, new_id)
@@ -418,22 +422,22 @@ class EpubConverter:
anchor_soup.insert(0, new_anchor_span) anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id) self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href'] del internal_link_tag.attrs["href"]
# 2b. process anchor which is an element in xhtml file # 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)') internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href] soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split( a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
'#') "#")
# find full path # find full path
if a_tag_href: if a_tag_href:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag) internal_link_tag)
else: else:
a_tag_href_matched_to_toc = os.path.normpath( a_tag_href_matched_to_toc = os.path.normpath(
toc_href).replace('\\', '/') toc_href).replace("\\", "/")
if not a_tag_href_matched_to_toc: if not a_tag_href_matched_to_toc:
continue continue
@@ -442,45 +446,45 @@ class EpubConverter:
a_tag_href_matched_to_toc, a_tag_id) a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id, }) anchor_tags = anchor_soup.find_all(attrs={"id": new_id, })
anchor_tags = anchor_tags or anchor_soup.find_all( anchor_tags = anchor_tags or anchor_soup.find_all(
attrs={'id': a_tag_id}) # if link is a footnote attrs={"id": a_tag_id}) # if link is a footnote
if anchor_tags: if anchor_tags:
if len(anchor_tags) > 1: if len(anchor_tags) > 1:
self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n' self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n"
f'{anchor_tags}\n' f"{anchor_tags}\n"
f' While processing {internal_link_tag}') f" While processing {internal_link_tag}")
anchor_tag = anchor_tags[0] anchor_tag = anchor_tags[0]
assert anchor_tag.attrs['id'] in [new_id, a_tag_id] assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
# if anchor is found we could add placeholder for link creation on server side. # if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
# create span to have cyclic links, link has 1 type of class, anchor another # create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors: if anchor_tag.attrs["id"] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span( new_anchor_span = self.create_new_anchor_span(
soup, new_id) soup, new_id)
anchor_tag.insert_before(new_anchor_span) anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id) self.internal_anchors.add(new_id)
del anchor_tag.attrs['id'] del anchor_tag.attrs["id"]
del internal_link_tag.attrs['href'] del internal_link_tag.attrs["href"]
else: else:
internal_link_tag.attrs['converter-mark'] = 'bad-link' internal_link_tag.attrs["converter-mark"] = "bad-link"
self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.' self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found."
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file."
f' Old id={a_tag_id}') f" Old id={a_tag_id}")
def build_one_chapter(self, nav_point: NavPoint): def detect_one_chapter(self, nav_point: NavPoint):
""" """
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases: 3 cases:
id wraps all chapter content, id wraps all chapter content,
id wraps chapter's content + subchapters' content id wraps chapter"s content + subchapters" content
id points to the start of title of a chapter id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id
and id of the next chapter/subchapter and id of the next chapter/subchapter
Parameters Parameters
---------- ----------
@@ -496,7 +500,7 @@ class EpubConverter:
soup = self.html_href2html_body_soup[nav_point.href] soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks( chapter_tags = get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup) first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser') new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags: for tag in chapter_tags:
new_tree.append(tag) new_tree.append(tag)
self.href_chapter_id2soup_html[( self.href_chapter_id2soup_html[(
@@ -504,16 +508,30 @@ class EpubConverter:
if self.adjacency_list.get(nav_point): if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]: for sub_node in self.adjacency_list[nav_point]:
self.build_one_chapter(sub_node) self.detect_one_chapter(sub_node)
def define_chapters_content(self): def define_chapters_content(self):
"""Function build chapters content, starts from top level chapters""" """Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points: for point in top_level_nav_points:
self.build_one_chapter(point) self.detect_one_chapter(point)
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
Function prepare style, tags to json structure
Parameters
----------
nav_point: NavPoint
lvl: int
level of chapter
Returns
-------
ChapterItem
built chapter
"""
title = nav_point.title title = nav_point.title
if nav_point.id: if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[( content: BeautifulSoup = self.href_chapter_id2soup_html[(
@@ -526,7 +544,7 @@ class EpubConverter:
access=self.access, access=self.access,
path2aws_path=self.book_image_src_path2aws_path, path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file_path.stem book_id=self.file_path.stem
if hasattr(self.file_path, 'stem') else 'book_id') if hasattr(self.file_path, "stem") else "book_id")
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title) title_preprocessed = prepare_title(title)
@@ -534,15 +552,16 @@ class EpubConverter:
remove_title_from_chapter=is_chapter) remove_title_from_chapter=is_chapter)
sub_nodes = [] sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter # warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
if self.adjacency_list.get(nav_point): if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]: for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node_to_livecarta_chapter_item( sub_chapter_item = self.html_node_to_livecarta_chapter_item(
sub_node, lvl + 1) sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item) sub_nodes.append(sub_chapter_item)
if self.logger: if self.logger:
indent = ' ' * lvl indent = " " * lvl
self.logger.log(f'{indent}Chapter: {title} is prepared.') self.logger.log(f"{indent}Chapter: {title} is prepared.")
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self) -> dict: def convert_to_dict(self) -> dict:
@@ -550,12 +569,13 @@ class EpubConverter:
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = [] top_level_chapters = []
for nav_point in top_level_nav_points: # loop through to level chapters
chapter = self.node_to_livecarta_chapter_item(nav_point) for tl_nav_point in top_level_nav_points:
chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point)
top_level_chapters.append(chapter) top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
self.logger.log(f'Anchors found: {len(self.internal_anchors)}.') self.logger.log(f"Anchors found: {len(self.internal_anchors)}.")
self.logger.log('End conversion.') self.logger.log("End conversion.")
return { return {
"content": top_level_dict_chapters, "content": top_level_dict_chapters,
@@ -564,12 +584,12 @@ class EpubConverter:
if __name__ == "__main__": if __name__ == "__main__":
epub_file_path = '../../epub/9781614382264.epub' epub_file_path = "../../epub/9781614382264.epub"
logger_object = BookLogger( logger_object = BookLogger(
name='epub', book_id=epub_file_path.split('/')[-1]) name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object) json_converter = EpubConverter(epub_file_path, logger=logger_object)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json: with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
json.dump(content_dict, f_json, ensure_ascii=False) json.dump(content_dict, f_json, ensure_ascii=False)

View File

@@ -7,7 +7,7 @@ class EpubBook(BookSolver):
def __init__(self, book_id=0, access=None, main_logger=None): def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)
self.book_type = 'epub' self.book_type = "epub"
def get_converted_book(self): def get_converted_book(self):
""" """

View File

@@ -0,0 +1,87 @@
from typing import Tuple
from bs4 import BeautifulSoup, Tag
def _replace_with_livecarta_anchor_tag(anchor, i):
"""Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
new_tag["class"] = "footnote-element"
new_tag["data-id"] = i + 1
new_tag["id"] = f"footnote-{i + 1}"
new_tag.string = "*"
if anchor.parent.name == "sup":
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \
-> Tuple[list, list, list]:
"""
This function preprocessing footnotes
This function should be earlier that adding fonts in pipeline.
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(
attrs={noteref_attr_name: "noteref"})
bad_noterefs_tags = set(
[tag for tag in noterefs_tags if not tag.attrs.get("href")])
noterefs_tags = [
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
new_noterefs_tags = []
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str) -> Tuple[str, str]:
"""Returns name of file & id of an anchor"""
assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id."
f, id_ = s.split("#")
return f, id_
def verify_footnote_tag(tags: list):
"""Function verifies is tag - footnote"""
assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}"
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(
f"Warning. Href for tag is detected as footnote:\n{noteref_tag}")
return anchored_tags
else:
assert 0, f"Error, No element with id: {href} found."
return tags
for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs["href"]
file, element_id = parse_a_tag_href(href)
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(
f"Error while footnotes processing. For {noteref_tag} invalid path: {file}.")
continue
possible_footnote = "note|footnote|endnote|rearenote"
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
attrs={"epub:type": re.compile(possible_footnote)}))
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(
_replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(
attrs={"role": "doc-backlink"}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
return footnotes, new_noterefs_tags, new_footnotes_tags

View File

@@ -1,305 +1,107 @@
import os
import re import re
import pathlib
from typing import Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.access import Access
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
def _replace_with_livecarta_anchor_tag(anchor, i): def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
"""Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
new_tag['id'] = f'footnote-{i + 1}'
new_tag.string = '*'
if anchor.parent.name == 'sup':
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]:
""" """
This function preprocessing footnotes Function adds span with id from tag_to_be_removed
This function should be earlier that adding fonts in pipeline. because this tag will be removed(unwrapped/extract)
Parameters
----------
tag_to_be_removed: Soup object
chapter_tag: BeautifulSoup
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p> Returns
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside> -------
None
updated body tag
""" """
footnotes = [] def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list):
noterefs_tags = source_html_tag.find_all( """Function inserts span before tag aren't supported by livecarta"""
attrs={noteref_attr_name: 'noteref'}) new_tag = chapter_tag.new_tag("span")
bad_noterefs_tags = set( new_tag.attrs["id"] = id_ or ""
[tag for tag in noterefs_tags if not tag.attrs.get('href')]) new_tag.attrs["class"] = class_ or ""
noterefs_tags = [ new_tag.string = "\xa0"
tag for tag in noterefs_tags if tag not in bad_noterefs_tags] tag_to_be_removed.insert_before(new_tag)
new_noterefs_tags = []
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str) -> Tuple[str, str]: if tag_to_be_removed.attrs.get("id"):
"""Returns name of file & id of an anchor""" _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' id_=tag_to_be_removed.attrs["id"],
f, id_ = s.split('#') class_=tag_to_be_removed.attrs.get("class"))
return f, id_
def verify_footnote_tag(tags: list):
"""Function verifies is tag - footnote"""
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(
f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
return anchored_tags
else:
assert 0, f'Error, No element with id: {href} found.'
return tags
for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs['href']
file, element_id = parse_a_tag_href(href)
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(
f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
continue
possible_footnote = 'note|footnote|endnote|rearenote'
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
attrs={'epub:type': re.compile(possible_footnote)}))
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(
_replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(
attrs={'role': 'doc-backlink'}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup: def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup:
""" """
Main function that works with structure of html. Make changes inplace. Main function that works with structure of html. Make changes inplace.
Parameters Parameters
---------- ----------
body_tag: Tag, soup object chapter_tag: Tag, soup object
Steps Steps
---------- ----------
1. Extracts tags that are not needed 1. Extracts tags that are not needed
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
This tag must have a body_tag as a parent. This tag must have a chapter_tag as a parent.
Otherwise, it is wrapped with some tags. Like: Otherwise, it is wrapped with some tags. Like:
<p> <span id='123', class='converter-chapter-mark'> </span> </p> <p> <span id="123", class="converter-chapter-mark"> </span> </p>
3. Headings that are not supported by livecarta converts to <p> 3. Headings that are not supported by livecarta converts to <p>
4. Wrapping NavigableString 4. Wrapping NavigableString
Returns Returns
------- -------
body_tag: Tag, BeautifulSoup chapter_tag: Tag, BeautifulSoup
adjusted body_tag adjusted chapter_tag
""" """
def _preserve_class_in_aside_tag(tag_): def _tags_to_correspond_livecarta_tag(chapter_tag):
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" """Function to replace all tags to correspond livecarta tags"""
# this is for Wiley books with boxes for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items():
tag_class = tag_.attrs['class'] if not isinstance( for key in reg_key:
tag_.attrs['class'], list) else tag_.attrs['class'][0] # text = tag if isinstance(tag, NavigableString) else tag.text
if tag_.parent.name == 'aside': tags = chapter_tag.find_all(re.compile(key))
if not tag_.parent.attrs.get('class'): for tag in tags:
tag_.parent.attrs['class'] = tag_class tag.name = to_replace_value
def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: def _unwrap_tags(chapter_tag):
""" """Function unwrap tags and move id to span"""
Function saves css style inherited from class, copies class to child <p> for tag in LiveCartaConfig. TAGS_TO_UNWRAP:
returns True, if <section> could be unwrapped for s in chapter_tag.find_all(tag):
Parameters _add_span_to_save_ids_for_links(s, chapter_tag)
---------- s.unwrap()
tag_: Tag, soup object
Returns def _mark_parent_is_body(chapter_tag):
------- # check marks for chapter starting are on the same level - 1st
bool marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"})
""" # fix marks to be on 1 level
# this is for Wiley books with boxes for mark in marks:
tag_class = tag_.attrs['class'] if not isinstance( while mark.parent != chapter_tag:
tag_.attrs['class'], list) else tag_.attrs['class'][0] mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
if 'feature' not in tag_class:
return True
child_p_tags = tag_.find_all("p")
if len(child_p_tags) == 1:
child_p_tag = child_p_tags[0]
if not child_p_tag.attrs.get('class'):
child_p_tag.attrs['class'] = tag_class
return True
elif len(child_p_tags) > 1: _tags_to_correspond_livecarta_tag(chapter_tag)
tag_.name = 'p'
return False
else:
return True
def _add_span_to_save_ids_for_links(tag_to_be_removed): _unwrap_tags(chapter_tag)
if tag_to_be_removed.attrs.get('id'):
_insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
id_=tag_to_be_removed.attrs['id'],
class_=tag_to_be_removed.attrs.get('class'))
def _replace_div_tag_with_table(): _mark_parent_is_body(chapter_tag)
"""
Function replace <div> with <table>:
1. Convert div with certain classes to tables
2. Add background color to div with background-color
""" return chapter_tag
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(
div.attrs['class'], list) else div.attrs['class'][0]
if div_class in ['C409', 'C409a']:
_wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
elif div_class in ['C441', 'C816']:
_wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
if div.attrs.get('style'):
if 'background-color' in div.attrs['style']:
end_index = div.attrs['style'].find(
'background-color') + len('background-color')
start_index_of_color = end_index + 2
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
_wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
elif div.attrs.get('style') == '':
del div.attrs['style']
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p'
]
if div.contents:
is_not_struct_tag = [
child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
_add_span_to_save_ids_for_links(div)
div.unwrap()
def _heading_tag_to_p_tag(body_tag):
"""Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'
# comments removal
for tag in body_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
_replace_div_tag_with_table()
for s in body_tag.find_all("section"):
could_be_unwrapped = True
if s.attrs.get('class'):
could_be_unwrapped = _preserve_class_in_section_tag(s)
_add_span_to_save_ids_for_links(s)
if could_be_unwrapped:
s.unwrap()
for s in body_tag.find_all("article"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("figure"):
s.name = 'p'
# to center image inside this tag
s.attrs['style'] = "text-align: center;"
for s in body_tag.find_all("figcaption"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("aside"):
s.name = 'blockquote'
for s in body_tag.find_all("main"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("body"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("html"):
_add_span_to_save_ids_for_links(s)
s.unwrap()
for s in body_tag.find_all("header"):
s.name = 'span'
# check marks for chapter starting are on the same 1 level
marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
parents_marks_are_body = [x.parent == body_tag for x in marks]
# fix marks to be on 1 level
if not all(parents_marks_are_body):
for x in marks:
while x.parent != body_tag:
x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(
parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
_heading_tag_to_p_tag(body_tag)
# wrap NavigableString with <p>
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
content = re.sub(r'([\n\t\xa0])', ' ', content)
content = content.strip()
if content:
tag = body_tag.new_tag('p')
tag.append(str(node))
node.replace_with(tag)
return body_tag
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""After processing on a first_id that corresponds to current chapter, """
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted from initial html_soup all tags from current chapter are extracted
Parameters Parameters
---------- ----------
first_id: first_id: str
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
href: href: str
Name of current chapter's file Name of current chapters file
html_soup: Tag html_soup: Tag
Soup object of current file Soup object of current file
@@ -310,13 +112,13 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
""" """
marked_tags = html_soup.find( marked_tags = html_soup.find(
attrs={'id': first_id, 'class': 'converter-chapter-mark'}) attrs={"id": first_id, "class": "converter-chapter-mark"})
if marked_tags: if marked_tags:
next_tag = marked_tags.next_sibling next_tag = marked_tags.next_sibling
tags = [] tags = []
while next_tag: while next_tag:
if not isinstance(next_tag, NavigableString) and\ if not isinstance(next_tag, NavigableString) and \
(next_tag.attrs.get('class') == 'converter-chapter-mark'): (next_tag.attrs.get("class") == "converter-chapter-mark"):
break break
tags.append(next_tag) tags.append(next_tag)
next_tag = next_tag.next_sibling next_tag = next_tag.next_sibling
@@ -327,182 +129,119 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
html_soup.smooth() html_soup.smooth()
else: else:
assert 0, f'Warning: no match for {first_id, href}' assert 0, f"Warning: no match for {first_id, href}"
return tags return tags
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images to Amazon web service"""
link_path = access.send_image(
img_file_path, doc_id=book_id, img_content=img_content)
return link_path
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images locally"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'../json/img_{book_id}/'))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / os.path.basename(img_file_path)
f = open(new_img_path, 'wb+')
f.write(img_content)
f.close()
return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
href2img_content: dict,
path_to_html: str,
access=None,
path2aws_path: dict = None,
book_id: str = None) -> dict:
"""Function makes dictionary image_src_path -> Amazon web service_path"""
img_tags = body_tag.find_all('img')
for img in img_tags:
path_to_img_from_html = img.attrs.get('src')
html_folder = os.path.dirname(path_to_html)
path_to_img_from_root = os.path.normpath(os.path.join(
html_folder, path_to_img_from_html)).replace('\\', '/')
assert path_to_img_from_root in href2img_content, \
f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
img_content = href2img_content[path_to_img_from_root]
if access is not None:
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]
else:
new_folder = save_image_to_aws(
access, path_to_img_from_root, img_content, book_id)
path2aws_path[path_to_img_from_root] = new_folder
else:
new_folder = save_image_locally(
path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder)
if img.attrs.get('width'):
del img.attrs['width']
if img.attrs.get('height'):
del img.attrs['height']
if img.attrs.get('style'):
del img.attrs['style']
return path2aws_path
def _clean_title_from_numbering(title: str):
"""Function removes numbering from titles"""
title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
def prepare_title(title_of_chapter: str) -> str: def prepare_title(title_of_chapter: str) -> str:
"""Function finalise processing/cleaning title""" """Function finalise processing/cleaning title"""
title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = BeautifulSoup(title_of_chapter, features="lxml").string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r"([\n\t\xa0])", " ", title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip() title_str = re.sub(r" +", " ", title_str).rstrip()
title_str = _clean_title_from_numbering(title_str) # clean whitespace characters ([\r\n\t\f\v ])
title_str = re.sub(r"(^\s+)|(\s+$)", "", title_str)
return title_str return title_str
def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): def _remove_comments(chapter_tag):
"""Function inserts span before tag aren't supported by livecarta""" for tag in chapter_tag.find_all():
new_tag = main_tag.new_tag("span") for element in tag(text=lambda text: isinstance(text, Comment)):
new_tag.attrs['id'] = id_ or '' element.extract()
new_tag.attrs['class'] = class_ or ''
new_tag.string = "\xa0"
tag.insert_before(new_tag)
def _clean_headings_content(content: BeautifulSoup, title: str): def _wrap_strings_with_p(chapter_tag):
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): # wrap NavigableString with <p>
if tag_to_be_removed.attrs.get('id'): for node in chapter_tag:
_insert_span_with_attrs_before_tag(body_tag, if isinstance(node, NavigableString):
tag_to_be_removed, content = str(node)
id_=tag_to_be_removed.attrs.get( content = re.sub(r"([\n\t\xa0])", " ", content)
'id'), # remove spaces at the beginning and at the end of the string:
class_=tag_to_be_removed.attrs.get('class')) content = content.strip()
if content:
tag = chapter_tag.new_tag("p")
tag.append(str(node))
node.replace_with(tag)
for sub_tag in tag_to_be_removed.find_all():
if sub_tag.attrs.get('id'):
_insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
id_=sub_tag.attrs['id'],
class_=sub_tag.attrs.get('class'))
title = title.lower() def _remove_headings_content(content_tag, title_of_chapter: str):
for child in content.contents: """
if isinstance(child, NavigableString): Function
text = child clean/remove headings from chapter in order to avoid duplication of chapter titles in the content
else: add span with id in order to
text = child.text Parameters
if text and re.sub(r'([\n\t\xa0])', '', text): ----------
text = re.sub(r'([\n\t\xa0])', ' ', text) content_tag: soup object
text = re.sub(r' +', ' ', text).strip() Tag of the page
text = text.lower() title_of_chapter: str
if title == text: Chapter title
add_span_to_save_ids_for_links(child, content)
child.extract() Returns
elif (title in text) and (child.name in ['h1', 'h2', 'h3']): -------
add_span_to_save_ids_for_links(child, content) None
child.extract() clean/remove headings & add span with id
"""
title_of_chapter = title_of_chapter.lower()
for tag in content_tag.contents:
text = tag if isinstance(tag, NavigableString) else tag.text
if text:
text = re.sub(r"^[\s\xa0]+|[\s\xa0]+$", " ", text).lower()
if title_of_chapter == text or \
(title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)):
_add_span_to_save_ids_for_links(tag, content_tag)
tag.extract()
break break
def _process_lists(body_tag: BeautifulSoup): # todo remove
def _process_lists(chapter_tag: BeautifulSoup):
""" """
Function Function
- process tags <li>. - process tags <li>.
- unwrap <p> tags. - unwrap <p> tags.
Parameters Parameters
---------- ----------
body_tag: Tag, soup object chapter_tag: Tag, soup object
Returns Returns
------- -------
None None
""" """
li_tags = body_tag.find_all("li") li_tags = chapter_tag.find_all("li")
for li_tag in li_tags: for li_tag in li_tags:
if li_tag.p: if li_tag.p:
li_tag.attrs.update(li_tag.p.attrs) li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap() li_tag.p.unwrap()
def _preprocess_table(chapter_tag: BeautifulSoup):
    """Function to preprocess tables and tags(td|th|tr): style.

    Copies a pixel width parsed from the inline style into the "width"
    attribute (existing attribute wins), strips "border:0;" from inline
    styles, and forces a visible border="1" on borderless tables.
    """
    tables = chapter_tag.find_all("table")
    for table in tables:
        t_tags = table.find_all(re.compile("td|th|tr"))
        for t_tag in t_tags:
            style = t_tag.get("style")
            width = ""
            if style:
                # NOTE(review): the leading [^-] excludes "min-width"/"max-width"
                # but also a "width:" at position 0 of the style — confirm intended.
                width_match = re.search(
                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    # pt values are relabeled as px without conversion
                    size = width_match.group(1)
                    width = size + "px"
            t_tag.attrs["width"] = t_tag.get("width") or width
            if t_tag.attrs.get("style"):
                t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                    "border:0;", "")
            elif t_tag.attrs.get("style") == "":
                del t_tag.attrs["style"]
        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
def _preprocess_code_tags(chapter_tag: BeautifulSoup): def _preprocess_code_tags(chapter_tag: BeautifulSoup):
@@ -523,25 +262,15 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup):
if not code.parent.name == "pre": if not code.parent.name == "pre":
code.name = "span" code.name = "span"
continue continue
    # if tag isn't in pre and doesn't have style # if tag isn't in pre and doesn't have style
if not code.attrs.get('style'): if not code.attrs.get("style"):
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;' code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;"
def _prepare_formatted(text: str) -> str:
"""Function replaces special symbols with their Unicode representation"""
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
text = text.replace(' ', "\xa0")
text = text.replace('𝑓', "\xf0\x9d\x91\x93")
return text
def _preprocess_pre_tags(chapter_tag: BeautifulSoup): def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
""" """
Function preprocessing <pre> tags Function preprocessing <pre> tags
    Wrap string of the tag with <code> if it's necessary Wrap string of the tag with <code> if it's necessary
Parameters Parameters
---------- ----------
chapter_tag: Tag, soup object chapter_tag: Tag, soup object
@@ -564,6 +293,42 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
pre.append(code) pre.append(code)
# todo replace
def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
    """Function wraps <tag> with <table> (table > tbody > tr > td), then
    appends a <br> right after the new table.

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Soup used as tag factory (must provide ``new_tag``).
    tag_to_be_wrapped: Tag
        Tag that ends up inside the created <td>.
    width: str
        Table width in percent.
    border: str
        Value for the table "border" attribute.
    bg_color: str or None
        Optional background color for the <td>.

    Returns
    -------
    Tag
        The created <table> tag.
    """
    table = chapter_tag.new_tag("table")
    table.attrs["border"], table.attrs["align"], table.attrs["style"] \
        = border, "center", f"width:{width}%;"
    tbody, tr, td = \
        chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
    # only set bgcolor when a color is given: a None-valued attribute would be
    # serialized by BeautifulSoup as a bare boolean attribute (<td bgcolor>)
    if bg_color:
        td.attrs["bgcolor"] = bg_color
    tag_to_be_wrapped.wrap(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
    return table
def _preprocess_div_tags(chapter_tag):
    """
    Function replace <div> with <table>:
    styled divs are wrapped in a table and unwrapped, plain divs become <p>.
    """
    for div_tag in chapter_tag.find_all("div"):
        attrs = div_tag.attrs
        if not attrs.get("style"):
            # no inline style: demote to a plain paragraph and keep it
            div_tag.name = "p"
            continue
        _wrap_tag_with_table(
            chapter_tag,
            tag_to_be_wrapped=div_tag,
            width=attrs.get("width") or "100",
            border=attrs.get("border") or None,
            bg_color=attrs.get("bgcolor") or None)
        # preserve any id needed by in-document links before removing the div
        _add_span_to_save_ids_for_links(div_tag, chapter_tag)
        div_tag.unwrap()
def _clean_wiley_block(block): def _clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs: for hr in hrs:
@@ -571,48 +336,30 @@ def _clean_wiley_block(block):
h = block.find(re.compile("h[1-9]")) h = block.find(re.compile("h[1-9]"))
if h: if h:
h.name = "p" h.name = "p"
h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) h.insert_before(BeautifulSoup(features="lxml").new_tag("br"))
def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """Function wraps <block> with <table> (table > tbody > tr > td) and
    inserts a <br> after the created table; returns the <table> tag."""
    table = main_tag.new_tag("table")
    table.attrs['border'] = border
    table.attrs['align'] = 'center'
    table.attrs['style'] = f'width:{width}%;'
    td = main_tag.new_tag("td")
    if bg_color:
        td.attrs['bgcolor'] = bg_color
    # build the nesting from the inside out around the original tag
    old_tag.wrap(td)
    td.wrap(main_tag.new_tag("tr"))
    td.parent.wrap(main_tag.new_tag("tbody"))
    td.parent.parent.wrap(table)
    table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
    return table
def _preprocess_block_tags(chapter_tag: Tag):
    """Function preprocessing <block> tags.

    Wiley "featureN" blockquotes/paragraphs are cleaned and wrapped into
    shaded tables (feature1 -> #DDDDDD, feature2 -> #EEEEEE).
    """
    feature_attrs = {"class": re.compile("feature[1234]")}
    for block in chapter_tag.find_all("blockquote", attrs=feature_attrs):
        _clean_wiley_block(block)
        # NOTE(review): BeautifulSoup normally stores "class" as a list of
        # tokens, so comparing it to a plain string may never match —
        # confirm the feature1/feature2 background colors are applied.
        color = "#DDDDDD" if block.attrs.get(
            "class") == "feature1" else None
        color = "#EEEEEE" if block.attrs.get(
            "class") == "feature2" else color
        _wrap_tag_with_table(chapter_tag, block, bg_color=color)
        block.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
        block.unwrap()
    for future_block in chapter_tag.find_all("p", attrs=feature_attrs):
        _clean_wiley_block(future_block)
        color = "#DDDDDD" if future_block.attrs.get(
            "class") == "feature1" else None
        color = "#EEEEEE" if future_block.attrs.get(
            "class") == "feature2" else color
        _wrap_tag_with_table(chapter_tag, future_block, bg_color=color)
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
@@ -628,10 +375,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
Steps Steps
---------- ----------
1. find \n 1. heading removal
2. heading removal 2. processing tags
3. processing tags 3. class removal
4. class removal
Returns Returns
------- -------
@@ -639,28 +385,27 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
prepared content prepared content
""" """
# 1. find \n # 1. remove comments
to_remove = [] _remove_comments(content_tag)
for child in content_tag.contents:
if isinstance(child, NavigableString):
s = re.sub(r'([\n\t])', '', child.string)
if s == '':
to_remove.append(child)
# 2. heading removal # 2. wrap NavigableString with tag <p>
_wrap_strings_with_p(content_tag)
# 3. heading removal
if remove_title_from_chapter: if remove_title_from_chapter:
_clean_headings_content(content_tag, title_str) _remove_headings_content(content_tag, title_str)
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>) # 4. processing tags (<li>, <table>, <code>, <pre>, <div>, <block>)
_process_lists(content_tag) _process_lists(content_tag)
_preprocess_table(content_tag) _preprocess_table(content_tag)
_preprocess_code_tags(content_tag) _preprocess_code_tags(content_tag)
_preprocess_pre_tags(content_tag) _preprocess_pre_tags(content_tag)
_preprocess_div_tags(content_tag)
_preprocess_block_tags(content_tag) _preprocess_block_tags(content_tag)
# 4. class removal # 5. remove classes that were created by converter
for tag in content_tag.find_all(recursive=True): for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', if hasattr(tag, "attrs") and tag.attrs.get("class") \
'footnote-element']): and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs['class'] del tag.attrs["class"]
return str(content_tag) return str(content_tag)

View File

@@ -0,0 +1,67 @@
import os
import pathlib
from bs4 import BeautifulSoup
from src.access import Access
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images to Amazon web service.

    Delegates to ``access.send_image`` and returns the resulting link path.
    """
    return access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images locally.

    Writes the image bytes into ``../json/img_<book_id>/`` (resolved relative
    to this module's grandparent directory) and returns the written
    pathlib.Path.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f"../json/img_{book_id}/"))
    # assumes ../json already exists (parents are not created) — TODO confirm
    new_path.mkdir(exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager guarantees the handle is closed even if the write fails
    # (original opened with "wb+" and closed manually, leaking on error)
    with open(new_img_path, "wb") as img_file:
        img_file.write(img_content)
    return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
                            href2img_content: dict,
                            path_to_html: str,
                            access=None,
                            path2aws_path: dict = None,
                            book_id: str = None) -> dict:
    """Function makes dictionary image_src_path -> Amazon web service_path.

    Rewrites every <img src> to the stored location (AWS when ``access`` is
    given, local disk otherwise), drops width/height/style attributes, and
    returns the (possibly updated) path -> AWS-path cache.
    """
    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all("img"):
        path_to_img_from_html = img.attrs.get("src")
        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace("\\", "/")
        assert path_to_img_from_root in href2img_content, \
            f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
        img_content = href2img_content[path_to_img_from_root]
        if access is None:
            # NOTE(review): the literal "book_id" (not the book_id argument)
            # is passed here, so all local saves land in img_book_id —
            # confirm this is the intended local/dev behavior.
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, "book_id")
        elif path_to_img_from_root in path2aws_path:
            # reuse the cached upload instead of re-sending the image
            new_folder = path2aws_path[path_to_img_from_root]
        else:
            new_folder = save_image_to_aws(
                access, path_to_img_from_root, img_content, book_id)
            path2aws_path[path_to_img_from_root] = new_folder
        img.attrs["src"] = str(new_folder)
        for attr_name in ("width", "height", "style"):
            if img.attrs.get(attr_name):
                del img.attrs[attr_name]
    return path2aws_path

View File

@@ -21,33 +21,33 @@ class TagStyleConverter:
@staticmethod @staticmethod
def remove_white_if_no_bgcolor(style_, tag): def remove_white_if_no_bgcolor(style_, tag):
"""Function remove text white color if there is no bg color""" """Function remove text white color if there is no bg color"""
if 'background' in style_: if "background" in style_:
style_ = style_.replace( style_ = style_.replace(
'background:', 'background-color:') "background:", "background-color:")
return style_ return style_
# if text color is white, check that we have bg-color # if text color is white, check that we have bg-color
if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_):
# if bg color is inherited, just return style as is # if bg color is inherited, just return style as is
for parent_tag in tag.parents: for parent_tag in tag.parents:
# white bg color not need to be checked as we do not write 'white bg color' # white bg color not need to be checked as we do not write "white bg color"
tag_with_bg = ['span', 'td', 'tr', 'p'] tag_with_bg = ["span", "td", "tr", "p"]
tag_will_be_saved = parent_tag.name in tag_with_bg tag_will_be_saved = parent_tag.name in tag_with_bg
has_bg = parent_tag.attrs.get('style') and ( has_bg = parent_tag.attrs.get("style") and (
'background' in parent_tag.attrs.get('style')) "background" in parent_tag.attrs.get("style"))
if has_bg and tag_will_be_saved: if has_bg and tag_will_be_saved:
return style_ return style_
children = tag.find_all() children = tag.find_all()
for child in children: for child in children:
if child.attrs.get('style') and ('background' in child.attrs.get('style')): if child.attrs.get("style") and ("background" in child.attrs.get("style")):
tmp_style = child.attrs['style'] + '; color:#fff; ' tmp_style = child.attrs["style"] + "; color:#fff; "
child.attrs['style'] = tmp_style child.attrs["style"] = tmp_style
# for child with bg color we added white text color, so this tag don't need white color # for child with bg color we added white text color, so this tag don"t need white color
style_ = style_.replace('color:#fff;', '') style_ = style_.replace("color:#fff;", "")
style_ = style_.replace('color:#ffffff;', '') style_ = style_.replace("color:#ffffff;", "")
style_ = style_.replace('color:white;', '') style_ = style_.replace("color:white;", "")
return style_ return style_
@staticmethod @staticmethod
@@ -68,7 +68,7 @@ class TagStyleConverter:
Parameters Parameters
---------- ----------
split_style: list split_style: list
list of styles split by ';' list of styles split by ";"
Returns Returns
---------- ----------
@@ -79,9 +79,9 @@ class TagStyleConverter:
processed_style = ";".join(split_style) processed_style = ";".join(split_style)
margin_left_regexp = re.compile( margin_left_regexp = re.compile(
r'((margin-left|margin): *(-*\w+);*)') r"((margin-left|margin): *(-*\w+);*)")
text_indent_regexp = re.compile( text_indent_regexp = re.compile(
r'(text-indent: *(-*\w+);*)') r"(text-indent: *(-*\w+);*)")
has_margin = re.search(margin_left_regexp, processed_style) has_margin = re.search(margin_left_regexp, processed_style)
has_text_indent = re.search(text_indent_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style)
@@ -92,21 +92,21 @@ class TagStyleConverter:
if has_text_indent: if has_text_indent:
num_ti = abs(int("0" + "".join( num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2)))))) filter(str.isdigit, str(has_text_indent.group(2))))))
processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(num_m - num_ti)) + 'px; ') str(abs(num_m - num_ti)) + "px; ")
processed_style = processed_style.replace( processed_style = processed_style.replace(
has_margin.group(1), '') has_margin.group(1), "")
return processed_style return processed_style
processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' + processed_style = processed_style.replace(has_margin.group(1), "text-indent: " +
str(abs(num_m)) + 'px; ') str(abs(num_m)) + "px; ")
return processed_style return processed_style
elif has_text_indent: elif has_text_indent:
processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(int("0" + "".join( str(abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2))))))) filter(str.isdigit, str(has_text_indent.group(2)))))))
+ 'px; ') + "px; ")
return processed_style return processed_style
return processed_style return processed_style
@@ -126,18 +126,18 @@ class TagStyleConverter:
processed inline style processed inline style
""" """
inline_style = self.tag_inline_style.attrs.get('style') + ';' inline_style = self.tag_inline_style.attrs.get("style") + ";"
# 1. Remove white color if tag doesn't have background color in style # 1. Remove white color if tag doesn"t have background color in style
inline_style = self.remove_white_if_no_bgcolor( inline_style = self.remove_white_if_no_bgcolor(
inline_style, self.tag_inline_style) inline_style, self.tag_inline_style)
inline_style = inline_style.replace( inline_style = inline_style.replace(
'list-style-image', 'list-style-type') "list-style-image", "list-style-type")
# 2. Create list of styles from inline style # 2. Create list of styles from inline style
# replace all spaces between '; & letter' to ';' # replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", inline_style) style = re.sub(r"; *", ";", inline_style)
# when we split style by ';', last element of the list is '' - None (remove it) # when we split style by ";", last element of the list is "" - None (remove it)
split_inline_style: list = list(filter(None, style.split(';'))) split_inline_style: list = list(filter(None, style.split(";")))
# 3. Duplicate styles check - if the tag had duplicate styles # 3. Duplicate styles check - if the tag had duplicate styles
split_inline_style = self.duplicate_styles_check(split_inline_style) split_inline_style = self.duplicate_styles_check(split_inline_style)
@@ -164,7 +164,7 @@ class TagStyleConverter:
""" """
styles_to_remove = [] styles_to_remove = []
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style: if f"{k[0]}:{k[1]}" in style:
styles_to_remove.append(k) styles_to_remove.append(k)
return styles_to_remove return styles_to_remove
@@ -172,11 +172,11 @@ class TagStyleConverter:
# adds <strong>, <u>, <sup> instead of styles # adds <strong>, <u>, <sup> instead of styles
styles_to_remove = self.check_style_to_be_tag(self.style) styles_to_remove = self.check_style_to_be_tag(self.style)
for i, (attr, value) in enumerate(styles_to_remove): for i, (attr, value) in enumerate(styles_to_remove):
self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\ self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
.replace(f'{attr}:{value};', '').strip() .replace(f"{attr}:{value};", "").strip()
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)] attr, value)]
correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name) correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
for content in reversed(self.tag_inline_style.contents): for content in reversed(self.tag_inline_style.contents):
correspond_tag.insert(0, content.extract()) correspond_tag.insert(0, content.extract())
self.tag_inline_style.append(correspond_tag) self.tag_inline_style.append(correspond_tag)
@@ -184,34 +184,34 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag): def wrap_span_in_tag_to_save_style_attrs(initial_tag):
"""Function designed to save style attrs that cannot be in tag.name -> span""" """Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG))
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'): if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
styles_can_be_in_tag = [style styles_can_be_in_tag = [style
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items()
if re.match(tag, initial_tag.name) if re.match(tag, initial_tag.name)
for style in styles] for style in styles]
styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag] if attr not in styles_can_be_in_tag]
span_style = initial_tag.attrs['style'] span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same. # here check that this style is exactly the same.
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top' # Not "align" when we have "text-align", or "border" when we have "border-top"
styles_to_be_saved_in_span = [((attr + ':') in span_style) & ( styles_to_be_saved_in_span = [((attr + ":") in span_style) & (
'-' + attr not in span_style) for attr in styles_cant_be_in_tag] "-" + attr not in span_style) for attr in styles_cant_be_in_tag]
if any(styles_to_be_saved_in_span): if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in <tag.name> -> wrap them in span # if we find styles that cannot be in <tag.name> -> wrap them in span
tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}') tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
style = '' style = ""
possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag] possible_attrs_regexp = [re.compile(fr"({style}: *(\w+);)") for style in styles_can_be_in_tag]
for possible_attr_regexp in possible_attrs_regexp: for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search( has_style_attrs = re.search(
possible_attr_regexp, span_style) possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1): if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1) style += has_style_attrs.group(1)
span_style = span_style.replace( span_style = span_style.replace(
has_style_attrs.group(1), '') has_style_attrs.group(1), "")
tag.attrs['style'] = style tag.attrs["style"] = style
initial_tag.name = 'span' initial_tag.name = "span"
initial_tag.attrs['style'] = span_style initial_tag.attrs["style"] = span_style
initial_tag.wrap(tag) initial_tag.wrap(tag)
def convert_initial_tag(self): def convert_initial_tag(self):
@@ -246,10 +246,10 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) ->
disable_validation=True, disable_validation=True,
) )
# soup with converted styles from css # soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')}) attrs={"style": re.compile(".*")})
# go through the tags with inline style + style parsed from css file # go through the tags with inline style + style parsed from css file
for tag_inline_style in tags_with_inline_style: for tag_inline_style in tags_with_inline_style:

View File

@@ -9,12 +9,12 @@ class LiveCartaConfig:
HEADERS_LEVELS = {"h1", "h2", "h3", HEADERS_LEVELS = {"h1", "h2", "h3",
"h4", "h5", "h6", "h7", "h8", "h9"} "h4", "h5", "h6", "h7", "h8", "h9"}
DEFAULT_ALIGN_STYLE = 'left' DEFAULT_ALIGN_STYLE = "left"
ALIGN_STYLES = ['justify', 'right', 'center', 'left'] ALIGN_STYLES = ["justify", "right", "center", "left"]
# Main constant values # Main constant values
DEFAULT_FONT_NAME = 'Times New Roman' DEFAULT_FONT_NAME = "Times New Roman"
WORD_DEFAULT_FONT_SIZE = 11 WORD_DEFAULT_FONT_SIZE = 11
@@ -38,65 +38,65 @@ class LiveCartaConfig:
} }
COLORS_MAP = { COLORS_MAP = {
'#ffff00': 'yellow', "#ffff00": "yellow",
'#00ff00': 'darkYellow', "#00ff00": "darkYellow",
'#00ffff': 'cyan', "#00ffff": "cyan",
'#ff00ff': 'magenta', "#ff00ff": "magenta",
'#0000ff': 'blue', "#0000ff": "blue",
'#ff0000': 'red', "#ff0000": "red",
'#000080': 'darkBlue', "#000080": "darkBlue",
'#008080': 'darkCyan', "#008080": "darkCyan",
'#008000': 'green', "#008000": "green",
'#800080': 'darkMagenta', "#800080": "darkMagenta",
'#808000': 'darkGreen', "#808000": "darkGreen",
'#c0c0c0': 'lightGray', "#c0c0c0": "lightGray",
'#ffffff': 'white', "#ffffff": "white",
'#800000': '#800000', "#800000": "#800000",
'#808080': '#808080' "#808080": "#808080"
} }
HTML42LIVECARTA_COLORS = { HTML42LIVECARTA_COLORS = {
'yellow': 'yellow', "yellow": "yellow",
'lime': 'green', "lime": "green",
'aqua': 'cyan', "aqua": "cyan",
'fuchsia': 'magenta', "fuchsia": "magenta",
'blue': 'blue', "blue": "blue",
'red': 'red', "red": "red",
'navy': 'darkBlue', "navy": "darkBlue",
'teal': 'darkCyan', "teal": "darkCyan",
'green': 'darkGreen', "green": "darkGreen",
'purple': 'darkMagenta', "purple": "darkMagenta",
'olive': 'darkYellow', "olive": "darkYellow",
'silver': 'lightGray', "silver": "lightGray",
'white': 'white', "white": "white",
'maroon': 'darkRed', # '#800000', "maroon": "darkRed", # "#800000",
'gray': 'darkGray', "gray": "darkGray",
'grey': 'darkGray', "grey": "darkGray",
} }
INDENT = '30px' INDENT = "30px"
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', sizes_px = ["0px", "10px", "10px", "11px", "12px", "13px", "14px", "15px", "16px", "17px", "18px",
'19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', "19px", "20px", "21px", "22px", "23px", "24px", "25px", "26px", "27px", "28px", "29px",
'30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px', "30px", "31px", "32px", "33px", "34px", "35px", "36px", "37px", "38px", "39px", "40px",
'41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] "41px", "42px", "43px", "44px", "45px", "46px", "47px", "48px", "49px", "50px", "64px", "72px"]
list_types = ['circle', 'disc', 'armenian', 'decimal', list_types = ["circle", "disc", "armenian", "decimal",
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin",
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
structural_tags_names = [ structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', "div", "section", "article", "main", "body", "html", "aside",
'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' "canvas", "data", "figure", "footer", "iframe", "span", "p"
] ]
could_have_style_in_livecarta_regexp = re.compile( could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)")
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
@@ -104,23 +104,34 @@ class LiveCartaConfig:
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p> <p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong', ("font-weight", "bold"): "strong",
('font-weight', '600'): 'strong', ("font-weight", "600"): "strong",
('font-weight', '700'): 'strong', ("font-weight", "700"): "strong",
('font-weight', '800'): 'strong', ("font-weight", "800"): "strong",
('font-weight', '900'): 'strong', ("font-weight", "900"): "strong",
('font-style', 'italic'): 'i', ("font-style", "italic"): "i",
('text-decoration', 'underline'): 'u', ("text-decoration", "underline"): "u",
('text-decoration', 'line-through'): 's', ("text-decoration", "line-through"): "s",
('text-decoration-line', 'underline'): 'u', ("text-decoration-line", "underline"): "u",
('text-decoration-line', 'line-through'): 's', ("text-decoration-line", "line-through"): "s",
('vertical-align', 'super'): 'sup' ("vertical-align", "super"): "sup"
} }
LIVECARTA_STYLES_CANT_BE_IN_TAG = { LIVECARTA_STYLES_CANT_BE_IN_TAG = {
'p': ['text-align', 'text-indent', 'border-bottom', 'border-top'], "p": ["text-align", "text-indent", "border-bottom", "border-top"],
'li': ['text-align', 'list-style-type'], "li": ["text-align", "list-style-type"],
'ul': ['list-style-type'], "ul": ["list-style-type"],
'ol': ['list-style-type'], "ol": ["list-style-type"],
'(^h[1-9]$)': ['list-style-type'] r"(^h[1-9]$)": ["list-style-type"]
} }
REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = {
(r"^h[6-9]$", "figure$", "section$"): "p",
("^aside$",): "blockquote",
("^header$",): "span",
("^b$",): "strong",
}
TAGS_TO_UNWRAP = [
"section", "article", "figcaption", "main", "body", "html",
]