.
@@ -672,74 +513,76 @@ class HTMLDocxPreprocessor:
uwrap tag with li
"""
-
li_tags = self.body_tag.find_all("li")
-
for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap()
- def process_html(self, access=None, html_path='', book_id='local'):
+ def delete_content_before_toc(self):
+        # Remove every tag that comes before the <TOC> marker in self.content. NOTE: self.body_tag itself is not updated.
+ toc_tag = self.html_soup.new_tag("TOC")
+ self.content: List[Tag] = self.body_tag.find_all(recursive=False)
+ if toc_tag in self.content:
+ ind = self.content.index(toc_tag) + 1
+ self.content = self.content[ind:]
+
+ def process_html(self, access=None, html_path="", book_id=0):
"""Process html code to satisfy LiveCarta formatting."""
- self.logger_object.log('Beginning of processing .html file.')
+ self.logger_object.log("Beginning of processing .html file.")
try:
- self.logger_object.log(f'Processing TOC and headers.')
+ self.logger_object.log(f"Processing TOC and headers.")
self._process_toc_links()
self.clean_trash()
# process main elements of the .html doc
- self.logger_object.log(f'Processing main elements of html.')
+ self.logger_object.log(f"Processing main elements of html.")
self._preprocessing_headings()
self._process_paragraph()
self._process_two_columns()
- self.logger_object.log('Block quotes processing.')
+ self.logger_object.log("Block quotes processing.")
self._process_quotes()
- self.logger_object.log('Tables processing.')
+ self.logger_object.log("Tables processing.")
self._process_tables()
self.logger_object.log(
- f'{self.tables_amount} tables have been processed.')
+ f"{self.tables_amount} tables have been processed.")
- self.logger_object.log('Hrefs processing.')
+ self.logger_object.log("Hrefs processing.")
self._process_hrefs()
- self.logger_object.log('Footnotes processing.')
- self._process_footnotes()
+ self.logger_object.log("Footnotes processing.")
+ self.footnotes = process_footnotes(self.body_tag)
self.logger_object.log(
- f'{len(self.footnotes)} footnotes have been processed.')
+ f"{len(self.footnotes)} footnotes have been processed.")
- self.logger_object.log('Image processing.')
- self._process_images(
- access=access, html_path=html_path, book_id=book_id)
+ self.logger_object.log("Image processing.")
+ self.images = process_images(access=access, html_path=html_path,
+ book_id=book_id, body_tag=self.body_tag)
self.logger_object.log(
- f'{len(self.images)} images have been processed.')
+ f"{len(self.images)} images have been processed.")
self._process_footer()
self._process_div()
- self.content = self.body_tag.find_all(recursive=False)
-
self.top_level_headers = self._get_top_level_headers()
self._mark_introduction_headers()
self._process_headings()
- self.content: List[Tag] = self.body_tag.find_all(recursive=False)
-
self._process_lists()
# delete text before table of content if exists
self.delete_content_before_toc()
except Exception as exc:
self.logger_object.log(
- 'Error has occurred while processing html.', logging.ERROR)
+ "Error has occurred while processing html.", logging.ERROR)
self.logger_object.log_error_to_main_log()
if self.status_wrapper:
self.status_wrapper.set_error()
raise exc
- self.logger_object.log('End of processing .html file.')
+ self.logger_object.log("End of processing .html file.")
return self.content, self.footnotes, self.top_level_headers
diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py
new file mode 100644
index 0000000..9c5fdab
--- /dev/null
+++ b/src/docx_converter/image_processing.py
@@ -0,0 +1,34 @@
+import os
+import pathlib
+from shutil import copyfile
+
+
+def process_images(access, html_path, book_id, body_tag):
+ """
+    Function to process the <img>
+    tag.
+    Images should be sent to Amazon S3 and the tag returned with a valid link.
+ For now images are moved to one folder.
+
+ """
+ img_tags = body_tag.find_all("img")
+ for img in img_tags:
+ img_name = img.attrs.get("src")
+        # quick fix for relative src links that start with "../"
+ if (len(img_name) >= 3) and img_name[:3] == "../":
+ img_name = img_name[3:]
+ img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}")
+
+ if access is not None:
+ link = access.send_image(img_path, doc_id=book_id)
+ img.attrs["src"] = link
+ else:
+ if img_tags.index(img) == 0:
+ folder_path = os.path.dirname(
+ os.path.dirname(os.path.abspath(__file__)))
+ new_path = pathlib.Path(os.path.join(
+ folder_path, f"../books/json/img_{book_id}/"))
+ new_path.mkdir(exist_ok=True)
+ new_img_path = new_path / img_name
+ copyfile(img_path, new_img_path)
+ img.attrs["src"] = str(new_img_path)
+ return img_tags
diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py
index 45522da..eb5f0a2 100644
--- a/src/docx_converter/libre_html2json_converter.py
+++ b/src/docx_converter/libre_html2json_converter.py
@@ -29,7 +29,7 @@ class LibreHTML2JSONConverter:
cleaned text
"""
- new_text = re.sub(r'([\n\t])', ' ', html_text)
+ new_text = re.sub(r"([\n\t])", " ", html_text)
return new_text
# TODO: rethink the function structure without indexes.
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter:
"""
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
title = str(self.content[ind])
- title = title.replace(f'<{self.content[ind].name}>', '')
- title = title.replace(f'{self.content[ind].name}>', '')
- title = re.sub(r'^\n', '', title)
+ title = title.replace(f"<{self.content[ind].name}>", "")
+ title = title.replace(f"{self.content[ind].name}>", "")
+ title = re.sub(r"^\n", "", title)
# extract outline from tag
curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
result = {
- 'title': f'{title}',
- 'contents': [],
- 'sub_items': []
+ "title": f"{title}",
+ "contents": [],
+ "sub_items": []
}
ch_content = []
ind += 1
@@ -71,9 +71,9 @@ class LibreHTML2JSONConverter:
header_dict, ind = self.header_to_livecarta_chapter_item(
ind)
if ch_content:
- result['contents'].append("".join(ch_content))
+ result["contents"].append("".join(ch_content))
ch_content = []
- result['sub_items'].append(header_dict)
+ result["sub_items"].append(header_dict)
# - current h_i <= h_initial, end of recursion
else:
# return result, ind
@@ -85,21 +85,21 @@ class LibreHTML2JSONConverter:
ind += 1
if ch_content:
- result['contents'].append("".join(ch_content))
+ result["contents"].append("".join(ch_content))
return result, ind
- return ''
+ return ""
@staticmethod
def _is_empty_p_tag(tag):
- if tag.name != 'p':
+ if tag.name != "p":
return False
temp_tag = copy(tag)
- brs = temp_tag.find_all('br')
+ brs = temp_tag.find_all("br")
for br in brs:
br.decompose()
- text = re.sub(r'\s+', '', temp_tag.text)
+ text = re.sub(r"\s+", "", temp_tag.text)
if text:
return False
@@ -107,10 +107,7 @@ class LibreHTML2JSONConverter:
def convert_to_dict(self):
"""Function which convert list of html nodes to appropriate json structure."""
- json_strc = []
- ind = 0
- ch_num = 0
- ch_amt = 0
+ json_strc, ind, ch_num, ch_amt = [], 0, 0, 0
try:
while ind < len(self.content):
@@ -120,7 +117,7 @@ class LibreHTML2JSONConverter:
res, ind = self.header_to_livecarta_chapter_item(ind)
else:
- chapter_title = f'Untitled chapter {ch_num}'
+ chapter_title = f"Untitled chapter {ch_num}"
chapter = []
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]):
@@ -129,9 +126,9 @@ class LibreHTML2JSONConverter:
ind += 1
if chapter:
res = {
- 'title': chapter_title,
- 'contents': ["".join(chapter)],
- 'sub_items': []
+ "title": chapter_title,
+ "contents": ["".join(chapter)],
+ "sub_items": []
}
ch_num += 1
@@ -139,10 +136,10 @@ class LibreHTML2JSONConverter:
json_strc.append(res)
ch_amt += 1
self.logger_object.log(
- f'Chapter {ch_amt} has been added to structure.')
+ f"Chapter {ch_amt} has been added to structure.")
except Exception as exc:
self.logger_object.log(
- 'Error has occurred while making json structure.', logging.ERROR)
+ "Error has occurred while making json structure.", logging.ERROR)
self.logger_object.log_error_to_main_log()
if self.book_api_status:
self.book_api_status.set_error()
@@ -151,10 +148,10 @@ class LibreHTML2JSONConverter:
# Add is_introduction field to json structure
# after deleting content before toc, some chapters can be deleted
if self.top_level_headers:
- same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
- is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
+ same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
+ is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
- json_strc[0]['is_introduction'] = is_first_header_introduction
+ json_strc[0]["is_introduction"] = is_first_header_introduction
self.content_dict = {
"content": json_strc,
diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py
deleted file mode 100644
index 2212bd5..0000000
--- a/src/epub_converter/css_preprocessing.py
+++ /dev/null
@@ -1,238 +0,0 @@
-import re
-import cssutils
-
-from ebooklib import epub
-from bs4 import BeautifulSoup
-from itertools import takewhile
-
-from src.util.color_reader import str2hex
-from src.livecarta_config import LiveCartaConfig
-
-
-def get_text_color(x):
- color = str2hex(x)
- color = color if color not in ['#000000', '#000', 'black'] else ''
- return color
-
-
-def get_bg_color(x):
- color = str2hex(x)
- color = color if color not in ['#ffffff', '#fff', 'white'] else ''
- return color
-
-
-def convert_tag_style_values(size_value: str) -> str:
- """
- Function
- - converts values of tags from em/%/pt to px
- - find closest font-size px
- Parameters
- ----------
- size_value: str
-
- Returns
- -------
- size_value: str
-
- """
- def find_closest_size(style_value):
- possible_sizes = list(
- takewhile(lambda x: style_value >= x, LiveCartaConfig.sizes_pr))
- last_possible_size_index = LiveCartaConfig.sizes_pr.index(
- possible_sizes[-1])
- return LiveCartaConfig.sizes_px[last_possible_size_index]
-
- font_size_regexp = re.compile(
- r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
- has_style_attrs = re.search(font_size_regexp, size_value)
- if has_style_attrs:
- if has_style_attrs.group(1):
- size_value = float(size_value.replace('%', '')) / 100.0
- return find_closest_size(size_value)
- elif has_style_attrs.group(3):
- size_value = float(size_value.replace('em', ''))
- return find_closest_size(size_value)
- elif has_style_attrs.group(5):
- return size_value.replace('pt', 'px')
- else:
- return ''
- return size_value
-
-
-def convert_indents_tag_values(size_value: str) -> str:
- """
- Function converts values of ['text-indent', 'margin-left', 'margin']
- Parameters
- ----------
- size_value: str
-
- Returns
- -------
- size_value: str
-
- """
- if len(size_value.split(' ')) == 3:
- size_value = convert_tag_style_values(size_value.split(
- ' ')[-2]) # returns middle value
- else:
- size_value = convert_tag_style_values(size_value.split(
- ' ')[-1]) # returns last value
- return size_value
-
-
-"""
-Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
-Style properties that can be used to fit livecarta css style convention.
-If property has empty list, it means that any value can be converted.
-If property has not empty list, it means that only certain property-value combinations can be transformed.
-"""
-LIVECARTA_STYLE_ATTRS = {
- 'text-indent': [],
- 'font-variant': ['small-caps'],
- 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
- 'align': [],
- 'font': [],
- 'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys()
- if x != LiveCartaConfig.DEFAULT_FONT_NAME],
- 'font-size': [],
- 'font-weight': ['bold', '600', '700', '800', '900'], #
- 'font-style': ['italic'], #
- 'text-decoration': ['underline', 'line-through'], # ,
- 'text-decoration-line': ['underline', 'line-through'], # ,
- 'vertical-align': ['super'], #
- 'color': [],
- 'background-color': [],
- 'background': [],
- 'width': [],
- 'border': [],
- 'border-top-width': [],
- 'border-right-width': [],
- 'border-left-width': [],
- 'border-bottom-width': [],
- 'border-top': [],
- 'border-bottom': [],
- 'list-style-type': [],
- 'list-style-image': [],
- 'margin-left': [],
- 'margin-top': [],
- 'margin': [],
-}
-
-"""
-Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
-
-Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
-to suit livecarta style convention.
-"""
-LIVECARTA_STYLE_ATTRS_MAPPING = {
- 'text-indent': convert_indents_tag_values,
- 'font-variant': lambda x: x,
- 'text-align': lambda x: x,
- 'font': lambda x: '',
- 'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title()))
- or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)),
- 'font-size': convert_tag_style_values,
- 'color': get_text_color,
- 'background-color': get_bg_color,
- 'background': get_bg_color,
- 'border': lambda x: x if x != '0' else '',
- 'border-top-width': lambda x: x if x != '0' else '',
- 'border-right-width': lambda x: x if x != '0' else '',
- 'border-left-width': lambda x: x if x != '0' else '',
- 'border-bottom-width': lambda x: x if x != '0' else '',
- 'border-top': lambda x: x if x != '0' else '',
- 'border-bottom': lambda x: x if x != '0' else '',
- 'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc',
- 'list-style-image': lambda x: 'disc',
- 'margin-left': convert_indents_tag_values,
- 'margin-top': convert_tag_style_values,
- 'margin': convert_indents_tag_values
-}
-
-
-def update_inline_styles_to_livecarta_convention(split_style: list):
- for i, style in enumerate(split_style):
- style_name, style_value = style.split(":")
- if style_name not in LIVECARTA_STYLE_ATTRS:
- # property not in LIVECARTA_STYLE_ATTRS, remove from css file
- split_style[i] = ''
- return split_style
-
- cleaned_value = style_value.replace('\"', '').split()[-1]
- constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
- style_name)
- value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
- style_name]
- if constraints_on_value and value_not_in_possible_values_list:
- # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
- split_style[i] = ''
- else:
- if style_name in LIVECARTA_STYLE_ATTRS_MAPPING:
- # function that converts our data
- func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
- style_value = func(cleaned_value)
- split_style[i] = style_name + ":" + style_value
- return split_style
-
-
-def build_inline_style_content(style: str) -> str:
- """Build inline style with livecarta convention"""
- # replace all spaces between '; & letter' to ';'
- style = re.sub(r"; *", ";", style)
- # when we split style by ';', last element of the list is '' - None
- # remove it
- split_style: list = list(filter(None, style.split(';')))
- # replace all spaces between ': & letter' to ':'
- split_style = [el.replace(
- re.search(r'(:\s*)', el).group(1), ':') for el in split_style]
-
- split_style = update_inline_styles_to_livecarta_convention(split_style)
- style = "; ".join(split_style)
- return style
-
-
-def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule,
- style_type: cssutils.css.property.Property):
- if style_type.name not in LIVECARTA_STYLE_ATTRS:
- # property not in LIVECARTA_STYLE_ATTRS, remove from css file
- css_rule.style[style_type.name] = ''
- return
-
- cleaned_value = style_type.value.replace('\"', '')
- constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
- style_type.name)
- value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
- style_type.name]
- if constraints_on_value and value_not_in_possible_values_list:
- # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
- css_rule.style[style_type.name] = ''
- else:
- if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING:
- # function that converts our data
- func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name]
- css_rule.style[style_type.name] = func(cleaned_value)
-
-
-def build_css_file_content(css_content: str) -> str:
- """Build css content with livecarta convention"""
- sheet = cssutils.parseString(css_content.lower(), validate=False)
-
- for css_rule in sheet:
- if css_rule.type == css_rule.STYLE_RULE:
- for style_type in css_rule.style:
- update_css_styles_to_livecarta_convention(
- css_rule, style_type)
-
- css_text: str = sheet._getCssText().decode()
- return css_text
-
-
-if __name__ == '__main__':
- file = '../../epub/9781627222174.epub'
- ebooklib_book = epub.read_epub(file)
- css_ = ebooklib_book.get_item_with_href('css/epub.css')
- css_ = css_.get_content().decode()
- css_cleaned = build_css_file_content(css_)
- html_ = ebooklib_book.get_item_with_href(
- 'pr01s05.xhtml').get_body_content().decode()
- html_soup = BeautifulSoup(html_, features='lxml')
diff --git a/src/epub_converter/css_processor.py b/src/epub_converter/css_processor.py
new file mode 100644
index 0000000..2be0dab
--- /dev/null
+++ b/src/epub_converter/css_processor.py
@@ -0,0 +1,216 @@
+import re
+import cssutils
+from bs4 import BeautifulSoup
+from os.path import dirname, normpath, join
+
+from src.util.color_reader import str2hex
+from src.livecarta_config import LiveCartaConfig
+
+
+class CSSPreprocessor:
+ def __init__(self):
+ """
+ Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
+
+ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
+ to suit LiveCarta style convention.
+ """
+ self.LIVECARTA_STYLE_ATTRS_MAPPING = {
+ "text-indent": self.convert_indents_tag_values,
+ "font-variant": lambda x: x,
+ "text-align": lambda x: x,
+ "font": lambda x: "",
+ "font-family": lambda x: x,
+ "font-size": self.convert_tag_style_values,
+ "color": self.get_text_color,
+ "background-color": self.get_bg_color,
+ "background": self.get_bg_color,
+ "border": lambda x: x if x != "0" else "",
+ "border-top-width": lambda x: x if x != "0" else "",
+ "border-right-width": lambda x: x if x != "0" else "",
+ "border-left-width": lambda x: x if x != "0" else "",
+ "border-bottom-width": lambda x: x if x != "0" else "",
+ "border-top": lambda x: x if x != "0" else "",
+ "border-bottom": lambda x: x if x != "0" else "",
+ "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
+ "list-style-image": lambda x: "disc",
+ "margin-left": self.convert_indents_tag_values,
+ "margin-top": self.convert_tag_style_values,
+ "margin": self.convert_indents_tag_values,
+ "width": self.convert_tag_style_values,
+ }
+
+ @staticmethod
+ def get_text_color(x):
+ color = str2hex(x)
+ color = color if color not in ["#000000", "#000", "black"] else ""
+ return color
+
+ @staticmethod
+ def get_bg_color(x):
+ color = str2hex(x)
+ color = color if color not in ["#ffffff", "#fff", "white"] else ""
+ return color
+
+ @staticmethod
+ def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str:
+ """
+ Function
+ - converts values of tags from em/%/pt to px
+ - find closest font-size px
+ Parameters
+ ----------
+ size_value: str
+
+        is_indent: bool, True when converting indent/margin values (different px multipliers apply)
+
+ Returns
+ -------
+ size_value: str
+ converted value size
+ """
+ size_regexp = re.compile(
+ r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)")
+ has_style_attrs = re.search(size_regexp, size_value)
+ if has_style_attrs:
+ if has_style_attrs.group(1):
+ multiplier = 5.76 if is_indent else 0.16
+ size_value = float(size_value.replace("%", "")) * multiplier
+ return str(size_value)+'px'
+ elif has_style_attrs.group(3):
+ multiplier = 18 if is_indent else 16
+ size_value = float(size_value.replace("em", "")) * multiplier
+ return str(size_value)+'px'
+ elif has_style_attrs.group(5):
+ size_value = float(size_value.replace("pt", "")) * 4/3
+ return str(size_value)+'px'
+ else:
+ return ""
+ return size_value
+
+ def convert_indents_tag_values(self, size_value: str) -> str:
+ """
+ Function converts values of ["text-indent", "margin-left", "margin"]
+ Parameters
+ ----------
+ size_value: str
+
+ Returns
+ -------
+ size_value: str
+
+ """
+ size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
+ else self.convert_tag_style_values(size_value.split(" ")[-1], True)
+ return size_value
+
+ @staticmethod
+ def clean_value(style_value: str, style_name: str):
+ cleaned_value = style_value.replace("\"", "")
+ if style_name == 'font-family':
+ for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]:
+ cleaned_value = re.sub(
+ re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value)
+ return cleaned_value
+
+ @staticmethod
+ def style_conditions(style_value: str, style_name: str) -> tuple[bool, bool]:
+ constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
+ style_name)
+ value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[
+ style_name]
+ return constraints_on_value, value_not_in_possible_values_list
+
+ def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
+ for i, style in enumerate(split_style):
+ style_name, style_value = style.split(":")
+ if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
+ # property not in LIVECARTA_STYLE_ATTRS, remove from css file
+ split_style[i] = ""
+ return split_style
+
+ cleaned_value = self.clean_value(style_value, style_name)
+ if all(self.style_conditions(cleaned_value, style_name)):
+ # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
+ split_style[i] = ""
+ else:
+ if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
+ # function that converts our data
+ func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
+ style_value = func(cleaned_value)
+ split_style[i] = style_name + ":" + style_value
+ return split_style
+
+ def build_inline_style_content(self, style: str) -> str:
+ """Build inline style with LiveCarta convention"""
+ # replace all spaces between "; & letter" to ";"
+ style = re.sub(r"; *", ";", style)
+ # when we split style by ";", last element of the list is "" - None (we remove it)
+ split_style: list = list(filter(None, style.split(";")))
+ # replace all spaces between ": & letter" to ":"
+ split_style = [el.replace(
+ re.search(r"(:\s*)", el).group(1), ":") for el in split_style]
+
+ split_style = self.update_inline_styles_to_livecarta_convention(
+ split_style)
+ style = "; ".join(split_style)
+ return style
+
+ def process_inline_styles_in_html_soup(self, html_href2html_body_soup: dict):
+ """This function is designed to convert inline html styles"""
+ for html_href in html_href2html_body_soup:
+ html_content: BeautifulSoup = html_href2html_body_soup[html_href]
+ tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
+ attrs={"style": re.compile(".*")})
+
+ for tag_initial_inline_style in tags_with_inline_style:
+ inline_style = tag_initial_inline_style.attrs["style"]
+ tag_initial_inline_style.attrs["style"] = \
+ self.build_inline_style_content(inline_style)
+
+ @staticmethod
+ def get_css_content(css_href, html_href, ebooklib_book):
+ path_to_css_from_html = css_href
+ html_folder = dirname(html_href)
+ path_to_css_from_root = normpath(
+ join(html_folder, path_to_css_from_html)).replace("\\", "/")
+ css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
+ # if in css file we import another css
+ if "@import" in str(css_obj.content):
+ path_to_css_from_root = "css/" + \
+ re.search('"(.*)"', str(css_obj.content)).group(1)
+ css_obj = ebooklib_book.get_item_with_href(
+ path_to_css_from_root)
+ assert css_obj, f"Css style {css_href} was not in manifest."
+ css_content: str = css_obj.get_content().decode()
+ return css_content
+
+ def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule,
+ style_type: cssutils.css.property.Property):
+ if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
+ # property not in LIVECARTA_STYLE_ATTRS, remove from css file
+ css_rule.style[style_type.name] = ""
+ return
+
+ cleaned_value = self.clean_value(style_type.value, style_type.name)
+ if all(self.style_conditions(cleaned_value, style_type.name)):
+ # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
+ css_rule.style[style_type.name] = ""
+ else:
+ if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
+ # function that converts our data
+ func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name]
+ css_rule.style[style_type.name] = func(cleaned_value)
+
+ def build_css_file_content(self, css_content: str) -> str:
+ """Build css content with LiveCarta convention"""
+ sheet = cssutils.parseString(css_content, validate=False)
+
+ for css_rule in sheet:
+ if css_rule.type == css_rule.STYLE_RULE:
+ for style_type in css_rule.style:
+ self.update_css_styles_to_livecarta_convention(
+ css_rule, style_type)
+
+ css_text: str = sheet._getCssText().decode()
+ return css_text
diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py
index dc8d3a2..fb3b786 100644
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -1,39 +1,40 @@
import re
import json
import codecs
-import os
-from os.path import dirname, normpath, join
-from itertools import chain
-from collections import defaultdict
-from typing import Dict, Union, List
-
-
import ebooklib
from ebooklib import epub
from ebooklib.epub import Link, Section
-from bs4 import BeautifulSoup, Tag
-
+from os import path
+from pathlib import Path
+from itertools import chain
+from premailer import transform
+from collections import defaultdict
+from typing import Dict, Union, List
+from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger
+from src.epub_converter.css_processor import CSSPreprocessor
+from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
-from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
-from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
-from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
- prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
+from src.epub_converter.image_processing import update_images_src_links
+from src.epub_converter.footnotes_processing import preprocess_footnotes
+from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
class EpubConverter:
- def __init__(self, file_path, access=None, logger=None):
- self.file_path = file_path
+ def __init__(self, book_path, access=None, logger=None, css_processor=None, html_processor=None):
+ self.book_path = book_path
self.access = access
self.logger: BookLogger = logger
- self.ebooklib_book = epub.read_epub(file_path)
+ self.ebooklib_book = epub.read_epub(book_path)
+ self.css_processor = css_processor
+ self.html_processor = html_processor
# main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
# enumerate all subchapter id for each file
- self.html_href2subchapter_ids = defaultdict(list)
+ self.html_href2subchapters_ids = defaultdict(list)
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
@@ -57,55 +58,51 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
- self.logger.log('Image processing.')
+ self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
- self.logger.log('HTML files reading.')
+ self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
- # TODO Presets
- self.logger.log('Process CSS inline styles.')
- self.process_inline_styles_in_html_soup()
- self.logger.log('CSS files processing.')
+ self.logger.log("CSS inline style processing.")
+ self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
+ self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
- self.logger.log('CSS styles adding.')
+ self.logger.log("CSS styles fusion(inline+file).")
self.add_css_styles_to_html_soup()
- self.logger.log('Footnotes processing.')
+ self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
- content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href],
- self.html_href2html_body_soup)
- self.footnotes_contents.extend(content)
- self.noterefs.extend(noterefs)
- self.footnotes.extend(footnotes_tags)
+ self.footnotes_contents, self.noterefs, self.footnotes =\
+ preprocess_footnotes(
+ self.html_href2html_body_soup[href], self.html_href2html_body_soup)
+ self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
- for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)):
- noteref.attrs['data-id'] = i + 1
- noteref.attrs['id'] = f'footnote-{i + 1}'
- footnote.attrs['href'] = f'#footnote-{i + 1}'
-
- self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
- self.logger.log('TOC processing.')
+ self.logger.log("TOC processing.")
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed
if self.is_toc_empty():
self.build_adjacency_list_from_spine()
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
- self.logger.log(f'Html documents not added to TOC: {not_added}.')
+ self.logger.log(f"Html documents not added to TOC: {not_added}.")
+ self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added)
- self.logger.log(f'Html internal links and structure processing.')
- self.label_chapters_ids_with_tmp_id()
- # used only after parsed toc, ids from toc needed
- self.process_html_soup_structure_to_line()
+ self.logger.log(f"Label subchapters with converter tag.")
+ self.label_subchapters_with_lc_tag()
+ self.logger.log(f"Process html internal links.")
self.process_internal_links()
- self.logger.log(f'Building chapters content.')
- self.define_chapters_content()
+ self.logger.log(
+ f"Check if converter-chapter-marks are on the same level.")
+ self.chapter_marks_are_same_level()
+ self.logger.log(f"Define chapters content.")
+ self.define_chapters_with_content()
+ self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
@@ -115,38 +112,10 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content()
# html.parser closes tags if needed
- soup = BeautifulSoup(html_body_text, features='html.parser')
+ soup = BeautifulSoup(html_body_text, features="html.parser")
nodes[item.file_name] = soup
return nodes
- def get_css_content(self, css_href, html_href):
- path_to_css_from_html = css_href
- html_folder = dirname(html_href)
- path_to_css_from_root = normpath(
- join(html_folder, path_to_css_from_html)).replace('\\', '/')
- css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
- # if in css file we import another css
- if "@import" in str(css_obj.content):
- path_to_css_from_root = "css/" + \
- re.search('"(.*)"', str(css_obj.content)).group(1)
- css_obj = self.ebooklib_book.get_item_with_href(
- path_to_css_from_root)
- assert css_obj, f'Css style {css_href} was not in manifest.'
- css_content: str = css_obj.get_content().decode()
- return css_content
-
- def process_inline_styles_in_html_soup(self):
- """This function is designed to convert inline html styles"""
- for html_href in self.html_href2html_body_soup:
- html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
- tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
- attrs={'style': re.compile('.*')})
-
- for tag_initial_inline_style in tags_with_inline_style:
- inline_style = tag_initial_inline_style.attrs['style']
- tag_initial_inline_style.attrs['style'] = \
- build_inline_style_content(inline_style)
-
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
Function is designed to get 2 dictionaries:
@@ -167,39 +136,81 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
html_href = item.file_name
- soup_html_content = BeautifulSoup(html_content, features='lxml')
+ soup_html_content = BeautifulSoup(html_content, features="lxml")
# check if file links to css file
- for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
+ for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language)
- if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
+ if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]):
continue
- css_href = tag.attrs.get('href')
+ css_href = tag.attrs.get("href")
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
- css_href2css_content[css_href] = build_css_file_content(
- self.get_css_content(css_href, html_href))
+ css_href2css_content[css_href] = self.css_processor.build_css_file_content(
+ self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
- for i, tag in enumerate(soup_html_content.find_all('style')):
+ for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string
- html_href2css_href[html_href].append(f'href{i}')
- css_href2css_content[f'href{i}'] = build_css_file_content(
+ html_href2css_href[html_href].append(f"href{i}")
+ css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
+ @staticmethod
+ def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
+ """
+ Function adds styles from .css to inline style.
+ Parameters
+ ----------
+ html_soup: BeautifulSoup
+ html page with inline style
+ css_text: str
+ css content from css file
+ Returns
+ -------
+ inline_soup: BeautifulSoup
+ soup with styles from css
+
+ """
+ # remove this specification because it causes problems
+ css_text = css_text.replace(
+ '@namespace epub "http://www.idpf.org/2007/ops";', '')
+ # here we add css styles to inline style
+ html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
+ remove_classes=False,
+ external_styles=False,
+ allow_network=False,
+ disable_validation=True,
+ )
+ # soup with converted styles from css
+ inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
+
+ tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
+ attrs={"style": re.compile(".*")})
+
+ # go through the tags with inline style + style parsed from css file
+ for tag_inline_style in tags_with_inline_style:
+ style_converter = TagInlineStyleProcessor(tag_inline_style)
+ style_converter.convert_initial_tag()
+ return inline_soup
+
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
- add to html_inline_style css_style_content
-
+ Returns
+ -------
+ None
+ updated soups with styles from css
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
- css = ''
+ css = ""
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
- html_content = convert_html_soup_with_css_style(html_content, css)
+ html_content = self.modify_html_soup_with_css_styles(
+ html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -226,7 +237,7 @@ class EpubConverter:
nav_point = NavPoint(element)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
- self.html_href2subchapter_ids[nav_point.href].append(
+ self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id)
self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href)
@@ -238,12 +249,12 @@ class EpubConverter:
nav_point = NavPoint(first)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
- self.html_href2subchapter_ids[nav_point.href].append(
+ self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id)
sub_nodes = []
for elem in second:
- if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
+ if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1:
self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl))
else:
@@ -267,7 +278,7 @@ class EpubConverter:
self.adjacency_list[-1] = nodes
else:
- assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
+ assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}"
def is_toc_empty(self) -> bool:
"""Function checks is toc empty"""
@@ -276,14 +287,14 @@ class EpubConverter:
return True
return False
- def build_manifest_id2html_href(self) -> dict:
- links = dict()
- for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
- links[item.id] = item.file_name
- return links
-
def build_adjacency_list_from_spine(self):
- manifest_id2html_href = self.build_manifest_id2html_href()
+ def build_manifest_id2html_href() -> dict:
+ links = dict()
+ for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+ links[item.id] = item.file_name
+ return links
+
+ manifest_id2html_href = build_manifest_id2html_href()
self.adjacency_list = {
-1: []
}
@@ -293,42 +304,49 @@ class EpubConverter:
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(nav_point.href)
- def add_not_added_files_to_adjacency_list(self, not_added):
+ def add_not_added_files_to_adjacency_list(self, not_added: list):
"""Function add files that not added to adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
- Section(f'To check #{i}, filename: {file}', file))
+ Section(f"To check #{i}, filename: {file}", file))
self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(file)
- def label_chapters_ids_with_tmp_id(self):
+ def label_subchapters_with_lc_tag(self):
for html_href in self.html_href2html_body_soup:
- ids = self.html_href2subchapter_ids[html_href]
+ ids, soup = self.html_href2subchapters_ids[html_href], \
+ self.html_href2html_body_soup[html_href]
for i in ids:
- soup = self.html_href2html_body_soup[html_href]
tag = soup.find(id=i)
- new_h = soup.new_tag('tmp')
- new_h.attrs['class'] = 'converter-chapter-mark'
- new_h.attrs['id'] = i
- tag.insert_before(new_h)
+ tmp_tag = soup.new_tag("lc_tmp")
+ tmp_tag.attrs["class"] = "converter-chapter-mark"
+ tmp_tag.attrs["id"] = i
+ tag.insert_before(tmp_tag)
- def process_html_soup_structure_to_line(self):
- # go to line structure
+ def chapter_marks_are_same_level(self):
+ """
+ Function checks that marks for pointing a start of a chapter are placed on one level in html tree.
+ Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
+ This tag must have a chapter_tag as a parent.
+ Otherwise, it is wrapped with some tags. Like:
+
+
+ """
for html_href in self.html_href2html_body_soup:
- soup = self.html_href2html_body_soup[html_href]
- self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
+ chapter_tag = self.html_href2html_body_soup[html_href]
+ # check marks for chapter starting are on the same level - 1st
+ marks = chapter_tag.find_all(
+ attrs={"class": "converter-chapter-mark"})
+
+ # fix marks to be on 1 level
+ for mark in marks:
+ while mark.parent != chapter_tag:
+ # todo warning! could reflect on formatting/internal links in some cases
+ mark.parent.unwrap()
@staticmethod
def create_unique_id(href, id_):
- return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_)
-
- @staticmethod
- def create_new_anchor_span(soup, id_):
- new_anchor_span = soup.new_tag("span")
- new_anchor_span.attrs['id'] = id_
- new_anchor_span.attrs['class'] = 'link-anchor'
- new_anchor_span.string = "\xa0"
- return new_anchor_span
+ return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
"""
@@ -351,23 +369,31 @@ class EpubConverter:
prepared content
"""
- dir_name = os.path.dirname(cur_file_path)
- normed_path = os.path.normpath(os.path.join(
- dir_name, href_in_link)).replace('\\', '/')
+ dir_name = path.dirname(cur_file_path)
+ normed_path = path.normpath(path.join(
+ dir_name, href_in_link)).replace("\\", "/")
full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path:
- self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
- f'While processing href in {internal_link_tag}.')
- internal_link_tag.attrs['converter-mark'] = 'bad-link'
+ self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
+ f"While processing href in {internal_link_tag}.")
+ internal_link_tag.attrs["converter-mark"] = "bad-link"
return None
if len(full_path) > 1:
- self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
- f' while {internal_link_tag} processing. The first one will be chosen.')
+ self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}"
+ f" while {internal_link_tag} processing. The first one will be chosen.")
return full_path[0]
+ @staticmethod
+ def create_new_anchor_span(soup, id_):
+ new_anchor_span = soup.new_tag("span")
+ new_anchor_span.attrs["id"] = id_
+ new_anchor_span.attrs["class"] = "link-anchor"
+ new_anchor_span.string = "\xa0"
+ return new_anchor_span
+
def process_internal_links(self):
"""
Function
@@ -376,8 +402,8 @@ class EpubConverter:
Steps
----------
1. rebuild ids to be unique in all documents
- 2a. process anchor which is a whole xhtml file
- 2b. process anchor which is an element in xhtml file
+ 2a. process anchor which is a whole htm|html|xhtml file
+ 2b. process anchor which is an element in htm|html|xhtml file
Returns
-------
@@ -385,99 +411,128 @@ class EpubConverter:
process links in html
"""
- # 1. rebuild ids to be unique in all documents
- for toc_href in self.hrefs_added_to_toc:
- for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
- if tag.attrs.get('class') == 'converter-chapter-mark':
- continue
+ def make_ids_unique():
+ for toc_href in self.hrefs_added_to_toc:
+ for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
+ if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
+ new_id = self.create_unique_id(toc_href, tag.attrs["id"])
+ tag.attrs["id"] = new_id
- if tag.attrs.get('class') == 'footnote-element':
- continue
+ def process_file_anchor():
+ for toc_href in self.hrefs_added_to_toc:
+ soup = self.html_href2html_body_soup[toc_href]
+ for internal_link_tag in soup.find_all("a",
+ {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
+ a_tag_href = internal_link_tag.attrs["href"]
+ a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
+ toc_href, a_tag_href, internal_link_tag)
+ if a_tag_href_matched_to_toc:
+ new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
+ internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
+ if new_id not in self.internal_anchors:
+ anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
+ new_anchor_span = self.create_new_anchor_span(soup, new_id)
+ # insert a new span to the beginning of the file
+ anchor_soup.insert(0, new_anchor_span)
+ self.internal_anchors.add(new_id)
+ del internal_link_tag.attrs["href"]
- new_id = self.create_unique_id(toc_href, tag.attrs['id'])
- tag.attrs['id'] = new_id
+ def process_file_element_anchor():
+ for toc_href in self.hrefs_added_to_toc:
+ soup = self.html_href2html_body_soup[toc_href]
+ # process_file_element_anchor
+ for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
+ a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
+ a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
+ toc_href, a_tag_href, internal_link_tag) if a_tag_href \
+ else path.normpath(toc_href).replace("\\", "/")
+ if a_tag_href_matched_to_toc:
+ new_id = self.create_unique_id(
+ a_tag_href_matched_to_toc, a_tag_id)
- # 2a. process anchor which is a whole xhtml file
- internal_link_reg1 = re.compile(
- r'(^(?!https?://).+\.(htm|html|xhtml)$)')
- for toc_href in self.hrefs_added_to_toc:
- soup = self.html_href2html_body_soup[toc_href]
- for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
- a_tag_href = internal_link_tag.attrs['href']
- # find full path
- a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
- toc_href, a_tag_href, internal_link_tag)
- if not a_tag_href_matched_to_toc:
- continue
- new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
- internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
- if new_id not in self.internal_anchors:
- anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
- new_anchor_span = self.create_new_anchor_span(soup, new_id)
- # insert a new span to the beginning of the file
- anchor_soup.insert(0, new_anchor_span)
- self.internal_anchors.add(new_id)
+ anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
+ anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
+ anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
+ if anchor_tags:
+ if len(anchor_tags) > 1:
+ self.logger.log(f"Warning in {toc_href}: multiple anchors:"
+ f"{len(anchor_tags)} found.\n"
+ f"{anchor_tags}\n"
+ f"While processing {internal_link_tag}")
- del internal_link_tag.attrs['href']
+ anchor_tag = anchor_tags[0]
+ assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
+ # if anchor is found we could add placeholder for link creation on server side.
+ internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
+ # create span to have cyclic links, link has 1 type of class, anchor another
+ if anchor_tag.attrs["id"] not in self.internal_anchors:
+ new_anchor_span = self.create_new_anchor_span(
+ soup, new_id)
+ anchor_tag.insert_before(new_anchor_span)
+ self.internal_anchors.add(new_id)
+ del anchor_tag.attrs["id"]
+ del internal_link_tag.attrs["href"]
+ else:
+ internal_link_tag.attrs["converter-mark"] = "bad-link"
+ self.logger.log(f"Error in {toc_href}."
+ f" While processing {internal_link_tag} no anchor found."
+ f" Should be anchor with new id={new_id} in"
+ f" {a_tag_href_matched_to_toc} file."
+ f" Old id={a_tag_id}")
+ # 1. make ids to be unique in all documents
+ make_ids_unique()
+ # 2a. process anchor which is a whole htm|html|xhtml file
+ process_file_anchor()
+ # 2b. process anchor which is an element in htm|html|xhtml file
+ process_file_element_anchor()
- # 2b. process anchor which is an element in xhtml file
- internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
- for toc_href in self.hrefs_added_to_toc:
- soup = self.html_href2html_body_soup[toc_href]
- for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
- a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
- '#')
- # find full path
- if a_tag_href:
- a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
- internal_link_tag)
- else:
- a_tag_href_matched_to_toc = os.path.normpath(
- toc_href).replace('\\', '/')
+ @staticmethod
+ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
+ """
+ Get tags between LiveCarta chapter marks
+ Parameters
+ ----------
+ first_id: str
+ Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
+ href: str
+ Name of current chapters file
+ html_soup: Tag
+ Soup object of current file
- if not a_tag_href_matched_to_toc:
- continue
+ Returns
+ -------
+ tags: list [Tag, NavigableString]
+ Chapter's tags
- new_id = self.create_unique_id(
- a_tag_href_matched_to_toc, a_tag_id)
+ """
+ marked_tags = html_soup.find(
+ attrs={"id": first_id, "class": "converter-chapter-mark"})
+ if marked_tags:
+ next_tag = marked_tags.next_sibling
+ tags = []
+ while next_tag:
+ if not isinstance(next_tag, NavigableString) and \
+ (next_tag.attrs.get("class") == "converter-chapter-mark"):
+ break
+ tags.append(next_tag)
+ next_tag = next_tag.next_sibling
- anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
- anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
- anchor_tags = anchor_tags or anchor_soup.find_all(
- attrs={'id': a_tag_id}) # if link is a footnote
+ # remove tags between first_id and next found id
+ # save them in list for next steps
+ tags = [tag.extract() for tag in tags]
+ html_soup.smooth()
+ else:
+ assert 0, f"Warning: no match for {first_id, href}"
- if anchor_tags:
- if len(anchor_tags) > 1:
- self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
- f'{anchor_tags}\n'
- f' While processing {internal_link_tag}')
+ return tags
- anchor_tag = anchor_tags[0]
- assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
- # if anchor is found we could add placeholder for link creation on server side.
- internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
- # create span to have cyclic links, link has 1 type of class, anchor another
- if anchor_tag.attrs['id'] not in self.internal_anchors:
- new_anchor_span = self.create_new_anchor_span(
- soup, new_id)
- anchor_tag.insert_before(new_anchor_span)
- self.internal_anchors.add(new_id)
- del anchor_tag.attrs['id']
- del internal_link_tag.attrs['href']
-
- else:
- internal_link_tag.attrs['converter-mark'] = 'bad-link'
- self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
- f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
- f' Old id={a_tag_id}')
-
- def build_one_chapter(self, nav_point: NavPoint):
+ def detect_one_chapter(self, nav_point: NavPoint):
"""
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases:
id wraps all chapter content,
- id wraps chapter's content + subchapters' content
+ id wraps chapter"s content + subchapters" content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
@@ -494,68 +549,82 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
- chapter_tags = get_tags_between_chapter_marks(
+ subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
- new_tree = BeautifulSoup('', 'html.parser')
- for tag in chapter_tags:
- new_tree.append(tag)
+ new_tree = BeautifulSoup("", "html.parser")
+ for subchapter_tag in subchapter_tags:
+ new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
- self.build_one_chapter(sub_node)
+ self.detect_one_chapter(sub_node)
- def define_chapters_content(self):
+ def define_chapters_with_content(self):
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
- for point in top_level_nav_points:
- self.build_one_chapter(point)
+ for tl_nav_point in top_level_nav_points:
+ self.detect_one_chapter(tl_nav_point)
- def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
+ def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
+ """
+ Function prepare style, tags to json structure
+ Parameters
+ ----------
+ nav_point: NavPoint
+
+ lvl: int
+ level of chapter
+
+ Returns
+ -------
+ ChapterItem
+ built chapter
+
+ """
title = nav_point.title
- if nav_point.id:
- content: BeautifulSoup = self.href_chapter_id2soup_html[(
- nav_point.href, nav_point.id)]
- else:
- content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
- self.book_image_src_path2aws_path = update_images_src_links(content,
+ content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
+ if nav_point.id else self.html_href2html_body_soup[nav_point.href]
+
+ indent = " " * lvl
+ self.logger.log(indent + f"Chapter: {title} is processing.")
+ is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
+ self.logger.log(indent + "Process title.")
+ title_preprocessed = self.html_processor.prepare_title(title)
+ self.logger.log(indent + "Process content.")
+ content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
+ remove_title_from_chapter=is_chapter)
+
+ self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
- book_id=self.file_path.stem
- if hasattr(self.file_path, 'stem') else 'book_id')
-
- is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
- title_preprocessed = prepare_title(title)
- content_preprocessed = prepare_content(title_preprocessed, content,
- remove_title_from_chapter=is_chapter)
+ book_id=Path(self.book_path).stem)
sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter
+ # if it doesn't have subchapters
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
- sub_chapter_item = self.node_to_livecarta_chapter_item(
+ sub_chapter_item = self.html_node_to_livecarta_chapter_item(
sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item)
-
- if self.logger:
- indent = ' ' * lvl
- self.logger.log(f'{indent}Chapter: {title} is prepared.')
- return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
+ return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
def convert_to_dict(self) -> dict:
"""Function which convert list of html nodes to appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
- for nav_point in top_level_nav_points:
- chapter = self.node_to_livecarta_chapter_item(nav_point)
+ # loop through to level chapters
+ for tl_nav_point in top_level_nav_points:
+ chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point)
top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
- self.logger.log(f'Anchors found: {len(self.internal_anchors)}.')
- self.logger.log('End conversion.')
+ self.logger.log(f"Anchors found: {len(self.internal_anchors)}.")
+ self.logger.log("End conversion.")
return {
"content": top_level_dict_chapters,
@@ -564,12 +633,16 @@ class EpubConverter:
if __name__ == "__main__":
- epub_file_path = '../../epub/9781614382264.epub'
+ epub_file_path = "../../books/epub/9780763774134.epub"
logger_object = BookLogger(
- name='epub', book_id=epub_file_path.split('/')[-1])
+ name="epub", book_id=epub_file_path.split("/")[-1])
- json_converter = EpubConverter(epub_file_path, logger=logger_object)
+ css_processor = CSSPreprocessor()
+ html_processor = HtmlEpubPreprocessor(logger=logger_object)
+
+ json_converter = EpubConverter(epub_file_path, logger=logger_object,
+ css_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
- with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json:
+ with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
json.dump(content_dict, f_json, ensure_ascii=False)
diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py
index cb6e080..9131eda 100644
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -1,4 +1,6 @@
from src.book_solver import BookSolver
+from src.epub_converter.css_processor import CSSPreprocessor
+from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter
@@ -7,15 +9,17 @@ class EpubBook(BookSolver):
def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger)
- self.book_type = 'epub'
+ self.book_type = "epub"
def get_converted_book(self):
"""
Function
Steps
----------
- 1. Converts .epub to .html
- 2. Parses from line structure to nested structure
+ 1. Gets data from preset structure
+ 2. Add preset to html preprocessor
+ 3. Converts .epub to .html
+ 4. Parses from line structure to nested structure
Returns
----------
@@ -23,7 +27,10 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
+ css_processor = CSSPreprocessor()
+ html_processor = HtmlEpubPreprocessor(self.preset_path, logger=self.logger_object)
json_converter = EpubConverter(
- self.file_path, access=self.access, logger=self.logger_object)
+ self.book_path, access=self.access, logger=self.logger_object,
+ css_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
return content_dict
diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py
new file mode 100644
index 0000000..34cd1fb
--- /dev/null
+++ b/src/epub_converter/footnotes_processing.py
@@ -0,0 +1,91 @@
+import re
+from typing import Tuple
+from bs4 import BeautifulSoup, Tag
+
+
+def _replace_with_livecarta_anchor_tag(anchor, i):
+ """Function replace noteref_tag(anchor) with new livecarta tag"""
+ new_tag = BeautifulSoup(features="lxml").new_tag("sup")
+ new_tag["class"] = "footnote-element"
+ new_tag["data-id"] = i + 1
+ new_tag["id"] = f"footnote-{i + 1}"
+ new_tag.string = "*"
+ if anchor.parent.name == "sup":
+ anchor.parent.unwrap()
+ anchor.replace_with(new_tag)
+ return new_tag
+
+
+def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \
+ -> Tuple[list, list, list]:
+ """
+ This function preprocessing footnotes
+ This function should be earlier that adding fonts in pipeline.
+
+ Here is an example footnote1
+
+
+ """
+ footnotes, new_noterefs_tags, new_footnotes_tags = [], [], []
+ noterefs_tags = source_html_tag.find_all(
+ attrs={noteref_attr_name: "noteref"})
+ bad_noterefs_tags = set(
+ [tag for tag in noterefs_tags if not tag.attrs.get("href")])
+ noterefs_tags = [
+ tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
+ [tag.decompose() for tag in bad_noterefs_tags]
+
+ def parse_a_tag_href(s: str) -> Tuple[str, str]:
+ """Returns name of file & id of an anchor"""
+ assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id."
+ f, id_ = s.split("#")
+ return f, id_
+
+ def verify_footnote_tag(tags: list):
+ """Function verifies is tag - footnote"""
+ assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}"
+ if len(tags) == 0:
+ anchored_tags = list(target_html_tag.find_all(id=element_id))
+ if len(anchored_tags):
+ print(
+ f"Warning. Href for tag is detected as footnote:\n{noteref_tag}")
+ return anchored_tags
+ else:
+ assert 0, f"Error, No element with id: {href} found."
+ return tags
+
+ for i, noteref_tag in enumerate(noterefs_tags):
+ href = noteref_tag.attrs["href"]
+ file, element_id = parse_a_tag_href(href)
+ if not file:
+ target_html_tag = source_html_tag
+ else:
+ target_html_tag = href2soup_html.get(file)
+ if not target_html_tag:
+ print(
+ f"Error while footnotes processing. For {noteref_tag} invalid path: {file}.")
+ continue
+
+ possible_footnote = "note|footnote|endnote|rearenote"
+ expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
+ attrs={"epub:type": re.compile(possible_footnote)}))
+
+ expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
+ footnote_tag = expected_footnote_tags[0]
+ if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "docs-endnote":
+ footnote_tag = footnote_tag.parent
+ new_noterefs_tags.append(
+ _replace_with_livecarta_anchor_tag(noteref_tag, i))
+ content = footnote_tag.text
+ # footnote_tag.decompose()
+ footnotes.append(content)
+ footnote_tag = footnote_tag.find(
+ attrs={"role": "docs-backlink"}) or footnote_tag
+ new_footnotes_tags.append(footnote_tag)
+
+ for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):
+ noteref.attrs["data-id"] = i + 1
+ noteref.attrs["id"] = f"footnote-{i + 1}"
+ footnote.attrs["href"] = f"#footnote-{i + 1}"
+
+ return footnotes, new_noterefs_tags, new_footnotes_tags
diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py
deleted file mode 100644
index d94c43a..0000000
--- a/src/epub_converter/html_epub_preprocessor.py
+++ /dev/null
@@ -1,666 +0,0 @@
-import os
-import re
-import pathlib
-from typing import Tuple
-
-from bs4 import BeautifulSoup, NavigableString, Tag, Comment
-
-from src.access import Access
-from src.livecarta_config import LiveCartaConfig
-
-
-def _replace_with_livecarta_anchor_tag(anchor, i):
- """Function replace noteref_tag(anchor) with new livecarta tag"""
- new_tag = BeautifulSoup(features='lxml').new_tag('sup')
- new_tag['class'] = 'footnote-element'
- new_tag['data-id'] = i + 1
- new_tag['id'] = f'footnote-{i + 1}'
- new_tag.string = '*'
- if anchor.parent.name == 'sup':
- anchor.parent.unwrap()
- anchor.replace_with(new_tag)
- return new_tag
-
-
-def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
- -> Tuple[list, list, list]:
- """
- This function preprocessing footnotes
- This function should be earlier that adding fonts in pipeline.
-
- Here is an example footnote1
-
-
- """
- footnotes = []
- noterefs_tags = source_html_tag.find_all(
- attrs={noteref_attr_name: 'noteref'})
- bad_noterefs_tags = set(
- [tag for tag in noterefs_tags if not tag.attrs.get('href')])
- noterefs_tags = [
- tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
- new_noterefs_tags = []
- new_footnotes_tags = []
- [tag.decompose() for tag in bad_noterefs_tags]
-
- def parse_a_tag_href(s: str) -> Tuple[str, str]:
- """Returns name of file & id of an anchor"""
- assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
- f, id_ = s.split('#')
- return f, id_
-
- def verify_footnote_tag(tags: list):
- """Function verifies is tag - footnote"""
- assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
- if len(tags) == 0:
- anchored_tags = list(target_html_tag.find_all(id=element_id))
- if len(anchored_tags):
- print(
- f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
- return anchored_tags
- else:
- assert 0, f'Error, No element with id: {href} found.'
-
- return tags
-
- for i, noteref_tag in enumerate(noterefs_tags):
- href = noteref_tag.attrs['href']
- file, element_id = parse_a_tag_href(href)
- if not file:
- target_html_tag = source_html_tag
- else:
- target_html_tag = href2soup_html.get(file)
- if not target_html_tag:
- print(
- f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
- continue
-
- possible_footnote = 'note|footnote|endnote|rearenote'
- expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
- attrs={'epub:type': re.compile(possible_footnote)}))
-
- expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
- footnote_tag = expected_footnote_tags[0]
- if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
- footnote_tag = footnote_tag.parent
- new_noterefs_tags.append(
- _replace_with_livecarta_anchor_tag(noteref_tag, i))
- content = footnote_tag.text
- # footnote_tag.decompose()
- footnotes.append(content)
- footnote_tag = footnote_tag.find(
- attrs={'role': 'doc-backlink'}) or footnote_tag
- new_footnotes_tags.append(footnote_tag)
-
- return footnotes, new_noterefs_tags, new_footnotes_tags
-
-
-def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
- """
- Main function that works with structure of html. Make changes inplace.
- Parameters
- ----------
- body_tag: Tag, soup object
-
- Steps
- ----------
- 1. Extracts tags that are not needed
- 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
- Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
- This tag must have a body_tag as a parent.
- Otherwise, it is wrapped with some tags. Like:
-
- 3. Headings that are not supported by livecarta converts to
- 4. Wrapping NavigableString
-
- Returns
- -------
- body_tag: Tag, BeautifulSoup
- adjusted body_tag
-
- """
- def _preserve_class_in_aside_tag(tag_):
- """to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
- # this is for Wiley books with boxes
- tag_class = tag_.attrs['class'] if not isinstance(
- tag_.attrs['class'], list) else tag_.attrs['class'][0]
- if tag_.parent.name == 'aside':
- if not tag_.parent.attrs.get('class'):
- tag_.parent.attrs['class'] = tag_class
-
- def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
- """
- Function saves css style inherited from class, copies class to child
- returns True, if could be unwrapped
- Parameters
- ----------
- tag_: Tag, soup object
-
- Returns
- -------
- bool
-
- """
- # this is for Wiley books with boxes
- tag_class = tag_.attrs['class'] if not isinstance(
- tag_.attrs['class'], list) else tag_.attrs['class'][0]
- if 'feature' not in tag_class:
- return True
- child_p_tags = tag_.find_all("p")
- if len(child_p_tags) == 1:
- child_p_tag = child_p_tags[0]
- if not child_p_tag.attrs.get('class'):
- child_p_tag.attrs['class'] = tag_class
- return True
-
- elif len(child_p_tags) > 1:
- tag_.name = 'p'
- return False
- else:
- return True
-
- def _add_span_to_save_ids_for_links(tag_to_be_removed):
- if tag_to_be_removed.attrs.get('id'):
- _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
- id_=tag_to_be_removed.attrs['id'],
- class_=tag_to_be_removed.attrs.get('class'))
-
- def _replace_div_tag_with_table():
- """
- Function replace with
:
- 1. Convert div with certain classes to tables
- 2. Add background color to div with background-color
-
- """
- for div in body_tag.find_all("div"):
- if div.attrs.get('class'):
- div_class = div.attrs['class'] if not isinstance(
- div.attrs['class'], list) else div.attrs['class'][0]
- if div_class in ['C409', 'C409a']:
- _wrap_block_tag_with_table(
- body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
-
- elif div_class in ['C441', 'C816']:
- _wrap_block_tag_with_table(
- body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
-
- if div.attrs.get('style'):
- if 'background-color' in div.attrs['style']:
- end_index = div.attrs['style'].find(
- 'background-color') + len('background-color')
- start_index_of_color = end_index + 2
- bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
- _wrap_block_tag_with_table(
- body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
- elif div.attrs.get('style') == '':
- del div.attrs['style']
-
- structural_tags_names = [
- 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
- 'figure', 'footer', 'iframe', 'span', 'p'
- ]
-
- if div.contents:
- is_not_struct_tag = [
- child.name not in structural_tags_names for child in div.contents]
- if all(is_not_struct_tag):
- div.name = 'p'
- continue
- _add_span_to_save_ids_for_links(div)
- div.unwrap()
-
- def _heading_tag_to_p_tag(body_tag):
- """Function to convert all lower level headings to p tags"""
- pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
- header_tags = body_tag.find_all(re.compile(pattern))
- for tag in header_tags:
- tag.name = 'p'
-
- # comments removal
- for tag in body_tag.find_all():
- for element in tag(text=lambda text: isinstance(text, Comment)):
- element.extract()
-
- _replace_div_tag_with_table()
-
- for s in body_tag.find_all("section"):
- could_be_unwrapped = True
- if s.attrs.get('class'):
- could_be_unwrapped = _preserve_class_in_section_tag(s)
- _add_span_to_save_ids_for_links(s)
- if could_be_unwrapped:
- s.unwrap()
-
- for s in body_tag.find_all("article"):
- _add_span_to_save_ids_for_links(s)
- s.unwrap()
-
- for s in body_tag.find_all("figure"):
- s.name = 'p'
- # to center image inside this tag
- s.attrs['style'] = "text-align: center;"
-
- for s in body_tag.find_all("figcaption"):
- _add_span_to_save_ids_for_links(s)
- s.unwrap()
-
- for s in body_tag.find_all("aside"):
- s.name = 'blockquote'
-
- for s in body_tag.find_all("main"):
- _add_span_to_save_ids_for_links(s)
- s.unwrap()
-
- for s in body_tag.find_all("body"):
- _add_span_to_save_ids_for_links(s)
- s.unwrap()
-
- for s in body_tag.find_all("html"):
- _add_span_to_save_ids_for_links(s)
- s.unwrap()
-
- for s in body_tag.find_all("header"):
- s.name = 'span'
-
- # check marks for chapter starting are on the same 1 level
- marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
- parents_marks_are_body = [x.parent == body_tag for x in marks]
-
- # fix marks to be on 1 level
- if not all(parents_marks_are_body):
- for x in marks:
- while x.parent != body_tag:
- x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
-
- parents_marks_are_body = [x.parent == body_tag for x in marks]
- assert all(
- parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
-
- _heading_tag_to_p_tag(body_tag)
-
- # wrap NavigableString with
- for node in body_tag:
- if isinstance(node, NavigableString):
- content = str(node)
- content = re.sub(r'([\n\t\xa0])', ' ', content)
- content = content.strip()
- if content:
- tag = body_tag.new_tag('p')
- tag.append(str(node))
- node.replace_with(tag)
- return body_tag
-
-
-def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
- """After processing on a first_id that corresponds to current chapter,
- from initial html_soup all tags from current chapter are extracted
- Parameters
- ----------
- first_id:
- Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
- href:
- Name of current chapter's file
- html_soup: Tag
- Soup object of current file
-
- Returns
- -------
- tags: list [Tag, NavigableString]
- Chapter's tags
-
- """
- marked_tags = html_soup.find(
- attrs={'id': first_id, 'class': 'converter-chapter-mark'})
- if marked_tags:
- next_tag = marked_tags.next_sibling
- tags = []
- while next_tag:
- if not isinstance(next_tag, NavigableString) and\
- (next_tag.attrs.get('class') == 'converter-chapter-mark'):
- break
- tags.append(next_tag)
- next_tag = next_tag.next_sibling
-
- # remove tags between first_id and next found id
- # save them in list for next steps
- tags = [tag.extract() for tag in tags]
- html_soup.smooth()
-
- else:
- assert 0, f'Warning: no match for {first_id, href}'
-
- return tags
-
-
-def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
- """Function saves all images to Amazon web service"""
- link_path = access.send_image(
- img_file_path, doc_id=book_id, img_content=img_content)
- return link_path
-
-
-def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
- """Function saves all images locally"""
- folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- new_path = pathlib.Path(os.path.join(
- folder_path, f'../json/img_{book_id}/'))
- new_path.mkdir(exist_ok=True)
-
- new_img_path = new_path / os.path.basename(img_file_path)
- f = open(new_img_path, 'wb+')
- f.write(img_content)
- f.close()
-
- return new_img_path
-
-
-def update_images_src_links(body_tag: BeautifulSoup,
- href2img_content: dict,
- path_to_html: str,
- access=None,
- path2aws_path: dict = None,
- book_id: str = None) -> dict:
- """Function makes dictionary image_src_path -> Amazon web service_path"""
- img_tags = body_tag.find_all('img')
-
- for img in img_tags:
- path_to_img_from_html = img.attrs.get('src')
- html_folder = os.path.dirname(path_to_html)
- path_to_img_from_root = os.path.normpath(os.path.join(
- html_folder, path_to_img_from_html)).replace('\\', '/')
-
- assert path_to_img_from_root in href2img_content, \
- f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
-
- img_content = href2img_content[path_to_img_from_root]
- if access is not None:
- if path_to_img_from_root in path2aws_path:
- new_folder = path2aws_path[path_to_img_from_root]
- else:
- new_folder = save_image_to_aws(
- access, path_to_img_from_root, img_content, book_id)
- path2aws_path[path_to_img_from_root] = new_folder
- else:
- new_folder = save_image_locally(
- path_to_img_from_root, img_content, 'book_id')
-
- img.attrs['src'] = str(new_folder)
- if img.attrs.get('width'):
- del img.attrs['width']
- if img.attrs.get('height'):
- del img.attrs['height']
- if img.attrs.get('style'):
- del img.attrs['style']
- return path2aws_path
-
-
-def _clean_title_from_numbering(title: str):
- """Function removes numbering from titles"""
- title = re.sub(r'^(\s+)+', '', title)
- # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
- # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
- # title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
- return title
-
-
-def prepare_title(title_of_chapter: str) -> str:
- """Function finalise processing/cleaning title"""
- title_str = BeautifulSoup(title_of_chapter, features='lxml').string
- title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
- title_str = re.sub(r' +', ' ', title_str).rstrip()
- title_str = _clean_title_from_numbering(title_str)
- return title_str
-
-
-def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
- """Function inserts span before tag aren't supported by livecarta"""
- new_tag = main_tag.new_tag("span")
- new_tag.attrs['id'] = id_ or ''
- new_tag.attrs['class'] = class_ or ''
- new_tag.string = "\xa0"
- tag.insert_before(new_tag)
-
-
-def _clean_headings_content(content: BeautifulSoup, title: str):
- def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
- if tag_to_be_removed.attrs.get('id'):
- _insert_span_with_attrs_before_tag(body_tag,
- tag_to_be_removed,
- id_=tag_to_be_removed.attrs.get(
- 'id'),
- class_=tag_to_be_removed.attrs.get('class'))
-
- for sub_tag in tag_to_be_removed.find_all():
- if sub_tag.attrs.get('id'):
- _insert_span_with_attrs_before_tag(body_tag,
- tag_to_be_removed,
- id_=sub_tag.attrs['id'],
- class_=sub_tag.attrs.get('class'))
-
- title = title.lower()
- for child in content.contents:
- if isinstance(child, NavigableString):
- text = child
- else:
- text = child.text
- if text and re.sub(r'([\n\t\xa0])', '', text):
- text = re.sub(r'([\n\t\xa0])', ' ', text)
- text = re.sub(r' +', ' ', text).strip()
- text = text.lower()
- if title == text:
- add_span_to_save_ids_for_links(child, content)
- child.extract()
- elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
- add_span_to_save_ids_for_links(child, content)
- child.extract()
- break
-
-
-def _process_lists(body_tag: BeautifulSoup):
- """
- Function
- - process tags
.
- - unwrap tags.
- Parameters
- ----------
- body_tag: Tag, soup object
-
- Returns
- -------
- None
-
- """
- li_tags = body_tag.find_all("li")
- for li_tag in li_tags:
- if li_tag.p:
- li_tag.attrs.update(li_tag.p.attrs)
- li_tag.p.unwrap()
-
-
-def _preprocess_table(body_tag: BeautifulSoup):
- """Function to preprocess tables and tags(td|th|tr): style"""
- tables = body_tag.find_all("table")
- for table in tables:
- t_tags = table.find_all(re.compile("td|th|tr"))
- for t_tag in t_tags:
- style = t_tag.get('style')
- width = ''
- if style:
- width_match = re.search(
- r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
- if width_match:
- size = width_match.group(1)
- width = size + 'px'
-
- t_tag.attrs['width'] = t_tag.get('width') or width
-
- if t_tag.attrs.get('style'):
- t_tag.attrs['style'] = t_tag.attrs['style'].replace(
- 'border:0;', '')
-
- elif t_tag.attrs.get('style') == '':
- del t_tag.attrs['style']
-
- if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
- table.attrs['border'] = '1'
-
-
-def _preprocess_code_tags(chapter_tag: BeautifulSoup):
- """
- Function
- - transform , , tags into span
- - add code style to this tags
- Parameters
- ----------
- chapter_tag: Tag, soup object
-
- Returns
- -------
- None
-
- """
- for code in chapter_tag.find_all(re.compile("code|kbd|var")):
- if not code.parent.name == "pre":
- code.name = "span"
- continue
- # if tag isn't in pre and doesn't have style
- if not code.attrs.get('style'):
- code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
-
-
-def _prepare_formatted(text: str) -> str:
- """Function replaces special symbols with their Unicode representation"""
- text = text.replace("<", "\x3C")
- text = text.replace(">", "\x3E")
- text = text.replace('\t', "\xa0 \xa0 ") #
- text = text.replace(' ', "\xa0")
- text = text.replace('𝑓', "\xf0\x9d\x91\x93")
- return text
-
-
-def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
- """
- Function preprocessing tags
- Wrap string of the tag with if it's necessary
- Parameters
- ----------
- chapter_tag: Tag, soup object
-
- Returns
- ----------
- None
- Modified chapter tag
-
- """
- for pre in chapter_tag.find_all("pre"):
- if pre.find_all("code|kbd|var"):
- continue
- else:
- code = chapter_tag.new_tag("code")
- # insert all items that was in pre to code and remove from pre
- for content in reversed(pre.contents):
- code.insert(0, content.extract())
- # wrap code with items
- pre.append(code)
-
-
-def _clean_wiley_block(block):
- hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
- for hr in hrs:
- hr.extract()
- h = block.find(re.compile("h[1-9]"))
- if h:
- h.name = "p"
- h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
-
-
-def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
- """Function wraps with """
- table = main_tag.new_tag("table")
- table.attrs['border'] = border
- table.attrs['align'] = 'center'
- table.attrs['style'] = f'width:{width}%;'
- tbody = main_tag.new_tag("tbody")
- tr = main_tag.new_tag("tr")
- td = main_tag.new_tag("td")
- # td.attrs['border-radius'] = '8px'
- if bg_color:
- td.attrs['bgcolor'] = bg_color
- old_tag.wrap(td)
- td.wrap(tr)
- tr.wrap(tbody)
- tbody.wrap(table)
- table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
- return table
-
-
-def _preprocess_block_tags(chapter_tag: Tag):
- """Function preprocessing tags"""
- for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
- _clean_wiley_block(block)
- color = '#DDDDDD' if block.attrs.get(
- 'class') == 'feature1' else None
- color = '#EEEEEE' if block.attrs.get(
- 'class') == 'feature2' else color
- _wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
- block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
- block.unwrap()
-
- for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
- _clean_wiley_block(future_block)
- color = '#DDDDDD' if future_block.attrs.get(
- 'class') == 'feature1' else None
- color = '#EEEEEE' if future_block.attrs.get(
- 'class') == 'feature2' else color
- _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
-
-
-def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
- """
- Function finalise processing/cleaning content
- Parameters
- ----------
- title_str: str
-
- content_tag: Tag, soup object
-
- remove_title_from_chapter: bool
-
- Steps
- ----------
- 1. find \n
- 2. heading removal
- 3. processing tags
- 4. class removal
-
- Returns
- -------
- content_tag: str
- prepared content
-
- """
- # 1. find \n
- to_remove = []
- for child in content_tag.contents:
- if isinstance(child, NavigableString):
- s = re.sub(r'([\n\t])', '', child.string)
- if s == '':
- to_remove.append(child)
-
- # 2. heading removal
- if remove_title_from_chapter:
- _clean_headings_content(content_tag, title_str)
-
- # 3. processing tags (, , , , )
- _process_lists(content_tag)
- _preprocess_table(content_tag)
- _preprocess_code_tags(content_tag)
- _preprocess_pre_tags(content_tag)
- _preprocess_block_tags(content_tag)
-
- # 4. class removal
- for tag in content_tag.find_all(recursive=True):
- if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
- 'footnote-element']):
- del tag.attrs['class']
- return str(content_tag)
diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py
new file mode 100644
index 0000000..da2a6c0
--- /dev/null
+++ b/src/epub_converter/html_epub_processor.py
@@ -0,0 +1,426 @@
+import re
+import json
+from bs4 import BeautifulSoup, NavigableString, Comment, Tag
+
+from src.util.helpers import BookLogger
+
+
+class HtmlEpubPreprocessor:
+ def __init__(self, preset_path="../../presets/presets.json", logger=None):
+ self.preset = json.load(open(preset_path))
+ self.logger: BookLogger = logger
+ self.name2function = {
+ "table_wrapper": self._wrap_tags_with_table,
+ "replacer": self._tags_to_correspond_livecarta_tag,
+ "attr_replacer": self._replace_attrs_in_tags,
+ "unwrapper": self._unwrap_tags,
+ "inserter": self._insert_tags_into_correspond_tags
+ }
+
+ @staticmethod
+ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
+ """
+ Function adds span with id from tag_to_be_removed
+ because this tag will be removed(unwrapped/extract)
+ Parameters
+ ----------
+ tag_to_be_removed: Soup object
+ chapter_tag: BeautifulSoup
+
+ Returns
+ -------
+ None
+ updated body tag
+
+ """
+
+ def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
+ class_: list):
+ """Function inserts span before tag aren't supported by LiveCarta"""
+ new_tag = chapter_tag.new_tag("span")
+ new_tag.attrs["id"] = id_ or ""
+ new_tag.attrs["class"] = class_ or ""
+ new_tag.string = "\xa0"
+ tag_to_be_removed.insert_before(new_tag)
+
+ if tag_to_be_removed.attrs.get("id"):
+ _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
+ id_=tag_to_be_removed.attrs["id"],
+ class_=tag_to_be_removed.attrs.get("class"))
+
+ @staticmethod
+ def prepare_title(title_of_chapter: str) -> str:
+ """
+ Function finalise processing/cleaning title
+ Parameters
+ ----------
+ title_of_chapter: str
+
+ Returns
+ -------
+ title: str
+ cleaned title
+
+ """
+ title = BeautifulSoup(title_of_chapter, features="lxml").string
+ # clean extra whitespace characters ([\r\n\t\f\v ])
+ title = re.sub(r"[\s\xa0]", " ", title).strip()
+ return title
+
+ @staticmethod
+ def _remove_comments(chapter_tag: BeautifulSoup):
+ """
+ Function remove comments
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag without comments
+
+ """
+ for tag in chapter_tag.find_all():
+ for element in tag(text=lambda text: isinstance(text, Comment)):
+ element.extract()
+
+ @staticmethod
+ def _wrap_strings_with_p(chapter_tag: BeautifulSoup):
+ """
+ Function converts headings that aren't supported by LiveCarta with
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with wrapped NavigableStrings
+
+ """
+ for node in chapter_tag:
+ if isinstance(node, NavigableString):
+ content = str(node)
+ content = re.sub(r"([\s\xa0])", " ", content).strip()
+ if content:
+ p_tag = chapter_tag.new_tag("p")
+ p_tag.append(str(node))
+ node.replace_with(p_tag)
+
+ def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
+ """
+ Function wraps with
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with wrapped certain tags with
+
+ """
+
+ def _wrap_tag_with_table(width="100", border="", bg_color=None):
+ table = chapter_tag.new_tag("table")
+ table.attrs["border"], table.attrs["align"], table.attrs["style"] \
+ = border, "center", f"width:{width}%;"
+ tbody, tr, td = \
+ chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
+ td.attrs["bgcolor"] = bg_color
+ tag_to_wrap.wrap(td)
+ td.wrap(tr)
+ tr.wrap(tbody)
+ tbody.wrap(table)
+ table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
+ return table
+
+ def process_tag_using_table():
+ _wrap_tag_with_table(
+ width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
+ border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
+ bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
+ self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
+ tag_to_wrap.unwrap()
+
+ for rule in rules:
+ tags = rule["tags"]
+ for attr in rule["attrs"]:
+ for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
+ {attr["name"]: re.compile(fr"{attr['value']}")}):
+ process_tag_using_table()
+
+ @staticmethod
+ def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
+ """
+ Function to replace all tags to correspond LiveCarta tags
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with all tags replaced with LiveCarta tags
+
+ """
+ for rule in rules:
+ tags = rule["tags"]
+ tag_to_replace = rule["tag_to_replace"]
+ if rule["condition"]:
+ for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
+ if condition_on_tag[0] == 'parent_tags':
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+ if tag.parent.select(condition_on_tag[1]):
+ tag.name = tag_to_replace
+ elif condition_on_tag[0] == 'child_tags':
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+ if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
+ tag.name = tag_to_replace
+ elif condition_on_tag[0] == "attrs":
+ for attr in rule["condition"]["attrs"]:
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+ {attr["name"]: re.compile(fr"{attr['value']}")}):
+ tag.name = tag_to_replace
+ else:
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+ # todo can cause appearance of \n ...
-> \n
...
\n
(section)
+ tag.name = tag_to_replace
+
+ @staticmethod
+ def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
+ """
+ Function to replace all tags to correspond LiveCarta tags
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with all tags replaced with LiveCarta tags
+
+ """
+ for rule in rules:
+ attr = rule["attr"]
+ tags = rule["condition"]["tags"]
+ attr_to_replace = rule["attr_to_replace"]
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+ {attr: re.compile(r".*")}):
+ tag[attr_to_replace] = tag[attr]
+ del tag[attr]
+
+ def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
+ """
+ Function unwrap tags and moves id to span
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with unwrapped certain tags
+
+ """
+ for tag_name in rules["tags"]:
+ for tag in chapter_tag.select(tag_name):
+ # if tag is a subtag
+ if ">" in tag_name:
+ tag.parent.attrs.update(tag.attrs)
+ self._add_span_to_save_ids_for_links(tag, chapter_tag)
+ tag.unwrap()
+
+ @staticmethod
+ def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
+ """
+ Function inserts tags into correspond tags
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with inserted tags
+
+ """
+ def insert(tag):
+ tag_to_insert = \
+ chapter_tag.new_tag(rule["tag_to_insert"])
+ # insert all items that was in tag to subtag and remove from tag
+ for content in reversed(tag.contents):
+ tag_to_insert.insert(0, content.extract())
+ # wrap subtag with items
+ tag.append(tag_to_insert)
+
+ for rule in rules:
+ tags = rule["tags"]
+ if rule["condition"]:
+ for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
+ if condition_on_tag[0] == 'parent_tags':
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+ if tag.parent.select(condition_on_tag[1]):
+ insert(tag)
+ elif condition_on_tag[0] == 'child_tags':
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+ if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
+ insert(tag)
+ elif condition_on_tag[0] == "attrs":
+ for attr in rule["condition"]["attrs"]:
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
+ {attr["name"]: re.compile(fr"{attr['value']}")}):
+ insert(tag)
+ else:
+ for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
+ insert(tag)
+
+ def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
+ """
+ Function
+ - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
+ - adds span with id in order to
+ Parameters
+ ----------
+ chapter_tag: soup object
+ Tag of the page
+ title_of_chapter: str
+ Chapter title
+
+ Returns
+ -------
+ None
+ clean/remove headings & add span with id
+
+ """
+ title_of_chapter = title_of_chapter.lower()
+ # NOTE(review): removed a leftover no-op debug stub here
+ # (an `if title_of_chapter == "chapter 1": pass` breakpoint hook with no effect).
+ for tag in chapter_tag.contents:
+ text = tag if isinstance(tag, NavigableString) else tag.text
+ if re.sub(r"[\s\xa0]", "", text):
+ text = re.sub(r"[\s\xa0]", " ", text).lower()
+ text = text.strip() # delete extra spaces
+ if not isinstance(tag, NavigableString):
+ if title_of_chapter == text or \
+ (title_of_chapter in text and
+ re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
+ self._add_span_to_save_ids_for_links(tag, chapter_tag)
+ tag.extract()
+ return
+ elif not self._remove_headings_content(tag, title_of_chapter):
+ break
+ else:
+ tag.extract()
+ return
+
+ @staticmethod
+ def _process_tables(chapter_tag: BeautifulSoup):
+ """
+ Function preprocesses tables and tags(td|th|tr)
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag with processed tables
+
+ """
+ tables = chapter_tag.find_all("table")
+ for table in tables:
+ for t_tag in table.find_all(re.compile("td|th|tr")):
+ width = ""
+ if t_tag.get("style"):
+ width_match = re.search(
+ r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
+ if width_match:
+ size = width_match.group(1)
+ width = size + "px"
+
+ t_tag.attrs["width"] = t_tag.get("width") or width
+
+ if t_tag.attrs.get("style"):
+ t_tag.attrs["style"] = t_tag.attrs["style"].replace(
+ "border:0;", "")
+ if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
+ del t_tag.attrs["style"]
+
+ if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
+ table.attrs["border"] = "1"
+
+ @staticmethod
+ def _class_removing(chapter_tag: BeautifulSoup):
+ """
+ Function removes classes that aren't created by converter
+ Parameters
+ ----------
+ chapter_tag: BeautifulSoup
+ Tag & contents of the chapter tag
+
+ Returns
+ -------
+ None
+ Chapter Tag without original classes of the book
+
+ """
+ for tag in chapter_tag.find_all(recursive=True):
+ if tag.attrs.get("class") \
+ and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
+ del tag.attrs["class"]
+
+ def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
+ """
+ Function finalise processing/cleaning content
+ Parameters
+ ----------
+ title_str: str
+
+ content_tag: Tag, soup object
+
+ remove_title_from_chapter: bool
+
+ Steps
+ ----------
+ 1. comments removal
+ 2. wrap NavigableString with tag
+ 3-6. wrap tags with
+ replace tags with correspond LiveCarta tags
+ unwrap tags
+ insert tags into correspond tags
+ 7. heading removal
+ 8. process_tables
+ 9. class removal
+
+ Returns
+ -------
+ content_tag: Tag
+ prepared content
+
+ """
+ # 1. remove comments
+ self._remove_comments(content_tag)
+ # 2.
+ self._wrap_strings_with_p(content_tag)
+ # 3-6.
+ for preset in self.preset:
+ func = self.name2function[preset["preset_name"]]
+ func(content_tag, preset['rules'])
+ # 7.
+ if remove_title_from_chapter:
+ self._remove_headings_content(content_tag, title_str)
+ # 8.
+ self._process_tables(content_tag)
+ # 9. remove classes that weren't created by converter
+ self._class_removing(content_tag)
+ return content_tag
diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py
new file mode 100644
index 0000000..6f35c3a
--- /dev/null
+++ b/src/epub_converter/image_processing.py
@@ -0,0 +1,65 @@
+import os
+import pathlib
+from bs4 import BeautifulSoup
+
+from src.access import Access
+
+
+def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
+    """Function saves a single image to Amazon Web Services"""
+ link_path = access.send_image(
+ img_file_path, doc_id=book_id, img_content=img_content)
+ return link_path
+
+
+def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
+    """Function saves a single image locally"""
+ folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ new_path = pathlib.Path(os.path.join(
+ folder_path, f"../books/json/img_{book_id}/"))
+ new_path.mkdir(exist_ok=True)
+
+ new_img_path = new_path / os.path.basename(img_file_path)
+ f = open(new_img_path, "wb+")
+ f.write(img_content)
+ f.close()
+ return new_img_path
+
+
+def update_images_src_links(body_tag: BeautifulSoup,
+ img_href2img_content: dict,
+ path_to_html: str,
+ access=None,
+ path2aws_path: dict = None,
+ book_id: str = None) -> dict:
+    """Function builds a mapping image_src_path -> Amazon Web Services path"""
+ img_tags = body_tag.find_all("img")
+ for img in img_tags:
+ path_to_img_from_html = img.attrs.get("src")
+ html_folder = os.path.dirname(path_to_html)
+ path_to_img_from_root = os.path.normpath(os.path.join(
+ html_folder, path_to_img_from_html)).replace("\\", "/")
+
+ assert path_to_img_from_root in img_href2img_content, \
+ f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
+
+ img_content = img_href2img_content[path_to_img_from_root]
+ if access is not None:
+ if path_to_img_from_root in path2aws_path:
+ new_folder = path2aws_path[path_to_img_from_root]
+ else:
+ new_folder = save_image_to_aws(
+ access, path_to_img_from_root, img_content, book_id)
+ path2aws_path[path_to_img_from_root] = new_folder
+ else:
+ new_folder = save_image_locally(
+ path_to_img_from_root, img_content, book_id)
+
+ img.attrs["src"] = str(new_folder)
+ if img.attrs.get("width"):
+ del img.attrs["width"]
+ if img.attrs.get("height"):
+ del img.attrs["height"]
+ if img.attrs.get("style"):
+ del img.attrs["style"]
+ return path2aws_path
diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_inline_style_processor.py
similarity index 55%
rename from src/epub_converter/tag_css_style_converter.py
rename to src/epub_converter/tag_inline_style_processor.py
index 37b2672..30d7e50 100644
--- a/src/epub_converter/tag_css_style_converter.py
+++ b/src/epub_converter/tag_inline_style_processor.py
@@ -4,61 +4,62 @@ from typing import List
from logging import CRITICAL
from bs4 import BeautifulSoup
-from premailer import transform
from src.livecarta_config import LiveCartaConfig
-from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS
cssutils.log.setLevel(CRITICAL)
-class TagStyleConverter:
+class TagInlineStyleProcessor:
def __init__(self, tag_inline_style):
# tag with inline style + style parsed from css file
self.tag_inline_style = tag_inline_style
- self.style = self.process_inline_style()
+ self.tag_inline_style.attrs['style'] = self.process_inline_style()
@staticmethod
def remove_white_if_no_bgcolor(style_, tag):
"""Function remove text white color if there is no bg color"""
- if 'background' in style_:
+ if "background" in style_:
style_ = style_.replace(
- 'background:', 'background-color:')
+ "background:", "background-color:")
return style_
# if text color is white, check that we have bg-color
- if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_):
+ if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_):
# if bg color is inherited, just return style as is
for parent_tag in tag.parents:
- # white bg color not need to be checked as we do not write 'white bg color'
- tag_with_bg = ['span', 'td', 'tr', 'p']
+ # white bg color not need to be checked as we do not write "white bg color"
+ tag_with_bg = ["span", "td", "tr", "p"]
tag_will_be_saved = parent_tag.name in tag_with_bg
- has_bg = parent_tag.attrs.get('style') and (
- 'background' in parent_tag.attrs.get('style'))
+ has_bg = parent_tag.attrs.get("style") and (
+ "background" in parent_tag.attrs.get("style"))
if has_bg and tag_will_be_saved:
return style_
children = tag.find_all()
for child in children:
- if child.attrs.get('style') and ('background' in child.attrs.get('style')):
- tmp_style = child.attrs['style'] + '; color:#fff; '
- child.attrs['style'] = tmp_style
+ if child.attrs.get("style") and ("background" in child.attrs.get("style")):
+ tmp_style = child.attrs["style"] + "; color:#fff; "
+ child.attrs["style"] = tmp_style
- # for child with bg color we added white text color, so this tag don't need white color
- style_ = style_.replace('color:#fff;', '')
- style_ = style_.replace('color:#ffffff;', '')
- style_ = style_.replace('color:white;', '')
+        # for child with bg color we added white text color, so this tag doesn't need white color
+ style_ = style_.replace("color:#fff;", "")
+ style_ = style_.replace("color:#ffffff;", "")
+ style_ = style_.replace("color:white;", "")
return style_
- @staticmethod
- def duplicate_styles_check(split_style: list) -> list:
- style_name2style_value = {}
- for list_item in split_style:
- key, val = list_item.split(":")
- if val not in style_name2style_value.keys():
- style_name2style_value[key] = val
- split_style = [k + ":" + v for k, v in style_name2style_value.items()]
- return split_style
+ # @staticmethod
+ # def duplicate_styles_check(split_style: list) -> list:
+ # style_name2style_value = {}
+    #     # {key: val for list_item in split_style}
+ # splitstrs = (list_item.split(":") for list_item in split_style)
+ # d = {key: val for key, val in splitstrs}
+ # for list_item in split_style:
+ # key, val = list_item.split(":")
+ # if key not in style_name2style_value.keys():
+ # style_name2style_value[key] = val
+ # split_style = [k + ":" + v for k, v in style_name2style_value.items()]
+ # return split_style
@staticmethod
def indents_processing(split_style: list) -> str:
@@ -68,7 +69,7 @@ class TagStyleConverter:
Parameters
----------
split_style: list
- list of styles split by ';'
+ list of styles split by ";"
Returns
----------
@@ -76,12 +77,12 @@ class TagStyleConverter:
processed style with counted indent
"""
- processed_style = ";".join(split_style)
+ processed_style = ";".join(split_style)+';'
margin_left_regexp = re.compile(
- r'((margin-left|margin): *(-*\w+);*)')
+ r"((margin-left|margin): *(-*\w+);*)")
text_indent_regexp = re.compile(
- r'(text-indent: *(-*\w+);*)')
+ r"(text-indent: *(-*\w+);*)")
has_margin = re.search(margin_left_regexp, processed_style)
has_text_indent = re.search(text_indent_regexp, processed_style)
@@ -92,21 +93,21 @@ class TagStyleConverter:
if has_text_indent:
num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2))))))
- processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' +
- str(abs(num_m - num_ti)) + 'px; ')
+ processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
+ str(abs(num_m - num_ti)) + "px; ")
processed_style = processed_style.replace(
- has_margin.group(1), '')
+ has_margin.group(1), "")
return processed_style
- processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' +
- str(abs(num_m)) + 'px; ')
+ processed_style = processed_style.replace(has_margin.group(1), "text-indent: " +
+ str(abs(num_m)) + "px; ")
return processed_style
elif has_text_indent:
- processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' +
+ processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2)))))))
- + 'px; ')
+ + "px; ")
return processed_style
return processed_style
@@ -126,23 +127,20 @@ class TagStyleConverter:
processed inline style
"""
- inline_style = self.tag_inline_style.attrs.get('style') + ';'
- # 1. Remove white color if tag doesn't have background color in style
+ inline_style = self.tag_inline_style.attrs.get("style") + ";"
+        # 1. Remove white color if tag doesn't have background color in style
inline_style = self.remove_white_if_no_bgcolor(
inline_style, self.tag_inline_style)
inline_style = inline_style.replace(
- 'list-style-image', 'list-style-type')
-
+ "list-style-image", "list-style-type")
# 2. Create list of styles from inline style
- # replace all spaces between '; & letter' to ';'
+ # replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", inline_style)
- # when we split style by ';', last element of the list is '' - None (remove it)
- split_inline_style: list = list(filter(None, style.split(';')))
-
+ # when we split style by ";", last element of the list is "" - None (remove it)
+ split_inline_style: list = list(filter(None, style.split(";")))
# 3. Duplicate styles check - if the tag had duplicate styles
- split_inline_style = self.duplicate_styles_check(split_inline_style)
-
- # 4. Processing indents#
+ # split_inline_style = self.duplicate_styles_check(split_inline_style)
+ # 4. Processing indents
inline_style: str = self.indents_processing(split_inline_style)
return inline_style
@@ -164,19 +162,19 @@ class TagStyleConverter:
"""
styles_to_remove = []
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
- if f'{k[0]}:{k[1]}' in style:
+ if f"{k[0]}:{k[1]}" in style:
styles_to_remove.append(k)
return styles_to_remove
def change_attrs_with_corresponding_tags(self):
# adds , , instead of styles
- styles_to_remove = self.check_style_to_be_tag(self.style)
+ styles_to_remove = self.check_style_to_be_tag(self.tag_inline_style.attrs['style'])
for i, (attr, value) in enumerate(styles_to_remove):
- self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\
- .replace(f'{attr}:{value};', '').strip()
+ self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
+ .replace(f"{attr}:{value};", "").strip()
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)]
- correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name)
+ correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
for content in reversed(self.tag_inline_style.contents):
correspond_tag.insert(0, content.extract())
self.tag_inline_style.append(correspond_tag)
@@ -184,75 +182,37 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag):
"""Function designed to save style attrs that cannot be in tag.name -> span"""
- dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG))
- if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'):
+ dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
+ if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
styles_can_be_in_tag = [style
- for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items()
+ for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items()
if re.match(tag, initial_tag.name)
for style in styles]
- styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS
+ styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag]
- span_style = initial_tag.attrs['style']
+ span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same.
- # Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
- styles_to_be_saved_in_span = [((attr + ':') in span_style) & (
- '-' + attr not in span_style) for attr in styles_cant_be_in_tag]
+ # Not "align" when we have "text-align", or "border" when we have "border-top"
+ styles_to_be_saved_in_span = [((attr + ":") in span_style) & (
+ "-" + attr not in span_style) for attr in styles_cant_be_in_tag]
if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in -> wrap them in span
- tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}')
- style = ''
- possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag]
+ tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
+ style = ""
+ possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag]
for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search(
possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1)
span_style = span_style.replace(
- has_style_attrs.group(1), '')
- tag.attrs['style'] = style
- initial_tag.name = 'span'
- initial_tag.attrs['style'] = span_style
+ has_style_attrs.group(1), "")
+ tag.attrs["style"] = style
+ initial_tag.name = "span"
+ initial_tag.attrs["style"] = span_style
initial_tag.wrap(tag)
def convert_initial_tag(self):
self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style
-
-
-def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
- """
- Function adds styles from .css to inline style.
- Parameters
- ----------
- html_soup: BeautifulSoup
- html page with inline style
- css_text: str
- css content from css file
- Returns
- -------
- inline_soup: BeautifulSoup
- soup with styles from css
-
- """
- # remove this specification because it causes problems
- css_text = css_text.replace(
- '@namespace epub "http://www.idpf.org/2007/ops";', '')
- # here we add css styles to inline style
- html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
- remove_classes=False,
- external_styles=False,
- allow_network=False,
- disable_validation=True,
- )
- # soup with converted styles from css
- inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
-
- tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
- attrs={'style': re.compile('.*')})
-
- # go through the tags with inline style + style parsed from css file
- for tag_inline_style in tags_with_inline_style:
- style_converter = TagStyleConverter(tag_inline_style)
- style_converter.convert_initial_tag()
- return inline_soup
diff --git a/src/livecarta_config.py b/src/livecarta_config.py
index e3e63d4..9ae2d40 100644
--- a/src/livecarta_config.py
+++ b/src/livecarta_config.py
@@ -9,12 +9,12 @@ class LiveCartaConfig:
HEADERS_LEVELS = {"h1", "h2", "h3",
"h4", "h5", "h6", "h7", "h8", "h9"}
- DEFAULT_ALIGN_STYLE = 'left'
+ DEFAULT_ALIGN_STYLE = "left"
- ALIGN_STYLES = ['justify', 'right', 'center', 'left']
+ ALIGN_STYLES = ["justify", "right", "center", "left"]
# Main constant values
- DEFAULT_FONT_NAME = 'Times New Roman'
+ DEFAULT_FONT_NAME = "Times New Roman"
WORD_DEFAULT_FONT_SIZE = 11
@@ -23,80 +23,56 @@ class LiveCartaConfig:
FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\
WORD_DEFAULT_FONT_SIZE
- FONT_CORRESPONDANCE_TABLE = {
- "Arial": "arial,helvetica,sans-serif",
- "Comic Sans MS": "comic sans ms,cursive",
- "Courier New": "courier new,courier,monospace",
- "Georgia": "georgia,serif",
- "Lucida Sans Unicode": "lucida sans unicode,lucida grande,sans-serif",
- "Tahoma": "tahoma,geneva,sans-serif",
- "Times New Roman": "times new roman,times,serif",
- "Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
- "Verdana": "verdana,geneva,sans-serif",
- "monospace": "courier new,courier,monospace",
- "sans-serif": "arial,helvetica,sans-serif"
- }
-
COLORS_MAP = {
- '#ffff00': 'yellow',
- '#00ff00': 'darkYellow',
- '#00ffff': 'cyan',
- '#ff00ff': 'magenta',
- '#0000ff': 'blue',
- '#ff0000': 'red',
- '#000080': 'darkBlue',
- '#008080': 'darkCyan',
- '#008000': 'green',
- '#800080': 'darkMagenta',
- '#808000': 'darkGreen',
- '#c0c0c0': 'lightGray',
- '#ffffff': 'white',
- '#800000': '#800000',
- '#808080': '#808080'
+ "#ffff00": "yellow",
+ "#00ff00": "darkYellow",
+ "#00ffff": "cyan",
+ "#ff00ff": "magenta",
+ "#0000ff": "blue",
+ "#ff0000": "red",
+ "#000080": "darkBlue",
+ "#008080": "darkCyan",
+ "#008000": "green",
+ "#800080": "darkMagenta",
+ "#808000": "darkGreen",
+ "#c0c0c0": "lightGray",
+ "#ffffff": "white",
+ "#800000": "#800000",
+ "#808080": "#808080"
}
HTML42LIVECARTA_COLORS = {
- 'yellow': 'yellow',
- 'lime': 'green',
- 'aqua': 'cyan',
- 'fuchsia': 'magenta',
- 'blue': 'blue',
- 'red': 'red',
- 'navy': 'darkBlue',
- 'teal': 'darkCyan',
- 'green': 'darkGreen',
- 'purple': 'darkMagenta',
- 'olive': 'darkYellow',
- 'silver': 'lightGray',
- 'white': 'white',
- 'maroon': 'darkRed', # '#800000',
- 'gray': 'darkGray',
- 'grey': 'darkGray',
+ "yellow": "yellow",
+ "lime": "green",
+ "aqua": "cyan",
+ "fuchsia": "magenta",
+ "blue": "blue",
+ "red": "red",
+ "navy": "darkBlue",
+ "teal": "darkCyan",
+ "green": "darkGreen",
+ "purple": "darkMagenta",
+ "olive": "darkYellow",
+ "silver": "lightGray",
+ "white": "white",
+ "maroon": "darkRed", # "#800000",
+ "gray": "darkGray",
+ "grey": "darkGray",
}
- INDENT = '30px'
+ INDENT = "30px"
- sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
- 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
- 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
- 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
-
- sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px',
- '19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px',
- '30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px',
- '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
-
- list_types = ['circle', 'disc', 'armenian', 'decimal',
- 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
- 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
+ list_types = ["circle", "disc", "armenian", "decimal",
+ "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin",
+ "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
structural_tags_names = [
- 'div', 'section', 'article', 'main', 'body', 'html', 'aside',
- 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p'
+ "div", "section", "article", "main", "body", "html", "aside",
+ "canvas", "data", "figure", "footer", "iframe", "span", "p"
]
could_have_style_in_livecarta_regexp = re.compile(
- '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
+ "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)")
"""
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
@@ -104,23 +80,60 @@ class LiveCartaConfig:
+ "font-style": ["italic"], #
+ "text-decoration": ["underline", "line-through"], # ,
+ "text-decoration-line": ["underline", "line-through"], # ,
+ "vertical-align": ["super"], #
+ "color": [],
+ "background-color": [],
+ "background": [],
+ "width": [],
+ "border": [],
+ "border-top-width": [],
+ "border-right-width": [],
+ "border-left-width": [],
+ "border-bottom-width": [],
+ "border-top": [],
+ "border-bottom": [],
+ "list-style-type": [],
+ "list-style-image": [],
+ "margin-left": [],
+ "margin-top": [],
+ "margin": [],
}
diff --git a/src/util/color_reader.py b/src/util/color_reader.py
index fe44758..82fb451 100644
--- a/src/util/color_reader.py
+++ b/src/util/color_reader.py
@@ -96,13 +96,13 @@ def str2hex(s: str):
if '#' in s and (len(s) <= 7):
return s.lower()
- if ('rgb' in s) and ('%' in s):
+ if ('rgb' in s.lower()) and ('%' in s):
match = re.search(r'rgba*\(((\d+)%, *(\d+)%, *(\d+)%(, \d\.\d+)*)\)', s)
if match:
r, g, b = int(match.group(2)), int(match.group(3)), int(match.group(4))
return rgb_percent_to_hex((r, g, b))
- if 'rgb' in s:
+ if 'rgb' in s.lower():
rgba = re.findall('([0-9] *\.?[0-9]+)', s)
r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2])
if len(rgba) == 4:
@@ -110,7 +110,7 @@ def str2hex(s: str):
r, g, b = rgba2rgb(r, g, b, alpha)
return rgb_to_hex((r, g, b))
- if 'hsl' in s:
+ if 'hsl' in s.lower():
# hsl(hue in {0,360}, saturation [0, 100%], lightness [0, 100%])
match = re.search(r'hsla*\(((\d+), *(\d+)%, *(\d+)%, (\d\.\d+)*)\)', s)
if match: