Merge branch 'kiryl/converter_fix' of https://github.com/Teqniksoft/LiveCarta_add_ons into kiryl/docx_presets

This commit is contained in:
Kiryl
2022-08-03 09:47:40 +03:00
26 changed files with 1899 additions and 1876 deletions

View File

@@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+",
def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict): def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
logger.info(f"Start processing book-{book_id}.") logger.info(f"Start processing book-{book_id}.")
try: try:
json_file_path = "json/9781614382264.json" json_file_path = "books/json/9781614382264.json"
book = book_type(book_id=book_id, main_logger=logger, **params) book = book_type(book_id=book_id, main_logger=logger, **params)
book.conversion_local(json_file_path) book.conversion_local(json_file_path)
except Exception as exc: except Exception as exc:
@@ -77,7 +77,6 @@ def callback(ch, method, properties, body, logger, libre_locker):
thread.start() thread.start()
logging.log(logging.INFO, f"Active threads: {active_count()}.") logging.log(logging.INFO, f"Active threads: {active_count()}.")
# print(f"Active threads: {active_count()}.") # print(f"Active threads: {active_count()}.")
except Exception as exc: except Exception as exc:
if hasattr(exc, "message"): if hasattr(exc, "message"):
logger.error(f"{sys.exc_info()[0]}: {exc.message}") logger.error(f"{sys.exc_info()[0]}: {exc.message}")
@@ -90,15 +89,18 @@ def callback(ch, method, properties, body, logger, libre_locker):
def server_run(): def server_run():
logger = configure_file_logger("consumer") logger = configure_file_logger("consumer")
channel = None
try: try:
folder_path = os.path.dirname(os.path.abspath(__file__)) folder_path = os.path.dirname(os.path.abspath(__file__))
config_path = Path(os.path.join(folder_path, "config/queue_config.json")) config_path = Path(os.path.join(
folder_path, "config/queue_config.json"))
with open(config_path, "r") as f: with open(config_path, "r") as f:
conf_param = json.load(f) conf_param = json.load(f)
host = conf_param.get("host") or pika.ConnectionParameters().DEFAULT_HOST host = conf_param.get(
port = conf_param.get("port") or pika.ConnectionParameters().DEFAULT_PORT "host") or pika.ConnectionParameters().DEFAULT_HOST
channel = None port = conf_param.get(
"port") or pika.ConnectionParameters().DEFAULT_PORT
credentials = pika.PlainCredentials( credentials = pika.PlainCredentials(
username=conf_param["username"], password=conf_param["password"]) username=conf_param["username"], password=conf_param["password"])
parameters = pika.ConnectionParameters( parameters = pika.ConnectionParameters(
@@ -113,7 +115,6 @@ def server_run():
logger.log(logging.ERROR, logger.log(logging.ERROR,
f"Queue {conf_param['queue']} is not declared.") f"Queue {conf_param['queue']} is not declared.")
raise exc raise exc
locker = Event() locker = Event()
locker.set() locker.set()
channel.basic_consume(queue=conf_param["queue"], channel.basic_consume(queue=conf_param["queue"],

113
presets/presets.json Normal file
View File

@@ -0,0 +1,113 @@
[
{
"preset_name": "table_wrapper",
"rules": [
{
"tags": ["div"],
"attrs": [
{
"name": "width",
"value": ".*"
},
{
"name": "border",
"value": ".*"
},
{
"name": "bgcolor",
"value": ".*"
}
]
},
{
"tags": ["section", "blockquote"],
"attrs": [
{
"name": "class",
"value": "feature[1234]"
}
]
}
]
},
{
"preset_name": "replacer",
"rules": [
{
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"],
"condition": null,
"tag_to_replace": "p"
},
{
"tags": ["^aside$"],
"condition": null,
"tag_to_replace": "blockquote"
},
{
"tags": ["^header$", "^footer$"],
"condition": null,
"tag_to_replace": "span"
},
{
"tags": ["^code$", "^kbd$", "^var$"],
"condition": {
"parent_tags": ":not(pre)",
"child_tags": null,
"attrs": null
},
"tag_to_replace": "span"
},
{
"tags": ["^b$"],
"condition": null,
"tag_to_replace": "strong"
},
{
"tags": ["^image$"],
"condition": null,
"tag_to_replace": "img"
}
]
},
{
"preset_name": "attr_replacer",
"rules": [
{
"attr": "xlink:href",
"condition": {
"tags": ["img"]
},
"attr_to_replace": "src"
}
]
},
{
"preset_name": "unwrapper",
"rules": {
"tags": [
"section",
"article",
"figcaption",
"main",
"body",
"html",
"svg",
"li > p"
]
}
},
{
"preset_name": "inserter",
"rules": [
{
"tags": ["pre"],
"condition": {
"parent_tags": null,
"child_tags": ":not(code, kbd, var)",
"attrs": null
},
"tag_to_insert": "code"
}
]
}
]

View File

@@ -8,49 +8,30 @@ from io import BytesIO
class Access: class Access:
"""Class accessing our platform""" """Class accessing our platform"""
def __init__(self, url=None):
PENDING = 1
PROCESS = 2
GENERATE = 3
FINISH = 4
ERROR = 5
url = None
username = None
password = None
token = None
refresh = None
refresh_time = None
headers = None
refreshing = Event()
def __init__(self, url):
""" """
:param url: str, url received from queue message, if field apiURL exists :param url: str, url received from queue message, if field apiURL exists
else None else None
""" """
self.PENDING = 1
self.PROCESS = 2
self.GENERATE = 3
self.FINISH = 4
self.ERROR = 5
self.username = None
self.password = None
self.token = None
self.refresh = None
self.refresh_time = None
self.headers = None
self.refreshing = Event()
self.set_credentials(url) self.set_credentials(url)
self.get_token() self.get_token()
self.refreshing.set() self.refreshing.set()
def sleep(timeout: float, retry=3):
def decorator(function):
"""Decorator sleeping timeout sec and makes 3 retries"""
def wrapper(*args, **kwargs):
retries = 0
while retries < retry:
try:
value = function(*args, **kwargs)
if value is not None:
return value
except:
time.sleep(timeout)
retries += 1
return wrapper
return decorator
def set_credentials(self, url): def set_credentials(self, url):
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
@@ -60,8 +41,8 @@ class Access:
self.refreshing.clear() self.refreshing.clear()
self.url = url self.url = url
self.username = params['username'] self.username = params["username"]
self.password = params['password'] self.password = params["password"]
self.refreshing.set() self.refreshing.set()
def format_header(self): def format_header(self):
@@ -123,14 +104,14 @@ class Access:
else: else:
raise Exception(f'{response.status_code}') raise Exception(f'{response.status_code}')
def get_book(self, book_id): def get_file(self, file_path):
"""Function downloads the book from site""" """Function downloads the file[book, preset] from site"""
if self.is_time_for_refreshing(): if self.is_time_for_refreshing():
self.refresh_token() self.refresh_token()
self.refreshing.wait() self.refreshing.wait()
response = requests.get( response = requests.get(
f'{self.url}/doc-convert/{book_id}/file', headers=self.headers, file_path, headers=self.headers,
# auth=('kiryl.miatselitsa', 'iK4yXCvdyHFEEOvG2v3F') # auth=('kiryl.miatselitsa', 'iK4yXCvdyHFEEOvG2v3F')
) )
@@ -139,11 +120,26 @@ class Access:
elif response.status_code == 200: elif response.status_code == 200:
content = response.content content = response.content
else: else:
raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{book_id}/file, ' raise Exception(f'Error in getting preset from url: {file_path}, '
f'status code:{response.status_code}') f'status code:{response.status_code}')
return content return content
def sleep(timeout: float, retry=3):
def decorator(function):
"""Decorator sleeping timeout sec and makes 3 retries"""
def wrapper(*args, **kwargs):
retries = 0
while retries < retry:
try:
value = function(*args, **kwargs)
if value is not None:
return value
except:
time.sleep(timeout)
retries += 1
return wrapper
return decorator
@sleep(3) @sleep(3)
def send_image(self, img_path, doc_id, img_content: bytes = None): def send_image(self, img_path, doc_id, img_content: bytes = None):
"""Function sends images to site""" """Function sends images to site"""

View File

@@ -24,9 +24,10 @@ class BookSolver:
self.book_type = None self.book_type = None
self.book_id = book_id self.book_id = book_id
self.access = access self.access = access
self.file_path = None # path to book file, appears after downloading from server self.preset_path = None
self.output_path = None # path to json file self.book_path = None # path to book file, appears after downloading from server
self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', self.book_output_path = None # path to json file
self.logger_object = BookLogger(name=f"{__name__}_{self.book_id}",
book_id=book_id, book_id=book_id,
main_logger=main_logger) main_logger=main_logger)
self.status_wrapper = BookStatusWrapper( self.status_wrapper = BookStatusWrapper(
@@ -35,9 +36,9 @@ class BookSolver:
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels." "Length of headers doesn't match allowed levels."
def save_book_file(self, content: bytes): def save_file(self, content: bytes, path_to_save, file_type):
""" """
Function saves binary content of file to .docx/.epub Function saves binary content of file to folder(path_to_save)
Parameters Parameters
---------- ----------
content: bytes str content: bytes str
@@ -47,80 +48,100 @@ class BookSolver:
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join( folder_path = os.path.join(
folder_path, f'{self.book_type}/{self.book_id}') folder_path, path_to_save)
pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
file_path = os.path.join( file_path = os.path.join(
folder_path, f'{self.book_id}.{self.book_type}') folder_path, f"{self.book_id}.{file_type}")
try: try:
with open(file_path, 'wb+') as file: with open(file_path, "wb+") as file:
file.write(content) file.write(content)
self.logger_object.log(f'File was saved to folder: {folder_path}.') self.logger_object.log(
f"File was saved to folder: {folder_path}.")
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
f"Error in writing {self.book_type} file.", logging.ERROR) f"Error in writing {self.book_type} file.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
raise exc raise exc
return file_path
self.file_path = pathlib.Path(file_path) def get_preset_file(self):
"""Method for getting and saving preset from server"""
try:
self.logger_object.log(f"Start receiving preset file from server. URL:"
f" {self.access.url}/doc-convert/{self.book_id}/presets")
content = self.access.get_file(
file_path=f"{self.access.url}/doc-convert/{self.book_id}/presets")
self.logger_object.log("Preset file was received from server.")
self.preset_path = pathlib.Path(
str(self.save_file(content, path_to_save="presets", file_type="json")))
except FileNotFoundError as f_err:
self.logger_object.log(
"Can't get preset file from server.", logging.ERROR)
self.logger_object.log_error_to_main_log()
raise f_err
except Exception as exc:
raise exc
def get_book_file(self): def get_book_file(self):
"""Method for getting and saving book from server""" """Method for getting and saving book from server"""
try: try:
self.logger_object.log(f'Start receiving file from server. URL:' self.logger_object.log(f"Start receiving book file from server. URL:"
f' {self.access.url}/doc-convert/{self.book_id}/file') f" {self.access.url}/doc-convert/{self.book_id}/file")
content = self.access.get_book(self.book_id) content = self.access.get_file(
self.logger_object.log('File was received from server.') file_path=f"{self.access.url}/doc-convert/{self.book_id}/file")
self.save_book_file(content) self.logger_object.log("Book file was received from server.")
self.book_path = pathlib.Path(self.save_file(
content, path_to_save=f"books/{self.book_type}", file_type=self.book_type))
except FileNotFoundError as f_err: except FileNotFoundError as f_err:
self.logger_object.log( self.logger_object.log(
"Can't get file from server.", logging.ERROR) "Can't get book file from server.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
raise f_err raise f_err
except Exception as exc: except Exception as exc:
raise exc raise exc
def check_output_directory(self): def check_output_directory(self):
if self.output_path is None: if self.book_output_path is None:
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
output_path = os.path.join( output_path = os.path.join(
folder_path, f'json/{self.book_id}.json') folder_path, f"books/json/{self.book_id}.json")
self.output_path = output_path self.book_output_path = output_path
self.output_path = pathlib.Path(self.output_path) self.book_output_path = pathlib.Path(self.book_output_path)
self.logger_object.log(f'Output file path: {self.output_path}') self.logger_object.log(f"Output file path: {self.book_output_path}")
pathlib.Path(self.output_path).parent.mkdir( pathlib.Path(self.book_output_path).parent.mkdir(
parents=True, exist_ok=True) parents=True, exist_ok=True)
self.output_path.touch(exist_ok=True) self.book_output_path.touch(exist_ok=True)
def write_to_json(self, content: dict): def write_to_json(self, content: dict):
self.check_output_directory() self.check_output_directory()
try: try:
with codecs.open(self.output_path, 'w', encoding='utf-8') as f: with codecs.open(self.book_output_path, "w", encoding="utf-8") as f:
json.dump(content, f, ensure_ascii=False) json.dump(content, f, ensure_ascii=False)
self.logger_object.log( self.logger_object.log(
f'Data has been saved to .json file: {self.output_path}') f"Data has been saved to .json file: {self.book_output_path}")
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
'Error has occurred while writing .json file.' + str(exc), logging.ERROR) "Error has occurred while writing .json file." + str(exc), logging.ERROR)
def send_json_content_to_server(self, content: dict): def send_json_content_to_server(self, content: dict):
"""Function sends json_content to site""" """Function sends json_content to site"""
try: try:
self.access.send_book(self.book_id, content) self.access.send_book(self.book_id, content)
self.logger_object.log(f'JSON data has been sent to server.') self.logger_object.log(f"JSON data has been sent to server.")
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
'Error has occurred while sending json content.', logging.ERROR) "Error has occurred while sending json content.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error() self.status_wrapper.set_error()
raise exc raise exc
@abstractmethod @abstractmethod
def get_converted_book(self): def get_converted_book(self):
self.logger_object.log('Beginning of processing .json output.') self.logger_object.log("Beginning of processing .json output.")
self.status_wrapper.set_generating() self.status_wrapper.set_generating()
return {} return {}
@@ -132,20 +153,23 @@ class BookSolver:
""" """
try: try:
self.logger_object.log( self.get_preset_file()
f'Beginning of conversion from .{self.book_type} to .json.')
self.get_book_file() self.get_book_file()
self.logger_object.log(
f"Beginning of conversion from .{self.book_type} to .json.")
self.status_wrapper.set_processing() self.status_wrapper.set_processing()
content_dict = self.get_converted_book() content_dict = self.get_converted_book()
[os.remove(path) for path in [self.preset_path, self.book_path]]
self.logger_object.log("Beginning of processing .json output.")
self.status_wrapper.set_generating() self.status_wrapper.set_generating()
self.write_to_json(content_dict) self.write_to_json(content_dict)
self.send_json_content_to_server(content_dict) self.send_json_content_to_server(content_dict)
self.logger_object.log( self.logger_object.log(
f'End of the conversion to LiveCarta format. Check {self.output_path}.') f"End of the conversion to LiveCarta format. Check {self.book_output_path}.")
except Exception as exc: except Exception as exc:
self.status_wrapper.set_error() self.status_wrapper.set_error()
self.logger_object.log( self.logger_object.log(
'Error has occurred while conversion.', logging.ERROR) "Error has occurred while conversion.", logging.ERROR)
self.logger_object.log_error_to_main_log(str(exc)) self.logger_object.log_error_to_main_log(str(exc))
raise exc raise exc
@@ -158,15 +182,16 @@ class BookSolver:
""" """
try: try:
self.logger_object.log( self.logger_object.log(
f'Data has been downloaded from {file_path} file') f"Data has been downloaded from {file_path} file")
self.status_wrapper.set_processing() self.status_wrapper.set_processing()
with codecs.open(file_path, 'r', encoding='utf-8') as f_json: with codecs.open(file_path, "r", encoding="utf-8") as f_json:
content_dict = json.load(f_json) content_dict = json.load(f_json)
self.logger_object.log("Beginning of processing .json output.")
self.status_wrapper.set_generating() self.status_wrapper.set_generating()
self.send_json_content_to_server(content_dict) self.send_json_content_to_server(content_dict)
self.logger_object.log(f'Sent a file to server. Check LiveCarta.') self.logger_object.log(f"Sent a file to server. Check LiveCarta.")
except Exception as exc: except Exception as exc:
self.status_wrapper.set_error() self.status_wrapper.set_error()
self.logger_object.log( self.logger_object.log(
'Error has occurred while reading json file.' + str(exc), logging.ERROR) "Error has occurred while reading json file." + str(exc), logging.ERROR)
self.logger_object.log_error_to_main_log(str(exc)) self.logger_object.log_error_to_main_log(str(exc))

View File

@@ -10,12 +10,12 @@ from src.util.helpers import BookLogger
class Docx2LibreHTML: class Docx2LibreHTML:
def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None): def __init__(self, book_id=0, file_path=None, access=None, logger=None, libre_locker=None):
self.book_id = book_id self.book_id = book_id if book_id != 0 else pathlib.Path(
file_path).stem
self.file_path = file_path self.file_path = file_path
self.access = access self.access = access
self.logger_object: BookLogger = logger self.logger_object: BookLogger = logger
self.status_wrapper: status_wrapper = status_wrapper
# critical section for occupying libreoffice by one thread # critical section for occupying libreoffice by one thread
self.libre_locker: Event() = libre_locker self.libre_locker: Event() = libre_locker
@@ -24,15 +24,15 @@ class Docx2LibreHTML:
self.html_soup = self.read_html(self.html_path) self.html_soup = self.read_html(self.html_path)
def _libre_run(self, out_dir_path): def _libre_run(self, out_dir_path):
command = ['libreoffice', '--headless', command = ["libreoffice", "--headless",
'--convert-to', 'html', f'{str(self.file_path)}', "--convert-to", "html", f"{str(self.file_path)}",
'--outdir', f'{out_dir_path}'] "--outdir", f"{out_dir_path}"]
print(command) print(command)
result = subprocess.run(command, stdout=PIPE, stderr=PIPE) result = subprocess.run(command, stdout=PIPE, stderr=PIPE)
self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:' self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:"
f' {result.returncode}, {result.stdout}', logging.DEBUG) f" {result.returncode}, {result.stdout}", logging.DEBUG)
self.logger_object.log(f'Any error while libre conversion for book_' self.logger_object.log(f"Any error while libre conversion for book_"
f'{self.book_id}: {result.stderr}', logging.DEBUG) f"{self.book_id}: {result.stderr}", logging.DEBUG)
def convert_docx_to_html(self): def convert_docx_to_html(self):
""" """
@@ -48,82 +48,73 @@ class Docx2LibreHTML:
path to html file, file appears after libre-conversion path to html file, file appears after libre-conversion
""" """
self.logger_object.log(f'File - {self.file_path}.') def get_and_clear_flag(out_dir_path: str):
print(f'{self.file_path}') self.libre_locker.clear()
self.logger_object.log('Beginning of conversion from .docx to .html.') self.logger_object.log(f"Got flag!", logging.DEBUG)
self._libre_run(out_dir_path)
self.libre_locker.set()
self.logger_object.log("Cleared flag...", logging.DEBUG)
try: def check_file_exists(path, error_string: str):
f = open(self.file_path) try:
f.close() f = open(path)
except FileNotFoundError as error: f.close()
self.logger_object.log( except FileNotFoundError as error:
'Invalid path to input data.', logging.ERROR) self.logger_object.log(
self.status_wrapper.set_error() error_string, logging.ERROR)
raise error self.logger_object.log_error_to_main_log()
raise error
self.logger_object.log(f"File - {self.file_path}.")
self.logger_object.log("Beginning of conversion from .docx to .html.")
check_file_exists(
self.file_path, error_string="Invalid path to input data.")
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}') out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}")
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
is_book_converted = False
try: try:
if self.libre_locker.isSet(): if self.libre_locker.isSet():
self.libre_locker.clear() get_and_clear_flag(out_dir_path)
self.logger_object.log('Got flag...', logging.DEBUG)
self._libre_run(out_dir_path)
self.libre_locker.set()
self.logger_object.log('Cleared flag...', logging.DEBUG)
else: else:
while not self.libre_locker.isSet() and not is_book_converted: while not self.libre_locker.isSet():
self.logger_object.log( self.logger_object.log(
'Waiting for libre...', logging.DEBUG) "Waiting for libre...", logging.DEBUG)
flag = self.libre_locker.wait(50) flag = self.libre_locker.wait(50)
if flag: if flag:
if self.libre_locker.isSet(): if self.libre_locker.isSet():
self.libre_locker.clear() get_and_clear_flag(out_dir_path)
self.logger_object.log(f'Got flag!', logging.DEBUG)
self._libre_run(out_dir_path)
self.libre_locker.set()
break break
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
"Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR) "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error()
raise exc raise exc
out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html') out_dir_path = os.path.join(out_dir_path, f"{self.book_id}.html")
html_path = pathlib.Path(out_dir_path) html_path = pathlib.Path(out_dir_path)
try: check_file_exists(
f = open(html_path) html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.")
f.close()
except FileNotFoundError as exc:
self.logger_object.log(
"Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR)
self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error()
raise exc
self.logger_object.log('End of conversion from .docx to .html.') self.logger_object.log("End of conversion from .docx to .html.")
self.logger_object.log( self.logger_object.log(
f'Input file path after conversion: {html_path}.') f"Input file path after conversion: {html_path}.")
return html_path return html_path
def read_html(self, html_path): def read_html(self, html_path):
"""Method for reading .html file into beautiful soup tag.""" """Method for reading .html file into beautiful soup tag."""
try: try:
html_text = open(html_path, 'r', encoding='utf8').read() html_text = open(html_path, "r", encoding="utf8").read()
self.logger_object.log('HTML for book has been loaded.') self.logger_object.log("HTML for book has been loaded.")
except FileNotFoundError as exc: except FileNotFoundError as exc:
self.logger_object.log('There is no html to process.' self.logger_object.log("There is no html to process."
'Conversion went wrong or you specified wrong paths.', logging.ERROR) "Conversion went wrong or you specified wrong paths.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error()
raise exc raise exc
html_soup = BeautifulSoup(html_text, features='lxml') html_soup = BeautifulSoup(html_text, features="lxml")
return html_soup return html_soup

View File

@@ -14,7 +14,7 @@ class DocxBook(BookSolver):
def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None): def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)
self.book_type = 'docx' self.book_type = "docx"
# critical section for occupying libreoffice by one thread # critical section for occupying libreoffice by one thread
self.libre_locker: Event() = libre_locker self.libre_locker: Event() = libre_locker
@@ -34,9 +34,9 @@ class DocxBook(BookSolver):
""" """
# 1. Converts docx to html with LibreOffice # 1. Converts docx to html with LibreOffice
html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access,
self.logger_object, self.status_wrapper, self.libre_locker) self.logger_object, self.libre_locker)
# TODO presets # todo presets
# 2. Parses and cleans html, gets list of tags, gets footnotes # 2. Parses and cleans html, gets list of tags, gets footnotes
parser = HTMLDocxPreprocessor( parser = HTMLDocxPreprocessor(
@@ -46,26 +46,29 @@ class DocxBook(BookSolver):
# 3. Parses from line structure to nested structure with JSONConverter # 3. Parses from line structure to nested structure with JSONConverter
json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers, json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
self.logger_object, self.status_wrapper) self.logger_object)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
return content_dict return content_dict
if __name__ == "__main__": if __name__ == "__main__":
docx_file_path = '../../docx/music_inquiry.docx' docx_file_path = "../../books/docx/music_inquiry.docx"
logger_object = BookLogger( logger_object = BookLogger(
name='docx', book_id=docx_file_path.split('/')[-1]) name="docx", book_id=docx_file_path.split("/")[-1])
locker = Event()
locker.set()
html_converter = Docx2LibreHTML(file_path=docx_file_path) html_converter = Docx2LibreHTML(file_path=docx_file_path,
logger=logger_object, libre_locker=locker)
parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
content, footnotes, top_level_headers = parser.process_html( content, footnotes, top_level_headers = parser.process_html(
html_converter.html_path) html_path=html_converter.html_path, book_id=html_converter.book_id)
json_converter = LibreHTML2JSONConverter( json_converter = LibreHTML2JSONConverter(
content, footnotes, top_level_headers, logger_object) content, footnotes, top_level_headers, logger_object)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f: with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f:
json.dump(content_dict, f, ensure_ascii=False) json.dump(content_dict, f, ensure_ascii=False)

View File

@@ -0,0 +1,73 @@
import re
from bs4 import BeautifulSoup, NavigableString
def _clean_footnote_content(content):
content = content.strip()
return content.strip()
def process_footnotes(body_tag):
    """Function returns list of footnotes and delete them from html_soup.

    Replaces each footnote anchor (``<a class="sdfootnoteanc">``) with a
    ``<sup class="footnote-element">`` marker, collects each footnote body
    (``<div id="sdfootnoteN">``) as an HTML string, and removes the footnote
    divs from the soup (``body_tag`` is mutated in place).

    Parameters
    ----------
    body_tag : bs4.Tag
        The <body> tag of the libre-converted document.

    Returns
    -------
    list of str
        Cleaned footnote HTML contents, in document order.

    Raises
    ------
    AssertionError
        If anchors and content divs do not pair up one-to-one, or an
        anchor's name does not match its back-link target.
    """
    footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc")
    footnote_content = body_tag.find_all(
        "div", id=re.compile(r"^sdfootnote\d+$"))
    footnote_amt = len(footnote_anchors)
    # Libre conversion must emit exactly one content div per anchor;
    # zip() below relies on this pairing.
    assert footnote_amt == len(footnote_content), \
        "Something went wrong with footnotes after libre conversion"
    footnotes = []
    for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
        # First back-link inside the content div points at the anchor.
        true_a_tag = cont_tag.find_all(
            "a", class_=re.compile(r"^sdfootnote.+$"))[0]
        # Back-link without an href is a dead reference: drop it and skip
        # this footnote entirely (nothing is appended for it).
        if true_a_tag.attrs.get("href") is None:
            cont_tag.a.decompose()
            continue
        # The anchor's name must equal the back-link's href fragment
        # (href minus the leading "#") — otherwise the pairing is broken.
        assert anc_tag["name"] == true_a_tag["href"][1:], \
            "Something went wrong with footnotes after libre conversion"
        # Replace the anchor with a platform-style <sup> marker;
        # data-id/id are 1-based.
        new_tag = BeautifulSoup(features="lxml").new_tag("sup")
        new_tag["class"] = "footnote-element"
        new_tag["data-id"] = i + 1
        new_tag["id"] = f"footnote-{i + 1}"
        new_tag.string = "*"
        anc_tag.replace_with(new_tag)
        # extra digits in footnotes from documents downloaded from livecarta:
        # a <sup> in the first <p> whose text equals the back-link text is
        # a duplicated footnote number — remove it.
        a_text = true_a_tag.text
        if len(cont_tag.find_all("p")):
            sup = cont_tag.find_all("p")[0].find("sup")
            if sup and sup.text == a_text:
                sup.decompose()
        # Remove the back-link symbol anchors from the footnote body.
        for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}):
            tag_a.decompose()
        # Strip px font-size declarations; drop the style attribute when
        # nothing else remains in it.
        for span in cont_tag.find_all("span", {"style": re.compile("font-size")}):
            style = span.get("style")
            style = re.sub(r"font-size: \d+px", "", style)
            if style == "":
                del span.attrs["style"]
            else:
                span.attrs["style"] = style
        # Serialize the footnote body: bare text nodes are skipped,
        # <blockquote> children are kept whole, other tags contribute
        # only their inner HTML.
        unicode_string = ""
        for child in cont_tag.children:
            if type(child) is NavigableString:
                continue
            if child.name == "blockquote":
                unicode_string += str(child)
            else:
                unicode_string += child.decode_contents()
        content = _clean_footnote_content(unicode_string)
        # Remove the processed footnote div from the soup.
        cont_tag.decompose()
        footnotes.append(content)
    return footnotes

View File

@@ -1,14 +1,13 @@
import os
import re import re
import logging import logging
import pathlib
from typing import List from typing import List
from shutil import copyfile
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger, BookStatusWrapper from src.util.helpers import BookLogger, BookStatusWrapper
from src.docx_converter.footnotes_processing import process_footnotes
from src.docx_converter.image_processing import process_images
class HTMLDocxPreprocessor: class HTMLDocxPreprocessor:
@@ -21,7 +20,40 @@ class HTMLDocxPreprocessor:
self.top_level_headers = None self.top_level_headers = None
self.content = list() self.content = list()
def _process_toc_links(self):
def _check_parent_link_exist_in_toc(tag_with_link):
toc_links = []
for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}):
link_name = a_tag.attrs["name"]
toc_item = self.body_tag.find("a", {"href": "#" + link_name})
if toc_item:
toc_links.append(toc_item)
return len(toc_links) > 0
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all(
"a", {"name": re.compile(r"^_Toc\d+")})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
for h_tag in headers:
if re.search(r"^h\d$", h_tag.name):
h_tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
elif h_tag.name == "p":
exist_in_toc = _check_parent_link_exist_in_toc(h_tag)
if h_tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup(
features="lxml").new_tag("h" + outline_level)
text = h_tag.text
h_tag.replaceWith(new_tag)
new_tag.string = text
else:
# rethink document structure when you have toc_links, other cases?
self.logger_object.log(f"Something went wrong in processing toc_links."
f" Check the structure of the file. "
f"Tag name: {h_tag.name}")
def _clean_tag(self, tag: str, attr_name: str, attr_value: re): def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
# todo regex
""" """
Function to clean tags by its name and attribute value. Function to clean tags by its name and attribute value.
Parameters Parameters
@@ -44,15 +76,16 @@ class HTMLDocxPreprocessor:
tag.unwrap() tag.unwrap()
def _clean_underline_links(self): def _clean_underline_links(self):
# todo regex
"""Function cleans meaningless <u> tags before links.""" """Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u") underlines = self.body_tag.find_all("u")
for u in underlines: for u in underlines:
if u.find_all('a'): if u.find_all("a"):
u.unwrap() u.unwrap()
links = self.body_tag.find_all('a') links = self.body_tag.find_all("a")
for link in links: for link in links:
u = link.find_all('u') u = link.find_all("u")
if u and len(u) == 1: if u and len(u) == 1:
u[0].unwrap() u[0].unwrap()
@@ -80,16 +113,12 @@ class HTMLDocxPreprocessor:
""" """
size = re.search(r"font-size: (\d{1,3})pt", style) size = re.search(r"font-size: (\d{1,3})pt", style)
if size is None: if size is None:
return style return style
size = size.group(1) size = size.group(1)
new_size = cls.convert_pt_to_px(size) new_size = cls.convert_pt_to_px(size)
if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
return "" return ""
return re.sub(size + "pt", str(new_size) + "px", style) return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self): def _font_to_span(self):
@@ -99,27 +128,18 @@ class HTMLDocxPreprocessor:
""" """
fonts = self.body_tag.find_all("font") fonts = self.body_tag.find_all("font")
for font in fonts: for font in fonts:
face = font.get("face") face, style, color =\
style = font.get("style") font.get("face"), font.get("style"), font.get("color")
color = font.get("color")
font.attrs = {} font.attrs, font.name = {}, "span"
font.name = "span"
if style: if style:
style = self.convert_font_pt_to_px(style) style = self.convert_font_pt_to_px(style)
if style != "": if style != "":
if color and color in LiveCartaConfig.COLORS_MAP: if color and color in LiveCartaConfig.COLORS_MAP:
style += f'; color: {color};' style += f"; color: {color};"
font.attrs["style"] = style font.attrs["style"] = style
elif color and color in LiveCartaConfig.COLORS_MAP: elif color and color in LiveCartaConfig.COLORS_MAP:
font.attrs["style"] = f'color: {color};' font.attrs["style"] = f"color: {color};"
if face is not None:
face = re.sub(r",[\w,\- ]*$", "", face)
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(face):
font.attrs["face"] = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE[face]
else:
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
if len(font.attrs) == 0: if len(font.attrs) == 0:
font.unwrap() font.unwrap()
@@ -127,24 +147,18 @@ class HTMLDocxPreprocessor:
# on this step there should be no more <font> tags # on this step there should be no more <font> tags
assert len(self.body_tag.find_all("font")) == 0 assert len(self.body_tag.find_all("font")) == 0
def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated
toc_tag = self.html_soup.new_tag('TOC')
if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:]
def clean_trash(self): def clean_trash(self):
"""Function to remove all styles and tags we don't need.""" # todo make it regex dict
self._clean_tag('span', 'style', re.compile( """Function to remove all styles and tags we don"t need."""
r'^background: #[\da-fA-F]{6}$')) self._clean_tag("span", "style", re.compile(
r"^background: #[\da-fA-F]{6}$"))
# todo: check for another languages # todo: check for another languages
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) self._clean_tag("span", "lang", re.compile(r"^ru-RU$"))
self._clean_tag('span', 'style', re.compile( self._clean_tag("span", "style", re.compile(
'^letter-spacing: -?[\d.]+pt$')) "^letter-spacing: -?[\d.]+pt$"))
self._clean_tag('font', 'face', re.compile( self._clean_tag("font", "face", re.compile(
r'^Times New Roman[\w, ]+$')) r"^Times New Roman[\w, ]+$"))
self._clean_tag("a", "name", "_GoBack") self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links() self._clean_underline_links()
@@ -153,63 +167,68 @@ class HTMLDocxPreprocessor:
# replace toc with empty <TOC> tag # replace toc with empty <TOC> tag
tables = self.body_tag.find_all( tables = self.body_tag.find_all(
"div", id=re.compile(r'^Table of Contents\d+')) "div", id=re.compile(r"^Table of Contents\d+"))
for table in tables: for table in tables:
table.wrap(self.html_soup.new_tag("TOC")) table.wrap(self.html_soup.new_tag("TOC"))
table.decompose() table.decompose()
def _preprocessing_headings(self):
# todo regex
"""Function to convert all lower level headings to p tags"""
pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$"
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = "p"
def _process_paragraph(self): def _process_paragraph(self):
"""Function to process <p> tags (text-align and text-indent value).""" """Function to process <p> tags (text-align and text-indent value)."""
paragraphs = self.body_tag.find_all('p') paragraphs = self.body_tag.find_all("p")
for p in paragraphs: for p in paragraphs:
# libre converts some \n into <p> with 2 </br> # libre converts some \n into <p> with 2 </br>
# there we remove 1 unnecessary <br> # there we remove 1 unnecessary <br>
brs = p.find_all('br') brs = p.find_all("br")
text = p.text text = p.text
if brs and text == '\n\n' and len(brs) == 2: if brs and text == "\n\n" and len(brs) == 2:
brs[0].decompose() brs[0].decompose()
indent_should_be_added = False indent_should_be_added = False
if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')): if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")):
indent_should_be_added = True indent_should_be_added = True
align = p.get('align') align = p.get("align")
style = p.get('style') style = p.get("style")
if style: if style:
indent = re.search(r'text-indent: ([\d.]{1,4})in', style) indent = re.search(r"text-indent: ([\d.]{1,4})in", style)
margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style) margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style)
margin_right = re.search( margin_right = re.search(
r'margin-right: ([\d.]{1,4})in', style) r"margin-right: ([\d.]{1,4})in", style)
margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style) margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style)
margin_bottom = re.search( margin_bottom = re.search(
r'margin-bottom: ([\d.]{1,4})in', style) r"margin-bottom: ([\d.]{1,4})in", style)
else: else:
indent = None indent = margin_left = margin_right = \
margin_left = None margin_top = margin_bottom = None
margin_right = None
margin_top = None
margin_bottom = None
if margin_left and margin_right and margin_top and margin_bottom and \ if margin_left and margin_right and margin_top and margin_bottom and \
margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \
margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11': margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11":
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
p.attrs = {} p.attrs = {}
style = '' style = ""
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};' style += f"text-align: {align};"
if indent is not None or indent_should_be_added: if indent is not None or indent_should_be_added:
# indent = indent.group(1) # indent = indent.group(1)
style += f'text-indent: {LiveCartaConfig.INDENT};' style += f"text-indent: {LiveCartaConfig.INDENT};"
if style: if style:
p.attrs['style'] = style p.attrs["style"] = style
def _process_two_columns(self): def _process_two_columns(self):
"""Function to process paragraphs which has two columns layout.""" """Function to process paragraphs which has two columns layout."""
@@ -220,41 +239,6 @@ class HTMLDocxPreprocessor:
child["class"] = "columns2" child["class"] = "columns2"
div.unwrap() div.unwrap()
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
sizes = []
for td in tds:
style = td.get('style')
if style:
match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
if match:
size = match.group(1)
units = match.group(2)
if units == "pt":
size = self.convert_pt_to_px(size)
sizes.append(float(size))
width = td.get('width')
td.attrs = {}
if width:
td.attrs['width'] = width
if sizes:
border_size = sum(sizes) / len(sizes)
table.attrs['border'] = f'{border_size:.2}'
self.tables_amount = len(tables)
def _process_quotes(self): def _process_quotes(self):
""" """
Function to process block quotes. Function to process block quotes.
@@ -277,9 +261,9 @@ class HTMLDocxPreprocessor:
for table in tables: for table in tables:
trs = table.find_all("tr") trs = table.find_all("tr")
tds = table.find_all("td") tds = table.find_all("td")
if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600': if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
td = tds[0] td = tds[0]
is_zero_border = 'border: none;' in td.get('style') is_zero_border = "border: none;" in td.get("style")
paragraphs = td.find_all("p") paragraphs = td.find_all("p")
has_i_tag_or_br = [(p.i, p.br) for p in paragraphs] has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
has_i_tag_or_br = [x[0] is not None or x[1] is not None has_i_tag_or_br = [x[0] is not None or x[1] is not None
@@ -287,231 +271,79 @@ class HTMLDocxPreprocessor:
if all(has_i_tag_or_br) and is_zero_border: if all(has_i_tag_or_br) and is_zero_border:
new_div = BeautifulSoup( new_div = BeautifulSoup(
features='lxml').new_tag('blockquote') features="lxml").new_tag("blockquote")
for p in paragraphs: for p in paragraphs:
new_div.append(p) new_div.append(p)
table.replaceWith(new_div) table.replaceWith(new_div)
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
sizes = []
for td in tds:
style = td.get("style")
if style:
match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
if match:
size = match.group(1)
units = match.group(2)
if units == "pt":
size = self.convert_pt_to_px(size)
sizes.append(float(size))
width = td.get("width")
td.attrs = {}
if width:
td.attrs["width"] = width
if sizes:
border_size = sum(sizes) / len(sizes)
table.attrs["border"] = f"{border_size:.2}"
self.tables_amount = len(tables)
def _process_hrefs(self): def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all( a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^.*http.+')}) "a", {"href": re.compile("^.*http.+")})
# remove char=end of file for some editors # remove char=end of file for some editors
for tag in a_tags_with_href: for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace("\u200c", "")
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
a_tags_with_href = self.body_tag.find_all( a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^(?!#sdfootnote)')}) "a", {"href": re.compile("^(?!#sdfootnote)")})
for tag in a_tags_with_href: for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace("\u200c", "")
tag.string = tag.text.replace('\u200b', '') # zero-width-space tag.string = tag.text.replace("\u200b", "") # zero-width-space
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
@staticmethod
def _clean_footnote_content(content):
content = content.strip()
return content.strip()
def _process_footnotes(self):
"""Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all(
'div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors)
assert footnote_amt == len(footnote_content), \
'Something went wrong with footnotes after libre conversion'
footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
true_a_tag = cont_tag.find_all(
'a', class_=re.compile(r'^sdfootnote.+$'))[0]
if true_a_tag.attrs.get('href') is None:
cont_tag.a.decompose()
continue
assert anc_tag['name'] == true_a_tag['href'][1:], \
'Something went wrong with footnotes after libre conversion'
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
new_tag['id'] = f'footnote-{i + 1}'
new_tag.string = '*'
anc_tag.replace_with(new_tag)
# extra digits in footnotes from documents downloaded from livecarta
a_text = true_a_tag.text
if len(cont_tag.find_all('p')):
sup = cont_tag.find_all('p')[0].find('sup')
if sup and sup.text == a_text:
sup.decompose()
for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
tag_a.decompose()
# remove font-size
for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
style = span.get('style')
style = re.sub(r"font-size: \d+px", "", style)
if style == '':
del span.attrs['style']
else:
span.attrs['style'] = style
unicode_string = ''
for child in cont_tag.children:
if type(child) is NavigableString:
continue
if child.name == 'blockquote':
unicode_string += str(child)
else:
unicode_string += child.decode_contents()
content = self._clean_footnote_content(unicode_string)
cont_tag.decompose()
footnotes.append(content)
self.footnotes = footnotes
def _process_images(self, access, html_path, book_id):
"""
Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
For now images are moved to one folder.
"""
img_tags = self.body_tag.find_all('img')
if len(img_tags):
if access is None:
folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'json/img_{book_id}/'))
new_path.mkdir(exist_ok=True)
for img in img_tags:
img_name = img.attrs.get('src')
# quick fix for bad links
if (len(img_name) >= 3) and img_name[:3] == '../':
img_name = img_name[3:]
img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
if access is not None:
link = access.send_image(img_path, doc_id=book_id)
img.attrs['src'] = link
self.logger_object.log(
f'{img_name} successfully uploaded.')
else:
img_size = os.path.getsize(img_path)
self.logger_object.log(
f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
new_img_path = new_path / img_name
copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path)
self.images = img_tags
def _process_footer(self): def _process_footer(self):
# todo regex
""" """
Function to process <div title="footer"> tags. Function to process <div title="footer"> tags.
All the tags will be deleted from file. All the tags will be deleted from file.
""" """
divs = self.body_tag.find_all('div', {'title': 'footer'}) divs = self.body_tag.find_all("div", {"title": "footer"})
for div in divs: for div in divs:
div.decompose() div.decompose()
def _process_div(self): def _process_div(self):
# todo regex
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.""" """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div") divs = self.body_tag.find_all("div")
for div in divs: for div in divs:
div.unwrap() div.unwrap()
def _check_parent_link_exist_in_toc(self, tag_with_link):
toc_links = []
for a_tag in tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')}):
link_name = a_tag.attrs['name']
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
if toc_item:
toc_links.append(toc_item)
return len(toc_links) > 0
def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all(
"a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
for tag in headers:
if re.search(r"^h\d$", tag.name):
tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
elif tag.name == "p":
exist_in_toc = self._check_parent_link_exist_in_toc(tag)
if tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup(
features="lxml").new_tag("h" + outline_level)
text = tag.text
tag.replaceWith(new_tag)
new_tag.string = text
else:
# rethink document structure when you have toc_links, other cases?
self.logger_object.log(f'Something went wrong in processing toc_links.'
f' Check the structure of the file. '
f'Tag name: {tag.name}')
@staticmethod
def clean_title_from_numbering(title: str):
"""Function to remove digits from headers."""
title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
@staticmethod
def clean_tag_from_tabs(tag: NavigableString):
cleaned = re.sub(r'(\s+)+', ' ', tag)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this)
# print('input: ', repr(tag))
# print('test: ', repr(cleaned))
def clean_tag_from_numbering(self, tag):
cleaned = self.clean_title_from_numbering(tag)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this)
# print('input: ', repr(tag))
# print('test: ', repr(cleaned))
def apply_func_to_last_child(self, tag, func=None):
"""
works only with constructions like (((child to work with)))
where child is object of NavigableString
"""
if type(tag) is NavigableString:
func(tag)
else:
children = list(tag.children)
if children:
self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self):
"""Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'
def _get_top_level_headers(self): def _get_top_level_headers(self):
""" """
Function for gathering info about top-level chapters. Function for gathering info about top-level chapters.
@@ -539,27 +371,26 @@ class HTMLDocxPreprocessor:
tag.parent.unwrap() tag.parent.unwrap()
title = tag.text title = tag.text
title = re.sub(r'\s+', ' ', title).strip() title = re.sub(r"\s+", " ", title).strip()
number = re.match(r'^(?:\.?\d+\.? ?)+', title) number = re.match(r"^(?:\.?\d+\.? ?)+", title)
is_numbered = number is not None is_numbered = number is not None
cleaned_title = self.clean_title_from_numbering(tag.text) cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
is_introduction = cleaned_title.lower() == 'introduction' is_introduction = cleaned_title.lower() == "introduction"
headers_info.append({ headers_info.append({
'title': cleaned_title, "title": cleaned_title,
'is_numbered': is_numbered, "is_numbered": is_numbered,
'is_introduction': is_introduction}) "is_introduction": is_introduction})
return headers_info return headers_info
def _mark_introduction_headers(self): def _mark_introduction_headers(self):
""" """
Function to find out: Function to find out:
what header shouldn't be numbered and can be treated as introduction chapter what header shouldn"t be numbered and can be treated as introduction chapter
Assume header(s) to be introduction if: Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header 1. one header not numbered, before 1 numbered header
2. it is first header from the top level list, and it equals to 'introduction' 2. it is first header from the top level list, and it equals to "introduction"
Returns Returns
------- -------
@@ -567,9 +398,9 @@ class HTMLDocxPreprocessor:
mark each top-level header with flag should_be_numbered = true/false mark each top-level header with flag should_be_numbered = true/false
""" """
is_numbered_header = [header['is_numbered'] is_numbered_header = [header["is_numbered"]
for header in self.top_level_headers] for header in self.top_level_headers]
is_title = [header['is_introduction'] is_title = [header["is_introduction"]
for header in self.top_level_headers] for header in self.top_level_headers]
first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
@@ -577,14 +408,34 @@ class HTMLDocxPreprocessor:
first_header_is_introduction = is_title and is_title[0] first_header_is_introduction = is_title and is_title[0]
if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction: if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
self.top_level_headers[0]['should_be_numbered'] = False self.top_level_headers[0]["should_be_numbered"] = False
for i in range(1, len(self.top_level_headers)): for i in range(1, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]["should_be_numbered"] = True
else: else:
for i in range(0, len(self.top_level_headers)): for i in range(0, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]["should_be_numbered"] = True
@staticmethod
def clean_title_from_tabs(tag: NavigableString):
cleaned = re.sub(r"[\s\xa0]", " ", tag)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this)
def apply_func_to_last_child(self, tag, func=None):
"""
works only with constructions like (((child to work with)))
where child is object of NavigableString
"""
if type(tag) is NavigableString:
func(tag)
else:
children = list(tag.children)
if children:
self.apply_func_to_last_child(children[0], func)
def _process_headings(self): def _process_headings(self):
# todo regex
""" """
Function to process tags <h>. Function to process tags <h>.
Steps Steps
@@ -621,46 +472,36 @@ class HTMLDocxPreprocessor:
while tag.parent.name == "ol": while tag.parent.name == "ol":
tag.parent.unwrap() tag.parent.unwrap()
title = tag.text cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
title = self.clean_title_from_numbering(title) if cleaned_title == "":
if title == "":
tag.unwrap() tag.unwrap()
else: else:
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
content = list(tag.children) content = list(tag.children)
# do not take into account rubbish empty tags like <a>, but don't remove them # do not take into account rubbish empty tags like <a>, but don"t remove them
content = [item for item in content if content = [item for item in content if
(type(item) is not NavigableString and item.text != '') (type(item) is not NavigableString and item.text != "")
or (type(item) is NavigableString)] or (type(item) is NavigableString)]
content[0] = "" if content[0] == " " else content[0]
content = [item for item in content if item != ""]
for i, item in enumerate(content): for i, item in enumerate(content):
if type(content[i]) is NavigableString: if type(content[i]) is NavigableString:
cleaned = re.sub(r'(\s+)+', ' ', content[i]) cleaned = re.sub(r"(\s+)+", " ", content[i])
this = BeautifulSoup.new_string(BeautifulSoup( this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString) features="lxml"), cleaned, NavigableString)
content[i].replace_with(this) content[i].replace_with(this)
content[i] = this content[i] = this
else: else:
self.apply_func_to_last_child( self.apply_func_to_last_child(
content[i], self.clean_tag_from_tabs) content[i], self.clean_title_from_tabs)
content[0] = '' if content[0] == ' ' else content[0]
content = [item for item in content if item != '']
if type(content[0]) is NavigableString:
cleaned = self.clean_title_from_numbering(content[0])
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[0].replace_with(this)
content[0] = this
else:
self.apply_func_to_last_child(
content[0], self.clean_tag_from_numbering)
def _process_lists(self): def _process_lists(self):
# todo regex
""" """
Function Function
- process tags <li>. - process tags <li>.
@@ -672,74 +513,76 @@ class HTMLDocxPreprocessor:
uwrap <p> tag with li uwrap <p> tag with li
""" """
li_tags = self.body_tag.find_all("li") li_tags = self.body_tag.find_all("li")
for li_tag in li_tags: for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs) li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap() li_tag.p.unwrap()
def process_html(self, access=None, html_path='', book_id='local'): def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated
toc_tag = self.html_soup.new_tag("TOC")
self.content: List[Tag] = self.body_tag.find_all(recursive=False)
if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:]
def process_html(self, access=None, html_path="", book_id=0):
"""Process html code to satisfy LiveCarta formatting.""" """Process html code to satisfy LiveCarta formatting."""
self.logger_object.log('Beginning of processing .html file.') self.logger_object.log("Beginning of processing .html file.")
try: try:
self.logger_object.log(f'Processing TOC and headers.') self.logger_object.log(f"Processing TOC and headers.")
self._process_toc_links() self._process_toc_links()
self.clean_trash() self.clean_trash()
# process main elements of the .html doc # process main elements of the .html doc
self.logger_object.log(f'Processing main elements of html.') self.logger_object.log(f"Processing main elements of html.")
self._preprocessing_headings() self._preprocessing_headings()
self._process_paragraph() self._process_paragraph()
self._process_two_columns() self._process_two_columns()
self.logger_object.log('Block quotes processing.') self.logger_object.log("Block quotes processing.")
self._process_quotes() self._process_quotes()
self.logger_object.log('Tables processing.') self.logger_object.log("Tables processing.")
self._process_tables() self._process_tables()
self.logger_object.log( self.logger_object.log(
f'{self.tables_amount} tables have been processed.') f"{self.tables_amount} tables have been processed.")
self.logger_object.log('Hrefs processing.') self.logger_object.log("Hrefs processing.")
self._process_hrefs() self._process_hrefs()
self.logger_object.log('Footnotes processing.') self.logger_object.log("Footnotes processing.")
self._process_footnotes() self.footnotes = process_footnotes(self.body_tag)
self.logger_object.log( self.logger_object.log(
f'{len(self.footnotes)} footnotes have been processed.') f"{len(self.footnotes)} footnotes have been processed.")
self.logger_object.log('Image processing.') self.logger_object.log("Image processing.")
self._process_images( self.images = process_images(access=access, html_path=html_path,
access=access, html_path=html_path, book_id=book_id) book_id=book_id, body_tag=self.body_tag)
self.logger_object.log( self.logger_object.log(
f'{len(self.images)} images have been processed.') f"{len(self.images)} images have been processed.")
self._process_footer() self._process_footer()
self._process_div() self._process_div()
self.content = self.body_tag.find_all(recursive=False)
self.top_level_headers = self._get_top_level_headers() self.top_level_headers = self._get_top_level_headers()
self._mark_introduction_headers() self._mark_introduction_headers()
self._process_headings() self._process_headings()
self.content: List[Tag] = self.body_tag.find_all(recursive=False)
self._process_lists() self._process_lists()
# delete text before table of content if exists # delete text before table of content if exists
self.delete_content_before_toc() self.delete_content_before_toc()
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
'Error has occurred while processing html.', logging.ERROR) "Error has occurred while processing html.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
if self.status_wrapper: if self.status_wrapper:
self.status_wrapper.set_error() self.status_wrapper.set_error()
raise exc raise exc
self.logger_object.log('End of processing .html file.') self.logger_object.log("End of processing .html file.")
return self.content, self.footnotes, self.top_level_headers return self.content, self.footnotes, self.top_level_headers

View File

@@ -0,0 +1,34 @@
import os
import pathlib
from shutil import copyfile
def process_images(access, html_path, book_id, body_tag):
"""
Function to process <img> tag.
Img should be sent Amazon S3 and then return new tag with valid link.
For now images are moved to one folder.
"""
img_tags = body_tag.find_all("img")
for img in img_tags:
img_name = img.attrs.get("src")
# quick fix for bad links
if (len(img_name) >= 3) and img_name[:3] == "../":
img_name = img_name[3:]
img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}")
if access is not None:
link = access.send_image(img_path, doc_id=book_id)
img.attrs["src"] = link
else:
if img_tags.index(img) == 0:
folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f"../books/json/img_{book_id}/"))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / img_name
copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path)
return img_tags

View File

@@ -29,7 +29,7 @@ class LibreHTML2JSONConverter:
cleaned text cleaned text
""" """
new_text = re.sub(r'([\n\t])', ' ', html_text) new_text = re.sub(r"([\n\t])", " ", html_text)
return new_text return new_text
# TODO: rethink the function structure without indexes. # TODO: rethink the function structure without indexes.
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter:
""" """
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
title = str(self.content[ind]) title = str(self.content[ind])
title = title.replace(f'<{self.content[ind].name}>', '') title = title.replace(f"<{self.content[ind].name}>", "")
title = title.replace(f'</{self.content[ind].name}>', '') title = title.replace(f"</{self.content[ind].name}>", "")
title = re.sub(r'^\n', '', title) title = re.sub(r"^\n", "", title)
# extract outline from tag # extract outline from tag
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
result = { result = {
'title': f'{title}', "title": f"{title}",
'contents': [], "contents": [],
'sub_items': [] "sub_items": []
} }
ch_content = [] ch_content = []
ind += 1 ind += 1
@@ -71,9 +71,9 @@ class LibreHTML2JSONConverter:
header_dict, ind = self.header_to_livecarta_chapter_item( header_dict, ind = self.header_to_livecarta_chapter_item(
ind) ind)
if ch_content: if ch_content:
result['contents'].append("".join(ch_content)) result["contents"].append("".join(ch_content))
ch_content = [] ch_content = []
result['sub_items'].append(header_dict) result["sub_items"].append(header_dict)
# - current h_i <= h_initial, end of recursion # - current h_i <= h_initial, end of recursion
else: else:
# return result, ind # return result, ind
@@ -85,21 +85,21 @@ class LibreHTML2JSONConverter:
ind += 1 ind += 1
if ch_content: if ch_content:
result['contents'].append("".join(ch_content)) result["contents"].append("".join(ch_content))
return result, ind return result, ind
return '' return ""
@staticmethod @staticmethod
def _is_empty_p_tag(tag): def _is_empty_p_tag(tag):
if tag.name != 'p': if tag.name != "p":
return False return False
temp_tag = copy(tag) temp_tag = copy(tag)
brs = temp_tag.find_all('br') brs = temp_tag.find_all("br")
for br in brs: for br in brs:
br.decompose() br.decompose()
text = re.sub(r'\s+', '', temp_tag.text) text = re.sub(r"\s+", "", temp_tag.text)
if text: if text:
return False return False
@@ -107,10 +107,7 @@ class LibreHTML2JSONConverter:
def convert_to_dict(self): def convert_to_dict(self):
"""Function which convert list of html nodes to appropriate json structure.""" """Function which convert list of html nodes to appropriate json structure."""
json_strc = [] json_strc, ind, ch_num, ch_amt = [], 0, 0, 0
ind = 0
ch_num = 0
ch_amt = 0
try: try:
while ind < len(self.content): while ind < len(self.content):
@@ -120,7 +117,7 @@ class LibreHTML2JSONConverter:
res, ind = self.header_to_livecarta_chapter_item(ind) res, ind = self.header_to_livecarta_chapter_item(ind)
else: else:
chapter_title = f'Untitled chapter {ch_num}' chapter_title = f"Untitled chapter {ch_num}"
chapter = [] chapter = []
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]): if not self._is_empty_p_tag(self.content[ind]):
@@ -129,9 +126,9 @@ class LibreHTML2JSONConverter:
ind += 1 ind += 1
if chapter: if chapter:
res = { res = {
'title': chapter_title, "title": chapter_title,
'contents': ["".join(chapter)], "contents": ["".join(chapter)],
'sub_items': [] "sub_items": []
} }
ch_num += 1 ch_num += 1
@@ -139,10 +136,10 @@ class LibreHTML2JSONConverter:
json_strc.append(res) json_strc.append(res)
ch_amt += 1 ch_amt += 1
self.logger_object.log( self.logger_object.log(
f'Chapter {ch_amt} has been added to structure.') f"Chapter {ch_amt} has been added to structure.")
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
'Error has occurred while making json structure.', logging.ERROR) "Error has occurred while making json structure.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
if self.book_api_status: if self.book_api_status:
self.book_api_status.set_error() self.book_api_status.set_error()
@@ -151,10 +148,10 @@ class LibreHTML2JSONConverter:
# Add is_introduction field to json structure # Add is_introduction field to json structure
# after deleting content before toc, some chapters can be deleted # after deleting content before toc, some chapters can be deleted
if self.top_level_headers: if self.top_level_headers:
same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title'] same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
json_strc[0]['is_introduction'] = is_first_header_introduction json_strc[0]["is_introduction"] = is_first_header_introduction
self.content_dict = { self.content_dict = {
"content": json_strc, "content": json_strc,

View File

@@ -1,238 +0,0 @@
import re
import cssutils
from ebooklib import epub
from bs4 import BeautifulSoup
from itertools import takewhile
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
def get_text_color(x):
    """Return the hex color for *x*, or '' when it is plain black (the default text color)."""
    hex_color = str2hex(x)
    if hex_color in ['#000000', '#000', 'black']:
        return ''
    return hex_color
def get_bg_color(x):
    """Return the hex color for *x*, or '' when it is plain white (the default background)."""
    hex_color = str2hex(x)
    if hex_color in ['#ffffff', '#fff', 'white']:
        return ''
    return hex_color
def convert_tag_style_values(size_value: str) -> str:
    """
    Convert a CSS size value from em/%/pt to px.

    Percent and em values are mapped to the closest supported font size
    from LiveCartaConfig; pt values keep their number with a px suffix.
    Values in any other unit are returned untouched.

    Parameters
    ----------
    size_value: str

    Returns
    -------
    size_value: str
    """
    def closest_px(relative_size):
        # largest configured relative size that does not exceed the input
        candidates = list(
            takewhile(lambda s: relative_size >= s, LiveCartaConfig.sizes_pr))
        return LiveCartaConfig.sizes_px[
            LiveCartaConfig.sizes_pr.index(candidates[-1])]

    match = re.search(
        r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)',
        size_value)
    if not match:
        return size_value
    if match.group(1):
        # percent of the base size
        return closest_px(float(size_value.replace('%', '')) / 100.0)
    if match.group(3):
        return closest_px(float(size_value.replace('em', '')))
    if match.group(5):
        # pt values keep their magnitude, only the unit is swapped
        return size_value.replace('pt', 'px')
    return ''
def convert_indents_tag_values(size_value: str) -> str:
    """
    Convert values of ['text-indent', 'margin-left', 'margin'] to px.

    Three-part shorthand values use the middle component, everything
    else uses the last component.

    Parameters
    ----------
    size_value: str

    Returns
    -------
    size_value: str
    """
    parts = size_value.split(' ')
    # three-part shorthand -> middle value, otherwise -> last value
    chosen = parts[-2] if len(parts) == 3 else parts[-1]
    return convert_tag_style_values(chosen)
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit livecarta css style convention.
If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
'text-indent': [],
'font-variant': ['small-caps'],
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
'align': [],
'font': [],
'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys()
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
'font-size': [],
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
'font-style': ['italic'], # <i>
'text-decoration': ['underline', 'line-through'], # <u> , <s>
'text-decoration-line': ['underline', 'line-through'], # <u> , <s>
'vertical-align': ['super'], # <sup>
'color': [],
'background-color': [],
'background': [],
'width': [],
'border': [],
'border-top-width': [],
'border-right-width': [],
'border-left-width': [],
'border-bottom-width': [],
'border-top': [],
'border-bottom': [],
'list-style-type': [],
'list-style-image': [],
'margin-left': [],
'margin-top': [],
'margin': [],
}
"""
Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention.
"""
LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents_tag_values,
'font-variant': lambda x: x,
'text-align': lambda x: x,
'font': lambda x: '',
'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title()))
or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)),
'font-size': convert_tag_style_values,
'color': get_text_color,
'background-color': get_bg_color,
'background': get_bg_color,
'border': lambda x: x if x != '0' else '',
'border-top-width': lambda x: x if x != '0' else '',
'border-right-width': lambda x: x if x != '0' else '',
'border-left-width': lambda x: x if x != '0' else '',
'border-bottom-width': lambda x: x if x != '0' else '',
'border-top': lambda x: x if x != '0' else '',
'border-bottom': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc',
'list-style-image': lambda x: 'disc',
'margin-left': convert_indents_tag_values,
'margin-top': convert_tag_style_values,
'margin': convert_indents_tag_values
}
def update_inline_styles_to_livecarta_convention(split_style: list):
    """
    Rewrite every 'name:value' entry of an inline style to the livecarta
    convention, blanking out unsupported properties and values.

    Parameters
    ----------
    split_style: list
        inline style split into 'name:value' strings

    Returns
    -------
    split_style: list
        the same list, updated in place
    """
    for i, style in enumerate(split_style):
        style_name, style_value = style.split(":")
        if style_name not in LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove from css file.
            # Bug fix: this used to `return split_style` here, which aborted
            # the loop and left every remaining entry unconverted.
            split_style[i] = ''
            continue
        # keep only the last whitespace-separated token, quotes stripped
        cleaned_value = style_value.replace('\"', '').split()[-1]
        constraints_on_value = LIVECARTA_STYLE_ATTRS.get(
            style_name)
        value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[
            style_name]
        if constraints_on_value and value_not_in_possible_values_list:
            # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
            split_style[i] = ''
        else:
            if style_name in LIVECARTA_STYLE_ATTRS_MAPPING:
                # function that converts our data
                func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
                style_value = func(cleaned_value)
            split_style[i] = style_name + ":" + style_value
    return split_style
def build_inline_style_content(style: str) -> str:
"""Build inline style with livecarta convention"""
# replace all spaces between '; & letter' to ';'
style = re.sub(r"; *", ";", style)
# when we split style by ';', last element of the list is '' - None
# remove it
split_style: list = list(filter(None, style.split(';')))
# replace all spaces between ': & letter' to ':'
split_style = [el.replace(
re.search(r'(:\s*)', el).group(1), ':') for el in split_style]
split_style = update_inline_styles_to_livecarta_convention(split_style)
style = "; ".join(split_style)
return style
def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule,
                                              style_type: cssutils.css.property.Property):
    """
    Update one property of a parsed CSS rule in place: blank unsupported
    properties/values and convert supported ones to the livecarta convention.
    """
    prop = style_type.name
    if prop not in LIVECARTA_STYLE_ATTRS:
        # property not supported, remove from css file
        css_rule.style[prop] = ''
        return
    cleaned_value = style_type.value.replace('\"', '')
    allowed_values = LIVECARTA_STYLE_ATTRS.get(prop)
    if allowed_values and cleaned_value not in allowed_values:
        # constrained property with an unsupported value, remove from css file
        css_rule.style[prop] = ''
    elif prop in LIVECARTA_STYLE_ATTRS_MAPPING:
        # function that converts our data
        css_rule.style[prop] = LIVECARTA_STYLE_ATTRS_MAPPING[prop](cleaned_value)
def build_css_file_content(css_content: str) -> str:
    """Build css content with livecarta convention"""
    # lower-case the sheet so property/value matching is case-insensitive
    sheet = cssutils.parseString(css_content.lower(), validate=False)
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        for prop in rule.style:
            update_css_styles_to_livecarta_convention(rule, prop)
    return sheet._getCssText().decode()
if __name__ == '__main__':
    # Ad-hoc manual check: clean one stylesheet and parse one page of a sample epub.
    file = '../../epub/9781627222174.epub'
    ebooklib_book = epub.read_epub(file)
    css_ = ebooklib_book.get_item_with_href('css/epub.css')
    css_ = css_.get_content().decode()
    # run the stylesheet through the livecarta conversion
    css_cleaned = build_css_file_content(css_)
    html_ = ebooklib_book.get_item_with_href(
        'pr01s05.xhtml').get_body_content().decode()
    html_soup = BeautifulSoup(html_, features='lxml')

View File

@@ -0,0 +1,216 @@
import re
import cssutils
from bs4 import BeautifulSoup
from os.path import dirname, normpath, join
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
class CSSPreprocessor:
    """Convert CSS content (stylesheet files and inline styles) to the LiveCarta style convention."""

    def __init__(self):
        """
        Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
        Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
        to suit LiveCarta style convention.
        """
        self.LIVECARTA_STYLE_ATTRS_MAPPING = {
            "text-indent": self.convert_indents_tag_values,
            # identity mappings: value is already in the supported form
            "font-variant": lambda x: x,
            "text-align": lambda x: x,
            # "font" shorthand is dropped entirely
            "font": lambda x: "",
            "font-family": lambda x: x,
            "font-size": self.convert_tag_style_values,
            "color": self.get_text_color,
            "background-color": self.get_bg_color,
            "background": self.get_bg_color,
            # zero-width borders are removed
            "border": lambda x: x if x != "0" else "",
            "border-top-width": lambda x: x if x != "0" else "",
            "border-right-width": lambda x: x if x != "0" else "",
            "border-left-width": lambda x: x if x != "0" else "",
            "border-bottom-width": lambda x: x if x != "0" else "",
            "border-top": lambda x: x if x != "0" else "",
            "border-bottom": lambda x: x if x != "0" else "",
            # unknown list markers fall back to "disc"
            "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
            "list-style-image": lambda x: "disc",
            "margin-left": self.convert_indents_tag_values,
            "margin-top": self.convert_tag_style_values,
            "margin": self.convert_indents_tag_values,
            "width": self.convert_tag_style_values,
        }

    @staticmethod
    def get_text_color(x):
        """Return the hex color of *x*, or "" for plain black (the default text color)."""
        color = str2hex(x)
        return color if color not in ["#000000", "#000", "black"] else ""

    @staticmethod
    def get_bg_color(x):
        """Return the hex color of *x*, or "" for plain white (the default background)."""
        color = str2hex(x)
        return color if color not in ["#ffffff", "#fff", "white"] else ""

    @staticmethod
    def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str:
        """
        Function
        - converts values of tags from em/%/pt to px
        - find closest font-size px
        Parameters
        ----------
        size_value: str
        is_indent: bool
        Returns
        -------
        size_value: str
            converted value size
        """
        size_regexp = re.compile(
            r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)")
        has_style_attrs = re.search(size_regexp, size_value)
        if not has_style_attrs:
            # unrecognised unit (e.g. already px) — return unchanged
            return size_value
        if has_style_attrs.group(1):
            # NOTE(review): multipliers assume a 16px base font size
            # (100% -> 16px) and an 18px*0.32 indent step — confirm
            multiplier = 5.76 if is_indent else 0.16
            size_value = float(size_value.replace("%", "")) * multiplier
            return str(size_value) + 'px'
        elif has_style_attrs.group(3):
            multiplier = 18 if is_indent else 16
            size_value = float(size_value.replace("em", "")) * multiplier
            return str(size_value) + 'px'
        elif has_style_attrs.group(5):
            # 1pt == 4/3 px
            size_value = float(size_value.replace("pt", "")) * 4/3
            return str(size_value) + 'px'
        return ""

    def convert_indents_tag_values(self, size_value: str) -> str:
        """
        Function converts values of ["text-indent", "margin-left", "margin"]
        to px; three-part shorthand values use the middle component,
        everything else the last one.
        Parameters
        ----------
        size_value: str
        Returns
        -------
        size_value: str
        """
        parts = size_value.split(" ")
        chosen = parts[-2] if len(parts) == 3 else parts[-1]
        return self.convert_tag_style_values(chosen, True)

    @staticmethod
    def clean_value(style_value: str, style_name: str):
        """
        Strip double quotes from *style_value*; for font-family also
        backslash-escape regex metacharacters so the value can safely be
        used in a pattern later on.
        """
        cleaned_value = style_value.replace("\"", "")
        if style_name == 'font-family':
            for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]:
                cleaned_value = re.sub(
                    re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value)
        return cleaned_value

    @staticmethod
    def style_conditions(style_value: str, style_name: str) -> tuple[bool, bool]:
        """
        Return (property has a non-empty constraint list,
        value is NOT among the allowed values). A property is dropped
        when both are True.
        """
        # bool(...) so the return value actually matches the annotation
        # (the dict lookup yields a list or None)
        constraints_on_value = bool(LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get(
            style_name))
        value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[
            style_name]
        return constraints_on_value, value_not_in_possible_values_list

    def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
        """
        Rewrite every "name:value" entry of an inline style to the LiveCarta
        convention; unsupported properties and values are blanked out.
        Returns the same list, updated in place.
        """
        for i, style in enumerate(split_style):
            style_name, style_value = style.split(":")
            if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
                # property not in LIVECARTA_STYLE_ATTRS, remove from css file.
                # Bug fix: this used to `return split_style` here, which
                # aborted the loop and left every later entry unconverted.
                split_style[i] = ""
                continue
            cleaned_value = self.clean_value(style_value, style_name)
            if all(self.style_conditions(cleaned_value, style_name)):
                # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
                split_style[i] = ""
            else:
                if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
                    # function that converts our data
                    func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
                    style_value = func(cleaned_value)
                split_style[i] = style_name + ":" + style_value
        return split_style

    def build_inline_style_content(self, style: str) -> str:
        """Build inline style with LiveCarta convention"""
        # replace all spaces between "; & letter" to ";"
        style = re.sub(r"; *", ";", style)
        # when we split style by ";", last element of the list is "" - None (we remove it)
        split_style: list = list(filter(None, style.split(";")))
        # replace all spaces between ": & letter" to ":"
        split_style = [el.replace(
            re.search(r"(:\s*)", el).group(1), ":") for el in split_style]
        split_style = self.update_inline_styles_to_livecarta_convention(
            split_style)
        style = "; ".join(split_style)
        return style

    def process_inline_styles_in_html_soup(self, html_href2html_body_soup: dict):
        """Convert the inline style attribute of every styled tag in every soup, in place."""
        for html_href in html_href2html_body_soup:
            html_content: BeautifulSoup = html_href2html_body_soup[html_href]
            tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
                                                           attrs={"style": re.compile(".*")})
            for tag_initial_inline_style in tags_with_inline_style:
                inline_style = tag_initial_inline_style.attrs["style"]
                tag_initial_inline_style.attrs["style"] = \
                    self.build_inline_style_content(inline_style)

    @staticmethod
    def get_css_content(css_href, html_href, ebooklib_book):
        """
        Resolve *css_href* relative to *html_href* and return the stylesheet
        text from the epub manifest, following one @import indirection.
        """
        html_folder = dirname(html_href)
        path_to_css_from_root = normpath(
            join(html_folder, css_href)).replace("\\", "/")
        css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root)
        # Bug fix: assert before dereferencing css_obj.content — previously a
        # missing stylesheet raised AttributeError instead of this message.
        assert css_obj, f"Css style {css_href} was not in manifest."
        # if in css file we import another css
        if "@import" in str(css_obj.content):
            # NOTE(review): assumes imported sheets always live under "css/" — confirm
            path_to_css_from_root = "css/" + \
                re.search('"(.*)"', str(css_obj.content)).group(1)
            css_obj = ebooklib_book.get_item_with_href(
                path_to_css_from_root)
            assert css_obj, f"Css style {css_href} was not in manifest."
        css_content: str = css_obj.get_content().decode()
        return css_content

    def update_css_styles_to_livecarta_convention(self, css_rule: "cssutils.css.CSSStyleRule",
                                                  style_type: "cssutils.css.property.Property"):
        """
        Update one property of a parsed CSS rule in place: blank unsupported
        properties/values, convert the rest to the LiveCarta convention.
        (Annotations are strings so cssutils names are not evaluated eagerly.)
        """
        if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
            # property not in LIVECARTA_STYLE_ATTRS, remove from css file
            css_rule.style[style_type.name] = ""
            return
        cleaned_value = self.clean_value(style_type.value, style_type.name)
        if all(self.style_conditions(cleaned_value, style_type.name)):
            # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
            css_rule.style[style_type.name] = ""
        else:
            if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
                # function that converts our data
                func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name]
                css_rule.style[style_type.name] = func(cleaned_value)

    def build_css_file_content(self, css_content: str) -> str:
        """Build css content with LiveCarta convention"""
        sheet = cssutils.parseString(css_content, validate=False)
        for css_rule in sheet:
            if css_rule.type == css_rule.STYLE_RULE:
                for style_type in css_rule.style:
                    self.update_css_styles_to_livecarta_convention(
                        css_rule, style_type)
        css_text: str = sheet._getCssText().decode()
        return css_text

View File

@@ -1,39 +1,40 @@
import re import re
import json import json
import codecs import codecs
import os
from os.path import dirname, normpath, join
from itertools import chain
from collections import defaultdict
from typing import Dict, Union, List
import ebooklib import ebooklib
from ebooklib import epub from ebooklib import epub
from ebooklib.epub import Link, Section from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag from os import path
from pathlib import Path
from itertools import chain
from premailer import transform
from collections import defaultdict
from typing import Dict, Union, List
from bs4 import BeautifulSoup, NavigableString, Tag
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
class EpubConverter: class EpubConverter:
def __init__(self, file_path, access=None, logger=None): def __init__(self, book_path, access=None, logger=None, css_processor=None, html_processor=None):
self.file_path = file_path self.book_path = book_path
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file_path) self.ebooklib_book = epub.read_epub(book_path)
self.css_processor = css_processor
self.html_processor = html_processor
# main container for all epub .xhtml files # main container for all epub .xhtml files
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
# enumerate all subchapter id for each file # enumerate all subchapter id for each file
self.html_href2subchapter_ids = defaultdict(list) self.html_href2subchapters_ids = defaultdict(list)
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints) # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
@@ -57,55 +58,51 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log('Image processing.') self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name file_name = x.file_name
content = x.content content = x.content
self.img_href2img_bytes[file_name] = content self.img_href2img_bytes[file_name] = content
self.logger.log('HTML files reading.') self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str, self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content() BeautifulSoup] = self.build_href2soup_content()
# TODO Presets
self.logger.log('Process CSS inline styles.') self.logger.log("CSS inline style processing.")
self.process_inline_styles_in_html_soup() self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup)
self.logger.log('CSS files processing.') self.logger.log("CSS files processing.")
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.') self.logger.log("CSS styles fusion(inline+file).")
self.add_css_styles_to_html_soup() self.add_css_styles_to_html_soup()
self.logger.log('Footnotes processing.') self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup: for href in self.html_href2html_body_soup:
content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], self.footnotes_contents, self.noterefs, self.footnotes =\
self.html_href2html_body_soup) preprocess_footnotes(
self.footnotes_contents.extend(content) self.html_href2html_body_soup[href], self.html_href2html_body_soup)
self.noterefs.extend(noterefs) self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.")
self.footnotes.extend(footnotes_tags)
for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): self.logger.log("TOC processing.")
noteref.attrs['data-id'] = i + 1
noteref.attrs['id'] = f'footnote-{i + 1}'
footnote.attrs['href'] = f'#footnote-{i + 1}'
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
self.logger.log('TOC processing.')
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
if self.is_toc_empty(): if self.is_toc_empty():
self.build_adjacency_list_from_spine() self.build_adjacency_list_from_spine()
not_added = [ not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.') self.logger.log(f"Html documents not added to TOC: {not_added}.")
self.logger.log(f"Add documents not added to TOC.")
self.add_not_added_files_to_adjacency_list(not_added) self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.') self.logger.log(f"Label subchapters with converter tag.")
self.label_chapters_ids_with_tmp_id() self.label_subchapters_with_lc_tag()
# used only after parsed toc, ids from toc needed self.logger.log(f"Process html internal links.")
self.process_html_soup_structure_to_line()
self.process_internal_links() self.process_internal_links()
self.logger.log(f'Building chapters content.') self.logger.log(
self.define_chapters_content() f"Check if converter-chapter-marks are on the same level.")
self.chapter_marks_are_same_level()
self.logger.log(f"Define chapters content.")
self.define_chapters_with_content()
self.logger.log(f"Converting html_nodes to LiveCarta chapter items.")
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements # using EpubElements
@@ -115,38 +112,10 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_body_text = item.get_body_content() html_body_text = item.get_body_content()
# html.parser closes tags if needed # html.parser closes tags if needed
soup = BeautifulSoup(html_body_text, features='html.parser') soup = BeautifulSoup(html_body_text, features="html.parser")
nodes[item.file_name] = soup nodes[item.file_name] = soup
return nodes return nodes
def get_css_content(self, css_href, html_href):
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
# if in css file we import another css
if "@import" in str(css_obj.content):
path_to_css_from_root = "css/" + \
re.search('"(.*)"', str(css_obj.content)).group(1)
css_obj = self.ebooklib_book.get_item_with_href(
path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.'
css_content: str = css_obj.get_content().decode()
return css_content
def process_inline_styles_in_html_soup(self):
"""This function is designed to convert inline html styles"""
for html_href in self.html_href2html_body_soup:
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')})
for tag_initial_inline_style in tags_with_inline_style:
inline_style = tag_initial_inline_style.attrs['style']
tag_initial_inline_style.attrs['style'] = \
build_inline_style_content(inline_style)
def build_html_and_css_relations(self) -> tuple[dict, dict]: def build_html_and_css_relations(self) -> tuple[dict, dict]:
""" """
Function is designed to get 2 dictionaries: Function is designed to get 2 dictionaries:
@@ -167,39 +136,81 @@ class EpubConverter:
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content html_content = item.content
html_href = item.file_name html_href = item.file_name
soup_html_content = BeautifulSoup(html_content, features='lxml') soup_html_content = BeautifulSoup(html_content, features="lxml")
# check if file links to css file # check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language) # alternate page of original page (e.g. another language)
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]):
continue continue
css_href = tag.attrs.get('href') css_href = tag.attrs.get("href")
html_href2css_href[html_href].append(css_href) html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content: if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict # css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_file_content( css_href2css_content[css_href] = self.css_processor.build_css_file_content(
self.get_css_content(css_href, html_href)) self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book))
for i, tag in enumerate(soup_html_content.find_all('style')): for i, tag in enumerate(soup_html_content.find_all("style")):
css_content = tag.string css_content = tag.string
html_href2css_href[html_href].append(f'href{i}') html_href2css_href[html_href].append(f"href{i}")
css_href2css_content[f'href{i}'] = build_css_file_content( css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content(
css_content) css_content)
return html_href2css_href, css_href2css_content return html_href2css_href, css_href2css_content
@staticmethod
def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
"""
Function adds styles from .css to inline style.
Parameters
----------
html_soup: BeautifulSoup
html page with inline style
css_text: str
css content from css file
Returns
-------
inline_soup: BeautifulSoup
soup with styles from css
"""
# remove this specification because it causes problems
css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '')
# here we add css styles to inline style
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
remove_classes=False,
external_styles=False,
allow_network=False,
disable_validation=True,
)
# soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features="lxml")
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={"style": re.compile(".*")})
# go through the tags with inline style + style parsed from css file
for tag_inline_style in tags_with_inline_style:
style_converter = TagInlineStyleProcessor(tag_inline_style)
style_converter.convert_initial_tag()
return inline_soup
def add_css_styles_to_html_soup(self): def add_css_styles_to_html_soup(self):
""" """
This function is designed to update html_href2html_body_soup This function is designed to update html_href2html_body_soup
- add to html_inline_style css_style_content - add to html_inline_style css_style_content
Returns
-------
None
updated soups with styles from css
""" """
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href): if self.html_href2css_href.get(html_href):
css = '' css = ""
for css_href in self.html_href2css_href[html_href]: for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href] css += self.css_href2css_content[css_href]
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css) html_content = self.modify_html_soup_with_css_styles(
html_content, css)
self.html_href2html_body_soup[html_href] = html_content self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
@@ -226,7 +237,7 @@ class EpubConverter:
nav_point = NavPoint(element) nav_point = NavPoint(element)
if nav_point.id: if nav_point.id:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append( self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id) nav_point.id)
self.adjacency_list[nav_point] = None self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
@@ -238,12 +249,12 @@ class EpubConverter:
nav_point = NavPoint(first) nav_point = NavPoint(first)
if nav_point.id: if nav_point.id:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.html_href2subchapter_ids[nav_point.href].append( self.html_href2subchapters_ids[nav_point.href].append(
nav_point.id) nav_point.id)
sub_nodes = [] sub_nodes = []
for elem in second: for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1:
self.offset_sub_nodes.append( self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl)) self.build_adjacency_list_from_toc(elem, lvl))
else: else:
@@ -267,7 +278,7 @@ class EpubConverter:
self.adjacency_list[-1] = nodes self.adjacency_list[-1] = nodes
else: else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}"
def is_toc_empty(self) -> bool: def is_toc_empty(self) -> bool:
"""Function checks is toc empty""" """Function checks is toc empty"""
@@ -276,14 +287,14 @@ class EpubConverter:
return True return True
return False return False
def build_manifest_id2html_href(self) -> dict:
links = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
links[item.id] = item.file_name
return links
def build_adjacency_list_from_spine(self): def build_adjacency_list_from_spine(self):
manifest_id2html_href = self.build_manifest_id2html_href() def build_manifest_id2html_href() -> dict:
links = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
links[item.id] = item.file_name
return links
manifest_id2html_href = build_manifest_id2html_href()
self.adjacency_list = { self.adjacency_list = {
-1: [] -1: []
} }
@@ -293,42 +304,49 @@ class EpubConverter:
self.adjacency_list[-1].append(nav_point) self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added): def add_not_added_files_to_adjacency_list(self, not_added: list):
"""Function add files that not added to adjacency list""" """Function add files that not added to adjacency list"""
for i, file in enumerate(not_added): for i, file in enumerate(not_added):
nav_point = NavPoint( nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file)) Section(f"To check #{i}, filename: {file}", file))
self.adjacency_list[-1].append(nav_point) self.adjacency_list[-1].append(nav_point)
self.hrefs_added_to_toc.add(file) self.hrefs_added_to_toc.add(file)
def label_chapters_ids_with_tmp_id(self): def label_subchapters_with_lc_tag(self):
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
ids = self.html_href2subchapter_ids[html_href] ids, soup = self.html_href2subchapters_ids[html_href], \
self.html_href2html_body_soup[html_href]
for i in ids: for i in ids:
soup = self.html_href2html_body_soup[html_href]
tag = soup.find(id=i) tag = soup.find(id=i)
new_h = soup.new_tag('tmp') tmp_tag = soup.new_tag("lc_tmp")
new_h.attrs['class'] = 'converter-chapter-mark' tmp_tag.attrs["class"] = "converter-chapter-mark"
new_h.attrs['id'] = i tmp_tag.attrs["id"] = i
tag.insert_before(new_h) tag.insert_before(tmp_tag)
def process_html_soup_structure_to_line(self): def chapter_marks_are_same_level(self):
# go to line structure """
Function checks that marks for pointing a start of a chapter are placed on one level in html tree.
Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed.
This tag must have a chapter_tag as a parent.
Otherwise, it is wrapped with some tags. Like:
<p> <span id="123", class="converter-chapter-mark"> </span> </p>
"""
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href] chapter_tag = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) # check marks for chapter starting are on the same level - 1st
marks = chapter_tag.find_all(
attrs={"class": "converter-chapter-mark"})
# fix marks to be on 1 level
for mark in marks:
while mark.parent != chapter_tag:
# todo warning! could reflect on formatting/internal links in some cases
mark.parent.unwrap()
@staticmethod @staticmethod
def create_unique_id(href, id_): def create_unique_id(href, id_):
return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_)
@staticmethod
def create_new_anchor_span(soup, id_):
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs['id'] = id_
new_anchor_span.attrs['class'] = 'link-anchor'
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]: def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
""" """
@@ -351,23 +369,31 @@ class EpubConverter:
prepared content prepared content
""" """
dir_name = os.path.dirname(cur_file_path) dir_name = path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join( normed_path = path.normpath(path.join(
dir_name, href_in_link)).replace('\\', '/') dir_name, href_in_link)).replace("\\", "/")
full_path = [ full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path] path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path: if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
f'While processing href in {internal_link_tag}.') f"While processing href in {internal_link_tag}.")
internal_link_tag.attrs['converter-mark'] = 'bad-link' internal_link_tag.attrs["converter-mark"] = "bad-link"
return None return None
if len(full_path) > 1: if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}"
f' while {internal_link_tag} processing. The first one will be chosen.') f" while {internal_link_tag} processing. The first one will be chosen.")
return full_path[0] return full_path[0]
@staticmethod
def create_new_anchor_span(soup, id_):
new_anchor_span = soup.new_tag("span")
new_anchor_span.attrs["id"] = id_
new_anchor_span.attrs["class"] = "link-anchor"
new_anchor_span.string = "\xa0"
return new_anchor_span
def process_internal_links(self): def process_internal_links(self):
""" """
Function Function
@@ -376,8 +402,8 @@ class EpubConverter:
Steps Steps
---------- ----------
1. rebuild ids to be unique in all documents 1. rebuild ids to be unique in all documents
2a. process anchor which is a whole xhtml file 2a. process anchor which is a whole htm|html|xhtml file
2b. process anchor which is an element in xhtml file 2b. process anchor which is an element in htm|html|xhtml file
Returns Returns
------- -------
@@ -385,99 +411,128 @@ class EpubConverter:
process links in html process links in html
""" """
# 1. rebuild ids to be unique in all documents def make_ids_unique():
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get('class') == 'converter-chapter-mark': if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
continue new_id = self.create_unique_id(toc_href, tag.attrs["id"])
tag.attrs["id"] = new_id
if tag.attrs.get('class') == 'footnote-element': def process_file_anchor():
continue for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all("a",
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
a_tag_href = internal_link_tag.attrs["href"]
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(a_tag_href_matched_to_toc, "")
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs["href"]
new_id = self.create_unique_id(toc_href, tag.attrs['id']) def process_file_element_anchor():
tag.attrs['id'] = new_id for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
# process_file_element_anchor
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#")
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
else path.normpath(toc_href).replace("\\", "/")
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, a_tag_id)
# 2a. process anchor which is a whole xhtml file anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
internal_link_reg1 = re.compile( anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
r'(^(?!https?://).+\.(htm|html|xhtml)$)') anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote
for toc_href in self.hrefs_added_to_toc: if anchor_tags:
soup = self.html_href2html_body_soup[toc_href] if len(anchor_tags) > 1:
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): self.logger.log(f"Warning in {toc_href}: multiple anchors:"
a_tag_href = internal_link_tag.attrs['href'] f"{len(anchor_tags)} found.\n"
# find full path f"{anchor_tags}\n"
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( f"While processing {internal_link_tag}")
toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href'] anchor_tag = anchor_tags[0]
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
# if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs["id"] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs["id"]
del internal_link_tag.attrs["href"]
else:
internal_link_tag.attrs["converter-mark"] = "bad-link"
self.logger.log(f"Error in {toc_href}."
f" While processing {internal_link_tag} no anchor found."
f" Should be anchor with new id={new_id} in"
f" {a_tag_href_matched_to_toc} file."
f" Old id={a_tag_id}")
# 1. make ids to be unique in all documents
make_ids_unique()
# 2a. process anchor which is a whole htm|html|xhtml file
process_file_anchor()
# 2b. process anchor which is an element in htm|html|xhtml file
process_file_element_anchor()
# 2b. process anchor which is an element in xhtml file @staticmethod
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)') def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
for toc_href in self.hrefs_added_to_toc: """
soup = self.html_href2html_body_soup[toc_href] Get tags between LiveCarta chapter marks
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): Parameters
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split( ----------
'#') first_id: str
# find full path Id that point where a chapter starts. A Tag with class: "converter-chapter-mark"
if a_tag_href: href: str
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, Name of current chapters file
internal_link_tag) html_soup: Tag
else: Soup object of current file
a_tag_href_matched_to_toc = os.path.normpath(
toc_href).replace('\\', '/')
if not a_tag_href_matched_to_toc: Returns
continue -------
tags: list [Tag, NavigableString]
Chapter's tags
new_id = self.create_unique_id( """
a_tag_href_matched_to_toc, a_tag_id) marked_tags = html_soup.find(
attrs={"id": first_id, "class": "converter-chapter-mark"})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
if not isinstance(next_tag, NavigableString) and \
(next_tag.attrs.get("class") == "converter-chapter-mark"):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] # remove tags between first_id and next found id
anchor_tags = anchor_soup.find_all(attrs={'id': new_id, }) # save them in list for next steps
anchor_tags = anchor_tags or anchor_soup.find_all( tags = [tag.extract() for tag in tags]
attrs={'id': a_tag_id}) # if link is a footnote html_soup.smooth()
else:
assert 0, f"Warning: no match for {first_id, href}"
if anchor_tags: return tags
if len(anchor_tags) > 1:
self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n'
f'{anchor_tags}\n'
f' While processing {internal_link_tag}')
anchor_tag = anchor_tags[0] def detect_one_chapter(self, nav_point: NavPoint):
assert anchor_tag.attrs['id'] in [new_id, a_tag_id]
# if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs['id']
del internal_link_tag.attrs['href']
else:
internal_link_tag.attrs['converter-mark'] = 'bad-link'
self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.'
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}')
def build_one_chapter(self, nav_point: NavPoint):
""" """
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases: 3 cases:
id wraps all chapter content, id wraps all chapter content,
id wraps chapter's content + subchapters' content id wraps chapter"s content + subchapters" content
id points to the start of title of a chapter id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
@@ -494,68 +549,82 @@ class EpubConverter:
""" """
if nav_point.id: if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href] soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks( subchapter_tags = self.get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup) first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser') new_tree = BeautifulSoup("", "html.parser")
for tag in chapter_tags: for subchapter_tag in subchapter_tags:
new_tree.append(tag) new_tree.append(subchapter_tag)
self.href_chapter_id2soup_html[( self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree nav_point.href, nav_point.id)] = new_tree
if self.adjacency_list.get(nav_point): if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]: for sub_node in self.adjacency_list[nav_point]:
self.build_one_chapter(sub_node) self.detect_one_chapter(sub_node)
def define_chapters_content(self): def define_chapters_with_content(self):
"""Function build chapters content, starts from top level chapters""" """Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points: for tl_nav_point in top_level_nav_points:
self.build_one_chapter(point) self.detect_one_chapter(tl_nav_point)
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
"""
Function prepare style, tags to json structure
Parameters
----------
nav_point: NavPoint
lvl: int
level of chapter
Returns
-------
ChapterItem
built chapter
"""
title = nav_point.title title = nav_point.title
if nav_point.id: content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \
content: BeautifulSoup = self.href_chapter_id2soup_html[( if nav_point.id else self.html_href2html_body_soup[nav_point.href]
nav_point.href, nav_point.id)]
else: indent = " " * lvl
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] self.logger.log(indent + f"Chapter: {title} is processing.")
self.book_image_src_path2aws_path = update_images_src_links(content, is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
self.logger.log(indent + "Process title.")
title_preprocessed = self.html_processor.prepare_title(title)
self.logger.log(indent + "Process content.")
content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed,
self.img_href2img_bytes, self.img_href2img_bytes,
path_to_html=nav_point.href, path_to_html=nav_point.href,
access=self.access, access=self.access,
path2aws_path=self.book_image_src_path2aws_path, path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file_path.stem book_id=Path(self.book_path).stem)
if hasattr(self.file_path, 'stem') else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = [] sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter # warning! not EpubHtmlItems won't be added to chapter
# if it doesn't have subchapters
if self.adjacency_list.get(nav_point): if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]: for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node_to_livecarta_chapter_item( sub_chapter_item = self.html_node_to_livecarta_chapter_item(
sub_node, lvl + 1) sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item) sub_nodes.append(sub_chapter_item)
return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes)
if self.logger:
indent = ' ' * lvl
self.logger.log(f'{indent}Chapter: {title} is prepared.')
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self) -> dict: def convert_to_dict(self) -> dict:
"""Function which convert list of html nodes to appropriate json structure""" """Function which convert list of html nodes to appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = [] top_level_chapters = []
for nav_point in top_level_nav_points: # loop through to level chapters
chapter = self.node_to_livecarta_chapter_item(nav_point) for tl_nav_point in top_level_nav_points:
chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point)
top_level_chapters.append(chapter) top_level_chapters.append(chapter)
top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] top_level_dict_chapters = [x.to_dict() for x in top_level_chapters]
self.logger.log(f'Anchors found: {len(self.internal_anchors)}.') self.logger.log(f"Anchors found: {len(self.internal_anchors)}.")
self.logger.log('End conversion.') self.logger.log("End conversion.")
return { return {
"content": top_level_dict_chapters, "content": top_level_dict_chapters,
@@ -564,12 +633,16 @@ class EpubConverter:
if __name__ == "__main__": if __name__ == "__main__":
epub_file_path = '../../epub/9781614382264.epub' epub_file_path = "../../books/epub/9780763774134.epub"
logger_object = BookLogger( logger_object = BookLogger(
name='epub', book_id=epub_file_path.split('/')[-1]) name="epub", book_id=epub_file_path.split("/")[-1])
json_converter = EpubConverter(epub_file_path, logger=logger_object) css_processor = CSSPreprocessor()
html_processor = HtmlEpubPreprocessor(logger=logger_object)
json_converter = EpubConverter(epub_file_path, logger=logger_object,
css_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json: with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json:
json.dump(content_dict, f_json, ensure_ascii=False) json.dump(content_dict, f_json, ensure_ascii=False)

View File

@@ -1,4 +1,6 @@
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.epub_converter.css_processor import CSSPreprocessor
from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
@@ -7,15 +9,17 @@ class EpubBook(BookSolver):
def __init__(self, book_id=0, access=None, main_logger=None): def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)
self.book_type = 'epub' self.book_type = "epub"
def get_converted_book(self): def get_converted_book(self):
""" """
Function Function
Steps Steps
---------- ----------
1. Converts .epub to .html 1. Gets data from preset structure
2. Parses from line structure to nested structure 2. Add preset to html preprocessor
3. Converts .epub to .html
4. Parses from line structure to nested structure
Returns Returns
---------- ----------
@@ -23,7 +27,10 @@ class EpubBook(BookSolver):
json for LiveCarta platform json for LiveCarta platform
""" """
css_processor = CSSPreprocessor()
html_processor = HtmlEpubPreprocessor(self.preset_path, logger=self.logger_object)
json_converter = EpubConverter( json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object) self.book_path, access=self.access, logger=self.logger_object,
css_processor=css_processor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
return content_dict return content_dict

View File

@@ -0,0 +1,91 @@
import re
from typing import Tuple
from bs4 import BeautifulSoup, Tag
def _replace_with_livecarta_anchor_tag(anchor, i):
"""Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
new_tag["class"] = "footnote-element"
new_tag["data-id"] = i + 1
new_tag["id"] = f"footnote-{i + 1}"
new_tag.string = "*"
if anchor.parent.name == "sup":
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \
-> Tuple[list, list, list]:
"""
This function preprocessing footnotes
This function should be earlier that adding fonts in pipeline.
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
"""
footnotes, new_noterefs_tags, new_footnotes_tags = [], [], []
noterefs_tags = source_html_tag.find_all(
attrs={noteref_attr_name: "noteref"})
bad_noterefs_tags = set(
[tag for tag in noterefs_tags if not tag.attrs.get("href")])
noterefs_tags = [
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
[tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str) -> Tuple[str, str]:
"""Returns name of file & id of an anchor"""
assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id."
f, id_ = s.split("#")
return f, id_
def verify_footnote_tag(tags: list):
"""Function verifies is tag - footnote"""
assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}"
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(
f"Warning. Href for tag is detected as footnote:\n{noteref_tag}")
return anchored_tags
else:
assert 0, f"Error, No element with id: {href} found."
return tags
for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs["href"]
file, element_id = parse_a_tag_href(href)
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(
f"Error while footnotes processing. For {noteref_tag} invalid path: {file}.")
continue
possible_footnote = "note|footnote|endnote|rearenote"
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
attrs={"epub:type": re.compile(possible_footnote)}))
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "docs-endnote":
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(
_replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(
attrs={"role": "docs-backlink"}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):
noteref.attrs["data-id"] = i + 1
noteref.attrs["id"] = f"footnote-{i + 1}"
footnote.attrs["href"] = f"#footnote-{i + 1}"
return footnotes, new_noterefs_tags, new_footnotes_tags

View File

@@ -1,666 +0,0 @@
import os
import re
import pathlib
from typing import Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from src.access import Access
from src.livecarta_config import LiveCartaConfig
def _replace_with_livecarta_anchor_tag(anchor, i):
"""Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
new_tag['id'] = f'footnote-{i + 1}'
new_tag.string = '*'
if anchor.parent.name == 'sup':
anchor.parent.unwrap()
anchor.replace_with(new_tag)
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]:
"""
This function preprocessing footnotes
This function should be earlier that adding fonts in pipeline.
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(
attrs={noteref_attr_name: 'noteref'})
bad_noterefs_tags = set(
[tag for tag in noterefs_tags if not tag.attrs.get('href')])
noterefs_tags = [
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
new_noterefs_tags = []
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str) -> Tuple[str, str]:
"""Returns name of file & id of an anchor"""
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
f, id_ = s.split('#')
return f, id_
def verify_footnote_tag(tags: list):
"""Function verifies is tag - footnote"""
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(
f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
return anchored_tags
else:
assert 0, f'Error, No element with id: {href} found.'
return tags
for i, noteref_tag in enumerate(noterefs_tags):
href = noteref_tag.attrs['href']
file, element_id = parse_a_tag_href(href)
if not file:
target_html_tag = source_html_tag
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(
f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
continue
possible_footnote = 'note|footnote|endnote|rearenote'
expected_footnote_tags = list(target_html_tag.find_all(id=element_id,
attrs={'epub:type': re.compile(possible_footnote)}))
expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(
_replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(
attrs={'role': 'doc-backlink'}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
    """
    Main function that works with structure of html. Make changes inplace.

    Parameters
    ----------
    body_tag: Tag, soup object

    Steps
    ----------
    1. Extracts tags that are not needed
    2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
    Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
    This tag must have a body_tag as a parent.
    Otherwise, it is wrapped with some tags. Like:
    <p> <span id='123', class='converter-chapter-mark'> </span> </p>
    3. Headings that are not supported by livecarta converts to <p>
    4. Wrapping NavigableString

    Returns
    -------
    body_tag: Tag, BeautifulSoup
        adjusted body_tag
    """
    def _preserve_class_in_aside_tag(tag_):
        """to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
        # this is for Wiley books with boxes
        # NOTE(review): this helper is never called inside unwrap_structural_tags
        tag_class = tag_.attrs['class'] if not isinstance(
            tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if tag_.parent.name == 'aside':
            if not tag_.parent.attrs.get('class'):
                tag_.parent.attrs['class'] = tag_class

    def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
        """
        Function saves css style inherited from class, copies class to child <p>
        returns True, if <section> could be unwrapped

        Parameters
        ----------
        tag_: Tag, soup object

        Returns
        -------
        bool
        """
        # this is for Wiley books with boxes
        tag_class = tag_.attrs['class'] if not isinstance(
            tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if 'feature' not in tag_class:
            return True
        child_p_tags = tag_.find_all("p")
        if len(child_p_tags) == 1:
            # single paragraph: push the feature class down to it
            child_p_tag = child_p_tags[0]
            if not child_p_tag.attrs.get('class'):
                child_p_tag.attrs['class'] = tag_class
            return True
        elif len(child_p_tags) > 1:
            # several paragraphs: keep the section itself as a <p> wrapper
            tag_.name = 'p'
            return False
        else:
            return True

    def _add_span_to_save_ids_for_links(tag_to_be_removed):
        # keep internal link targets alive: copy the id onto a placeholder
        # <span> before the tag itself is unwrapped/removed
        if tag_to_be_removed.attrs.get('id'):
            _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                               id_=tag_to_be_removed.attrs['id'],
                                               class_=tag_to_be_removed.attrs.get('class'))

    def _replace_div_tag_with_table():
        """
        Function replace <div> with <table>:
        1. Convert div with certain classes to tables
        2. Add background color to div with background-color
        """
        for div in body_tag.find_all("div"):
            if div.attrs.get('class'):
                div_class = div.attrs['class'] if not isinstance(
                    div.attrs['class'], list) else div.attrs['class'][0]
                # publisher-specific box classes — presumably Wiley styles; TODO confirm
                if div_class in ['C409', 'C409a']:
                    _wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
                elif div_class in ['C441', 'C816']:
                    _wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
            if div.attrs.get('style'):
                if 'background-color' in div.attrs['style']:
                    # slice out the 7-char colour value ('#rrggbb') that
                    # follows 'background-color: ' in the inline style
                    end_index = div.attrs['style'].find(
                        'background-color') + len('background-color')
                    start_index_of_color = end_index + 2
                    bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
                    _wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
            elif div.attrs.get('style') == '':
                del div.attrs['style']
            structural_tags_names = [
                'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
                'figure', 'footer', 'iframe', 'span', 'p'
            ]
            if div.contents:
                is_not_struct_tag = [
                    child.name not in structural_tags_names for child in div.contents]
                if all(is_not_struct_tag):
                    # leaf-like div (no structural children): keep it as a <p>
                    div.name = 'p'
                    continue
            _add_span_to_save_ids_for_links(div)
            div.unwrap()

    def _heading_tag_to_p_tag(body_tag):
        """Function to convert all lower level headings to p tags"""
        # e.g. SUPPORTED_LEVELS == 3 yields the pattern '^h[4-9]$'
        pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
        header_tags = body_tag.find_all(re.compile(pattern))
        for tag in header_tags:
            tag.name = 'p'

    # comments removal
    for tag in body_tag.find_all():
        for element in tag(text=lambda text: isinstance(text, Comment)):
            element.extract()
    _replace_div_tag_with_table()
    # unwrap/convert structural containers, preserving ids for links
    for s in body_tag.find_all("section"):
        could_be_unwrapped = True
        if s.attrs.get('class'):
            could_be_unwrapped = _preserve_class_in_section_tag(s)
        _add_span_to_save_ids_for_links(s)
        if could_be_unwrapped:
            s.unwrap()
    for s in body_tag.find_all("article"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("figure"):
        s.name = 'p'
        # to center image inside this tag
        s.attrs['style'] = "text-align: center;"
    for s in body_tag.find_all("figcaption"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("aside"):
        s.name = 'blockquote'
    for s in body_tag.find_all("main"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("body"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("html"):
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
    for s in body_tag.find_all("header"):
        s.name = 'span'
    # check marks for chapter starting are on the same 1 level
    marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'})
    parents_marks_are_body = [x.parent == body_tag for x in marks]
    # fix marks to be on 1 level
    if not all(parents_marks_are_body):
        for x in marks:
            while x.parent != body_tag:
                x.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases
        parents_marks_are_body = [x.parent == body_tag for x in marks]
    assert all(
        parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
    _heading_tag_to_p_tag(body_tag)
    # wrap NavigableString with <p>
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
            content = re.sub(r'([\n\t\xa0])', ' ', content)
            content = content.strip()
            if content:
                tag = body_tag.new_tag('p')
                tag.append(str(node))
                node.replace_with(tag)
    return body_tag
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
    """After processing on a first_id that corresponds to current chapter,
    from initial html_soup all tags from current chapter are extracted

    Parameters
    ----------
    first_id:
        Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
    href:
        Name of current chapter's file
    html_soup: Tag
        Soup object of current file

    Returns
    -------
    tags: list [Tag, NavigableString]
        Chapter's tags

    Raises
    ------
    AssertionError
        If no tag with ``first_id`` and the chapter-mark class exists in the soup.
    """
    marked_tags = html_soup.find(
        attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if not marked_tags:
        # raise explicitly: the former `assert 0, ...` vanished under `python -O`
        raise AssertionError(f'Warning: no match for {first_id, href}')
    tags = []
    next_tag = marked_tags.next_sibling
    while next_tag:
        if not isinstance(next_tag, NavigableString):
            # bs4 stores a parsed multi-valued `class` attribute as a list,
            # while converter-created marks carry it as a plain string; the
            # old `== 'converter-chapter-mark'` comparison never matched the
            # list form, so the walk could run past the next chapter mark.
            classes = next_tag.attrs.get('class') or []
            if isinstance(classes, str):
                classes = [classes]
            if 'converter-chapter-mark' in classes:
                break
        tags.append(next_tag)
        next_tag = next_tag.next_sibling
    # remove tags between first_id and next found id
    # save them in list for next steps
    tags = [tag.extract() for tag in tags]
    html_soup.smooth()
    return tags
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """Upload one image to AWS via the Access client and return its link."""
    return access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images locally

    Writes ``img_content`` into ``../json/img_<book_id>/<basename>`` relative
    to the project root and returns the resulting path.

    Parameters
    ----------
    img_file_path: str
        Source path of the image; only its basename is reused.
    img_content: bytes
        Raw image bytes to write.
    book_id: str
        Book identifier used to name the per-book image folder.

    Returns
    -------
    pathlib.Path
        Path of the written image file.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f'../json/img_{book_id}/'))
    # parents=True: also create the intermediate 'json' folder on first run
    new_path.mkdir(parents=True, exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager: the old bare open() leaked the handle if write() raised
    with open(new_img_path, 'wb+') as img_file:
        img_file.write(img_content)
    return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
                            href2img_content: dict,
                            path_to_html: str,
                            access=None,
                            path2aws_path: dict = None,
                            book_id: str = None) -> dict:
    """Function makes dictionary image_src_path -> Amazon web service_path

    Rewrites every <img src> inside ``body_tag`` to either its uploaded AWS
    location (when ``access`` is given) or a local copy, and strips
    width/height/style attributes so LiveCarta controls image presentation.

    Parameters
    ----------
    body_tag: BeautifulSoup
        Chapter soup whose <img> tags are rewritten in place.
    href2img_content: dict
        Manifest map of image path (from EPUB root) -> raw bytes.
    path_to_html: str
        Path of the HTML file the images are referenced from.
    access:
        Optional AWS client; when None, images are saved locally.
    path2aws_path: dict
        Cache of already-uploaded image paths (mutated in place).
    book_id: str
        Book identifier used for the upload/local folder name.

    Returns
    -------
    dict
        The (possibly updated) ``path2aws_path`` cache.
    """
    img_tags = body_tag.find_all('img')
    for img in img_tags:
        path_to_img_from_html = img.attrs.get('src')
        html_folder = os.path.dirname(path_to_html)
        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace('\\', '/')
        assert path_to_img_from_root in href2img_content, \
            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
        img_content = href2img_content[path_to_img_from_root]
        if access is not None:
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
                new_folder = save_image_to_aws(
                    access, path_to_img_from_root, img_content, book_id)
                path2aws_path[path_to_img_from_root] = new_folder
        else:
            # bug fix: pass the actual book_id variable, not the literal
            # string 'book_id' (all local books landed in one folder before)
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, book_id)
        img.attrs['src'] = str(new_folder)
        # drop presentation attributes so LiveCarta styles images itself
        for attr in ('width', 'height', 'style'):
            if img.attrs.get(attr):
                del img.attrs[attr]
    return path2aws_path
def _clean_title_from_numbering(title: str):
"""Function removes numbering from titles"""
title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
def prepare_title(title_of_chapter: str) -> str:
    """Finalise a chapter title: strip markup, collapse whitespace, clean numbering."""
    raw_text = BeautifulSoup(title_of_chapter, features='lxml').string
    collapsed = re.sub(r'([\n\t\xa0])', ' ', raw_text)
    collapsed = re.sub(r' +', ' ', collapsed).rstrip()
    return _clean_title_from_numbering(collapsed)
def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
    """Insert a placeholder <span> (carrying id/class) before a tag LiveCarta drops."""
    placeholder = main_tag.new_tag("span")
    placeholder.attrs.update({'id': id_ or '', 'class': class_ or ''})
    placeholder.string = "\xa0"
    tag.insert_before(placeholder)
def _clean_headings_content(content: BeautifulSoup, title: str):
    """Remove a heading that duplicates the chapter title from *content* (inplace).

    Ids carried by a removed node (or its descendants) are preserved via
    placeholder <span> tags so internal links keep working.
    """
    def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
        # copy the ids of the tag and of all its descendants onto placeholders
        if tag_to_be_removed.attrs.get('id'):
            _insert_span_with_attrs_before_tag(body_tag,
                                               tag_to_be_removed,
                                               id_=tag_to_be_removed.attrs.get(
                                                   'id'),
                                               class_=tag_to_be_removed.attrs.get('class'))
        for sub_tag in tag_to_be_removed.find_all():
            if sub_tag.attrs.get('id'):
                _insert_span_with_attrs_before_tag(body_tag,
                                                   tag_to_be_removed,
                                                   id_=sub_tag.attrs['id'],
                                                   class_=sub_tag.attrs.get('class'))
    title = title.lower()
    for child in content.contents:
        if isinstance(child, NavigableString):
            text = child
        else:
            text = child.text
        if text and re.sub(r'([\n\t\xa0])', '', text):
            # normalise whitespace before comparing against the title
            text = re.sub(r'([\n\t\xa0])', ' ', text)
            text = re.sub(r' +', ' ', text).strip()
            text = text.lower()
            if title == text:
                # exact duplicate of the title: drop it, keep scanning
                add_span_to_save_ids_for_links(child, content)
                child.extract()
            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
                # heading containing the title: drop it and stop
                add_span_to_save_ids_for_links(child, content)
                child.extract()
                break
    # NOTE(review): extract() while iterating content.contents mutates the
    # list being traversed, so a sibling after a removed node may be skipped
    # — confirm this is acceptable for the inputs seen in practice.
def _process_lists(body_tag: BeautifulSoup):
    """
    Function
    - process tags <li>.
    - unwrap <p> tags.

    Parameters
    ----------
    body_tag: Tag, soup object

    Returns
    -------
    None
    """
    for list_item in body_tag.find_all("li"):
        inner_paragraph = list_item.p
        if inner_paragraph is None:
            continue
        # hoist the paragraph's attributes onto the <li>, then drop the <p>
        list_item.attrs.update(inner_paragraph.attrs)
        inner_paragraph.unwrap()
def _preprocess_table(body_tag: BeautifulSoup):
    """Function to preprocess tables and tags(td|th|tr): style"""
    cell_or_row = re.compile("td|th|tr")
    for table in body_tag.find_all("table"):
        for cell in table.find_all(cell_or_row):
            inline_style = cell.get('style')
            width = ''
            if inline_style:
                # lift an explicit pixel/point width out of the inline style
                width_match = re.search(
                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", inline_style)
                if width_match:
                    width = width_match.group(1) + 'px'
            cell.attrs['width'] = cell.get('width') or width
            current_style = cell.attrs.get('style')
            if current_style:
                cell.attrs['style'] = current_style.replace('border:0;', '')
            elif current_style == '':
                del cell.attrs['style']
        # force a visible border unless the table already declares one
        if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
            table.attrs['border'] = '1'
def _preprocess_code_tags(chapter_tag: BeautifulSoup):
    """
    Function
    - transform <code>, <kdb>, <var> tags into span
    - add code style to this tags

    Parameters
    ----------
    chapter_tag: Tag, soup object

    Returns
    -------
    None
    """
    for code_like in chapter_tag.find_all(re.compile("code|kbd|var")):
        if code_like.parent.name != "pre":
            # outside <pre> the tag is demoted to a plain <span>
            code_like.name = "span"
        elif not code_like.attrs.get('style'):
            # inside <pre>: give the tag a default monospace style if it has none
            code_like.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
def _prepare_formatted(text: str) -> str:
"""Function replaces special symbols with their Unicode representation"""
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
text = text.replace(' ', "\xa0")
text = text.replace('𝑓', "\xf0\x9d\x91\x93")
return text
def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
    """
    Function preprocessing <pre> tags
    Wrap string of the tag with <code> if it's necessary

    Parameters
    ----------
    chapter_tag: Tag, soup object

    Returns
    ----------
    None
        Modified chapter tag
    """
    code_like = re.compile("code|kbd|var")
    for pre in chapter_tag.find_all("pre"):
        # bug fix: find_all("code|kbd|var") searched for a literal tag named
        # "code|kbd|var" and never matched, so every <pre> was re-wrapped even
        # when it already contained a <code>; a compiled regex matches any of
        # the three tag names (same pattern as _preprocess_code_tags).
        if pre.find_all(code_like):
            continue
        code = chapter_tag.new_tag("code")
        # insert all items that was in pre to code and remove from pre
        for content in reversed(pre.contents):
            code.insert(0, content.extract())
        # wrap code with items
        pre.append(code)
def _clean_wiley_block(block):
    """Drop horizontal-rule paragraphs and demote the first heading to <p>."""
    for rule_paragraph in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
        rule_paragraph.extract()
    heading = block.find(re.compile("h[1-9]"))
    if heading is not None:
        heading.name = "p"
        heading.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """Function wraps <block> with <table>"""
    table = main_tag.new_tag("table")
    table.attrs['border'] = border
    table.attrs['align'] = 'center'
    table.attrs['style'] = f'width:{width}%;'
    cell = main_tag.new_tag("td")
    # td.attrs['border-radius'] = '8px'
    if bg_color:
        cell.attrs['bgcolor'] = bg_color
    old_tag.wrap(cell)
    # build the skeleton outward: td -> tr -> tbody -> table
    # (Tag.wrap returns the wrapper, so the calls chain)
    cell.wrap(main_tag.new_tag("tr")).wrap(main_tag.new_tag("tbody")).wrap(table)
    table.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
    return table
def _preprocess_block_tags(chapter_tag: Tag):
    """Function preprocessing <block> tags

    Wiley 'feature' boxes (blockquotes and paragraphs) are cleaned and
    wrapped in shaded tables.
    """
    def _feature_bg_color(tag):
        """Map Wiley 'feature1'/'feature2' classes to their box colors.

        bs4 stores a parsed `class` attribute as a list, while converter code
        sometimes assigns it as a plain string; normalise before comparing.
        (The old `attrs.get('class') == 'feature1'` never matched the list
        form, so the background color was silently always None.)
        """
        classes = tag.attrs.get('class') or []
        if isinstance(classes, str):
            classes = [classes]
        if 'feature1' in classes:
            return '#DDDDDD'
        if 'feature2' in classes:
            return '#EEEEEE'
        return None

    for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
        _clean_wiley_block(block)
        _wrap_block_tag_with_table(
            chapter_tag, block, bg_color=_feature_bg_color(block))
        block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
        block.unwrap()
    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
        _clean_wiley_block(future_block)
        _wrap_block_tag_with_table(
            chapter_tag, future_block, bg_color=_feature_bg_color(future_block))
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content

    Parameters
    ----------
    title_str: str
        Chapter title, used to strip a duplicated heading from the content
    content_tag: Tag, soup object
    remove_title_from_chapter: bool

    Steps
    ----------
    1. heading removal
    2. processing tags
    3. class removal

    Returns
    -------
    content_tag: str
        prepared content
    """
    # NOTE: a previous revision collected whitespace-only NavigableStrings
    # into a `to_remove` list that was never used; that dead code is removed.
    # 1. heading removal
    if remove_title_from_chapter:
        _clean_headings_content(content_tag, title_str)
    # 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
    _process_lists(content_tag)
    _preprocess_table(content_tag)
    _preprocess_code_tags(content_tag)
    _preprocess_pre_tags(content_tag)
    _preprocess_block_tags(content_tag)
    # 3. class removal — keep only the classes the converter itself assigns
    # (those are assigned as plain strings, so the string comparison below
    # intentionally matches the converter's own markers)
    for tag in content_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                'footnote-element']):
            del tag.attrs['class']
    return str(content_tag)

View File

@@ -0,0 +1,426 @@
import re
import json
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from src.util.helpers import BookLogger
class HtmlEpubPreprocessor:
    """Preset-driven preprocessor for EPUB chapter HTML.

    Transformation rules are loaded from a JSON preset file; each rule's
    ``preset_name`` selects one of the methods in ``name2function`` and
    :meth:`prepare_content` applies them in file order to a chapter soup.
    """

    def __init__(self, preset_path="../../presets/presets.json", logger=None):
        # context manager closes the preset file (the bare open() leaked it)
        with open(preset_path) as preset_file:
            self.preset = json.load(preset_file)
        self.logger: BookLogger = logger
        # rule name from the preset file -> method implementing the rule
        self.name2function = {
            "table_wrapper": self._wrap_tags_with_table,
            "replacer": self._tags_to_correspond_livecarta_tag,
            "attr_replacer": self._replace_attrs_in_tags,
            "unwrapper": self._unwrap_tags,
            "inserter": self._insert_tags_into_correspond_tags
        }

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)

        Parameters
        ----------
        tag_to_be_removed: Soup object
        chapter_tag: BeautifulSoup

        Returns
        -------
        None
            updated body tag
        """
        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
                                               class_: list):
            """Function inserts span before tag aren't supported by LiveCarta"""
            new_tag = chapter_tag.new_tag("span")
            new_tag.attrs["id"] = id_ or ""
            new_tag.attrs["class"] = class_ or ""
            new_tag.string = "\xa0"
            tag_to_be_removed.insert_before(new_tag)

        if tag_to_be_removed.attrs.get("id"):
            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
                                               id_=tag_to_be_removed.attrs["id"],
                                               class_=tag_to_be_removed.attrs.get("class"))

    @staticmethod
    def prepare_title(title_of_chapter: str) -> str:
        """
        Function finalise processing/cleaning title

        Parameters
        ----------
        title_of_chapter: str

        Returns
        -------
        title: str
            cleaned title
        """
        title = BeautifulSoup(title_of_chapter, features="lxml").string
        # clean extra whitespace characters ([\r\n\t\f\v ])
        title = re.sub(r"[\s\xa0]", " ", title).strip()
        return title

    @staticmethod
    def _remove_comments(chapter_tag: BeautifulSoup):
        """
        Function remove comments

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag without comments
        """
        for tag in chapter_tag.find_all():
            for element in tag(text=lambda text: isinstance(text, Comment)):
                element.extract()

    @staticmethod
    def _wrap_strings_with_p(chapter_tag: BeautifulSoup):
        """
        Function wraps bare NavigableStrings (non-empty after whitespace
        normalisation) with a <p> tag

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag with wrapped NavigableStrings
        """
        for node in chapter_tag:
            if isinstance(node, NavigableString):
                content = str(node)
                content = re.sub(r"([\s\xa0])", " ", content).strip()
                if content:
                    p_tag = chapter_tag.new_tag("p")
                    p_tag.append(str(node))
                    node.replace_with(p_tag)

    def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list):
        """
        Function wraps <tag> with <table>

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: list
            Preset rules; each names the tags/attrs to wrap

        Returns
        -------
        None
            Chapter Tag with wrapped certain tags with <table>
        """
        def _wrap_tag_with_table(width="100", border="", bg_color=None):
            table = chapter_tag.new_tag("table")
            table.attrs["border"], table.attrs["align"], table.attrs["style"] \
                = border, "center", f"width:{width}%;"
            tbody, tr, td = \
                chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
            # only set bgcolor when a colour exists; assigning None produced
            # a valueless `bgcolor` attribute in the serialised HTML
            if bg_color is not None:
                td.attrs["bgcolor"] = bg_color
            tag_to_wrap.wrap(td)
            td.wrap(tr)
            tr.wrap(tbody)
            tbody.wrap(table)
            table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
            return table

        def process_tag_using_table():
            # carry the wrapped tag's presentation attributes onto the table
            _wrap_tag_with_table(
                width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100",
                border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None,
                bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None)
            self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag)
            tag_to_wrap.unwrap()

        for rule in rules:
            tags = rule["tags"]
            for attr in rule["attrs"]:
                for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                        {attr["name"]: re.compile(fr"{attr['value']}")}):
                    process_tag_using_table()

    @staticmethod
    def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list):
        """
        Function to replace all tags to correspond LiveCarta tags

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: list
            Preset rules; each names the tags, the replacement tag and an
            optional condition (parent_tags / child_tags / attrs)

        Returns
        -------
        None
            Chapter Tag with all tags replaced with LiveCarta tags
        """
        for rule in rules:
            tags = rule["tags"]
            tag_to_replace = rule["tag_to_replace"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == 'parent_tags':
                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                            if tag.parent.select(condition_on_tag[1]):
                                tag.name = tag_to_replace
                    elif condition_on_tag[0] == 'child_tags':
                        # the condition is a ':not(...)' selector; strip the
                        # pseudo-class syntax and require the child's absence
                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                            if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
                                tag.name = tag_to_replace
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
                                tag.name = tag_to_replace
            else:
                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                    # todo can cause appearance of \n <p><p>...</p></p> -> <p>\n</p> <p>...</p> <p>\n</p> (section)
                    tag.name = tag_to_replace

    @staticmethod
    def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list):
        """
        Function renames an attribute on matching tags (e.g. data-x -> x)

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: list
            Preset rules; each names the attribute, its replacement name and
            the tags to apply it to

        Returns
        -------
        None
            Chapter Tag with replaced attributes
        """
        for rule in rules:
            attr = rule["attr"]
            tags = rule["condition"]["tags"]
            attr_to_replace = rule["attr_to_replace"]
            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                            {attr: re.compile(r".*")}):
                tag[attr_to_replace] = tag[attr]
                del tag[attr]

    def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict):
        """
        Function unwrap tags and moves id to span

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: dict
            Preset rules; "tags" lists CSS selectors of tags to unwrap

        Returns
        -------
        None
            Chapter Tag with unwrapped certain tags
        """
        for tag_name in rules["tags"]:
            for tag in chapter_tag.select(tag_name):
                # if tag is a subtag (selector like "a>b"), hoist its attrs
                if ">" in tag_name:
                    tag.parent.attrs.update(tag.attrs)
                self._add_span_to_save_ids_for_links(tag, chapter_tag)
                tag.unwrap()

    @staticmethod
    def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list):
        """
        Function inserts tags into correspond tags

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag
        rules: list
            Preset rules; each names the target tags, the tag to insert and an
            optional condition (parent_tags / child_tags / attrs)

        Returns
        -------
        None
            Chapter Tag with inserted tags
        """
        def insert(tag):
            tag_to_insert = \
                chapter_tag.new_tag(rule["tag_to_insert"])
            # insert all items that was in tag to subtag and remove from tag
            for content in reversed(tag.contents):
                tag_to_insert.insert(0, content.extract())
            # wrap subtag with items
            tag.append(tag_to_insert)

        for rule in rules:
            tags = rule["tags"]
            if rule["condition"]:
                for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
                    if condition_on_tag[0] == 'parent_tags':
                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                            if tag.parent.select(condition_on_tag[1]):
                                insert(tag)
                    elif condition_on_tag[0] == 'child_tags':
                        for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                            if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])):
                                insert(tag)
                    elif condition_on_tag[0] == "attrs":
                        for attr in rule["condition"]["attrs"]:
                            for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
                                                            {attr["name"]: re.compile(fr"{attr['value']}")}):
                                insert(tag)
            else:
                for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
                    insert(tag)

    def _remove_headings_content(self, chapter_tag, title_of_chapter: str):
        """
        Function
        - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content
        - adds span with id in order to keep internal links working

        Parameters
        ----------
        chapter_tag: soup object
            Tag of the page
        title_of_chapter: str
            Chapter title

        Returns
        -------
        None
            clean/remove headings & add span with id
        """
        # (a debug-only `if title_of_chapter == "chapter 1": pass` leftover
        # was removed here)
        title_of_chapter = title_of_chapter.lower()
        for tag in chapter_tag.contents:
            text = tag if isinstance(tag, NavigableString) else tag.text
            if re.sub(r"[\s\xa0]", "", text):
                text = re.sub(r"[\s\xa0]", " ", text).lower()
                text = text.strip()  # delete extra spaces
                if not isinstance(tag, NavigableString):
                    if title_of_chapter == text or \
                            (title_of_chapter in text and
                             re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
                        self._add_span_to_save_ids_for_links(tag, chapter_tag)
                        tag.extract()
                        return
                    # NOTE(review): this method always returns None, so the
                    # `not ...` below is always true and the loop breaks after
                    # the first recursive descent — confirm this is intended.
                    elif not self._remove_headings_content(tag, title_of_chapter):
                        break
                else:
                    tag.extract()
                    return

    @staticmethod
    def _process_tables(chapter_tag: BeautifulSoup):
        """
        Function preprocesses tables and tags(td|th|tr)

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag with processed tables
        """
        tables = chapter_tag.find_all("table")
        for table in tables:
            for t_tag in table.find_all(re.compile("td|th|tr")):
                width = ""
                if t_tag.get("style"):
                    width_match = re.search(
                        r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
                    if width_match:
                        size = width_match.group(1)
                        width = size + "px"
                t_tag.attrs["width"] = t_tag.get("width") or width
                if t_tag.attrs.get("style"):
                    t_tag.attrs["style"] = t_tag.attrs["style"].replace(
                        "border:0;", "")
                    # drop the attribute if only whitespace remains; guarded
                    # inside this branch so re.sub never receives None
                    if re.sub(r"[\s\xa0]", "", t_tag.attrs["style"]) == "":
                        del t_tag.attrs["style"]
            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
                table.attrs["border"] = "1"

    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):
        """
        Function removes classes that aren't created by converter

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            Chapter Tag without original classes of the book
        """
        for tag in chapter_tag.find_all(recursive=True):
            if tag.attrs.get("class") \
                    and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
                del tag.attrs["class"]

    def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
        """
        Function finalise processing/cleaning content

        Parameters
        ----------
        title_str: str
        content_tag: Tag, soup object
        remove_title_from_chapter: bool

        Steps
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
        3-6. wrap tags with <table>
             replace tags with correspond LiveCarta tags
             unwrap tags
             insert tags into correspond tags
        7. heading removal
        8. process_tables
        9. class removal

        Returns
        -------
        content_tag: Tag
            prepared content
        """
        # 1. remove comments
        self._remove_comments(content_tag)
        # 2.
        self._wrap_strings_with_p(content_tag)
        # 3-6. apply the preset-driven transformations in file order
        # (loop variable renamed from `dict`, which shadowed the builtin)
        for preset_rule in self.preset:
            func = self.name2function[preset_rule["preset_name"]]
            func(content_tag, preset_rule['rules'])
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(content_tag, title_str)
        # 8.
        self._process_tables(content_tag)
        # 9. remove classes that weren't created by converter
        self._class_removing(content_tag)
        return content_tag

View File

@@ -0,0 +1,65 @@
import os
import pathlib
from bs4 import BeautifulSoup
from src.access import Access
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """Upload one image to AWS via the Access client and return its link."""
    return access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images locally

    Writes ``img_content`` into ``../books/json/img_<book_id>/<basename>``
    relative to the project root and returns the resulting path.

    Parameters
    ----------
    img_file_path: str
        Source path of the image; only its basename is reused.
    img_content: bytes
        Raw image bytes to write.
    book_id: str
        Book identifier used to name the per-book image folder.

    Returns
    -------
    pathlib.Path
        Path of the written image file.
    """
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
        folder_path, f"../books/json/img_{book_id}/"))
    # parents=True: also create intermediate 'books/json' folders on first run
    new_path.mkdir(parents=True, exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
    # context manager: the old bare open() leaked the handle if write() raised
    with open(new_img_path, "wb+") as img_file:
        img_file.write(img_content)
    return new_img_path
def update_images_src_links(body_tag: BeautifulSoup,
                            img_href2img_content: dict,
                            path_to_html: str,
                            access=None,
                            path2aws_path: dict = None,
                            book_id: str = None) -> dict:
    """Function makes dictionary image_src_path -> Amazon web service_path

    Rewrites every <img src> to its uploaded AWS location (when *access* is
    given) or to a local copy, and strips width/height/style attributes.
    """
    html_folder = os.path.dirname(path_to_html)
    for img in body_tag.find_all("img"):
        path_to_img_from_html = img.attrs.get("src")
        path_to_img_from_root = os.path.normpath(
            os.path.join(html_folder, path_to_img_from_html)).replace("\\", "/")
        assert path_to_img_from_root in img_href2img_content, \
            f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest."
        img_content = img_href2img_content[path_to_img_from_root]
        if access is None:
            new_folder = save_image_locally(
                path_to_img_from_root, img_content, book_id)
        elif path_to_img_from_root in path2aws_path:
            # already uploaded earlier: reuse the cached AWS location
            new_folder = path2aws_path[path_to_img_from_root]
        else:
            new_folder = save_image_to_aws(
                access, path_to_img_from_root, img_content, book_id)
            path2aws_path[path_to_img_from_root] = new_folder
        img.attrs["src"] = str(new_folder)
        # drop presentation attributes so LiveCarta styles images itself
        for presentation_attr in ("width", "height", "style"):
            if img.attrs.get(presentation_attr):
                del img.attrs[presentation_attr]
    return path2aws_path

View File

@@ -4,61 +4,62 @@ from typing import List
from logging import CRITICAL from logging import CRITICAL
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from premailer import transform
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS
cssutils.log.setLevel(CRITICAL) cssutils.log.setLevel(CRITICAL)
class TagStyleConverter: class TagInlineStyleProcessor:
def __init__(self, tag_inline_style): def __init__(self, tag_inline_style):
# tag with inline style + style parsed from css file # tag with inline style + style parsed from css file
self.tag_inline_style = tag_inline_style self.tag_inline_style = tag_inline_style
self.style = self.process_inline_style() self.tag_inline_style.attrs['style'] = self.process_inline_style()
@staticmethod @staticmethod
def remove_white_if_no_bgcolor(style_, tag): def remove_white_if_no_bgcolor(style_, tag):
"""Function remove text white color if there is no bg color""" """Function remove text white color if there is no bg color"""
if 'background' in style_: if "background" in style_:
style_ = style_.replace( style_ = style_.replace(
'background:', 'background-color:') "background:", "background-color:")
return style_ return style_
# if text color is white, check that we have bg-color # if text color is white, check that we have bg-color
if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_):
# if bg color is inherited, just return style as is # if bg color is inherited, just return style as is
for parent_tag in tag.parents: for parent_tag in tag.parents:
# white bg color not need to be checked as we do not write 'white bg color' # white bg color not need to be checked as we do not write "white bg color"
tag_with_bg = ['span', 'td', 'tr', 'p'] tag_with_bg = ["span", "td", "tr", "p"]
tag_will_be_saved = parent_tag.name in tag_with_bg tag_will_be_saved = parent_tag.name in tag_with_bg
has_bg = parent_tag.attrs.get('style') and ( has_bg = parent_tag.attrs.get("style") and (
'background' in parent_tag.attrs.get('style')) "background" in parent_tag.attrs.get("style"))
if has_bg and tag_will_be_saved: if has_bg and tag_will_be_saved:
return style_ return style_
children = tag.find_all() children = tag.find_all()
for child in children: for child in children:
if child.attrs.get('style') and ('background' in child.attrs.get('style')): if child.attrs.get("style") and ("background" in child.attrs.get("style")):
tmp_style = child.attrs['style'] + '; color:#fff; ' tmp_style = child.attrs["style"] + "; color:#fff; "
child.attrs['style'] = tmp_style child.attrs["style"] = tmp_style
# for child with bg color we added white text color, so this tag don't need white color # for child with bg color we added white text color, so this tag don"t need white color
style_ = style_.replace('color:#fff;', '') style_ = style_.replace("color:#fff;", "")
style_ = style_.replace('color:#ffffff;', '') style_ = style_.replace("color:#ffffff;", "")
style_ = style_.replace('color:white;', '') style_ = style_.replace("color:white;", "")
return style_ return style_
@staticmethod # @staticmethod
def duplicate_styles_check(split_style: list) -> list: # def duplicate_styles_check(split_style: list) -> list:
style_name2style_value = {} # style_name2style_value = {}
for list_item in split_style: # # {key: val for for list_item in split_style}
key, val = list_item.split(":") # splitstrs = (list_item.split(":") for list_item in split_style)
if val not in style_name2style_value.keys(): # d = {key: val for key, val in splitstrs}
style_name2style_value[key] = val # for list_item in split_style:
split_style = [k + ":" + v for k, v in style_name2style_value.items()] # key, val = list_item.split(":")
return split_style # if key not in style_name2style_value.keys():
# style_name2style_value[key] = val
# split_style = [k + ":" + v for k, v in style_name2style_value.items()]
# return split_style
@staticmethod @staticmethod
def indents_processing(split_style: list) -> str: def indents_processing(split_style: list) -> str:
@@ -68,7 +69,7 @@ class TagStyleConverter:
Parameters Parameters
---------- ----------
split_style: list split_style: list
list of styles split by ';' list of styles split by ";"
Returns Returns
---------- ----------
@@ -76,12 +77,12 @@ class TagStyleConverter:
processed style with counted indent processed style with counted indent
""" """
processed_style = ";".join(split_style) processed_style = ";".join(split_style)+';'
margin_left_regexp = re.compile( margin_left_regexp = re.compile(
r'((margin-left|margin): *(-*\w+);*)') r"((margin-left|margin): *(-*\w+);*)")
text_indent_regexp = re.compile( text_indent_regexp = re.compile(
r'(text-indent: *(-*\w+);*)') r"(text-indent: *(-*\w+);*)")
has_margin = re.search(margin_left_regexp, processed_style) has_margin = re.search(margin_left_regexp, processed_style)
has_text_indent = re.search(text_indent_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style)
@@ -92,21 +93,21 @@ class TagStyleConverter:
if has_text_indent: if has_text_indent:
num_ti = abs(int("0" + "".join( num_ti = abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2)))))) filter(str.isdigit, str(has_text_indent.group(2))))))
processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(num_m - num_ti)) + 'px; ') str(abs(num_m - num_ti)) + "px; ")
processed_style = processed_style.replace( processed_style = processed_style.replace(
has_margin.group(1), '') has_margin.group(1), "")
return processed_style return processed_style
processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' + processed_style = processed_style.replace(has_margin.group(1), "text-indent: " +
str(abs(num_m)) + 'px; ') str(abs(num_m)) + "px; ")
return processed_style return processed_style
elif has_text_indent: elif has_text_indent:
processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
str(abs(int("0" + "".join( str(abs(int("0" + "".join(
filter(str.isdigit, str(has_text_indent.group(2))))))) filter(str.isdigit, str(has_text_indent.group(2)))))))
+ 'px; ') + "px; ")
return processed_style return processed_style
return processed_style return processed_style
@@ -126,23 +127,20 @@ class TagStyleConverter:
processed inline style processed inline style
""" """
inline_style = self.tag_inline_style.attrs.get('style') + ';' inline_style = self.tag_inline_style.attrs.get("style") + ";"
# 1. Remove white color if tag doesn't have background color in style # 1. Remove white color if tag doesn't have background color in style
inline_style = self.remove_white_if_no_bgcolor( inline_style = self.remove_white_if_no_bgcolor(
inline_style, self.tag_inline_style) inline_style, self.tag_inline_style)
inline_style = inline_style.replace( inline_style = inline_style.replace(
'list-style-image', 'list-style-type') "list-style-image", "list-style-type")
# 2. Create list of styles from inline style # 2. Create list of styles from inline style
# replace all spaces between '; & letter' to ';' # replace all spaces between "; & letter" to ";"
style = re.sub(r"; *", ";", inline_style) style = re.sub(r"; *", ";", inline_style)
# when we split style by ';', last element of the list is '' - None (remove it) # when we split style by ";", last element of the list is "" - None (remove it)
split_inline_style: list = list(filter(None, style.split(';'))) split_inline_style: list = list(filter(None, style.split(";")))
# 3. Duplicate styles check - if the tag had duplicate styles # 3. Duplicate styles check - if the tag had duplicate styles
split_inline_style = self.duplicate_styles_check(split_inline_style) # split_inline_style = self.duplicate_styles_check(split_inline_style)
# 4. Processing indents
# 4. Processing indents
inline_style: str = self.indents_processing(split_inline_style) inline_style: str = self.indents_processing(split_inline_style)
return inline_style return inline_style
@@ -164,19 +162,19 @@ class TagStyleConverter:
""" """
styles_to_remove = [] styles_to_remove = []
for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style: if f"{k[0]}:{k[1]}" in style:
styles_to_remove.append(k) styles_to_remove.append(k)
return styles_to_remove return styles_to_remove
def change_attrs_with_corresponding_tags(self): def change_attrs_with_corresponding_tags(self):
# adds <strong>, <u>, <sup> instead of styles # adds <strong>, <u>, <sup> instead of styles
styles_to_remove = self.check_style_to_be_tag(self.style) styles_to_remove = self.check_style_to_be_tag(self.tag_inline_style.attrs['style'])
for i, (attr, value) in enumerate(styles_to_remove): for i, (attr, value) in enumerate(styles_to_remove):
self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\ self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\
.replace(f'{attr}:{value};', '').strip() .replace(f"{attr}:{value};", "").strip()
corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)] attr, value)]
correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name) correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name)
for content in reversed(self.tag_inline_style.contents): for content in reversed(self.tag_inline_style.contents):
correspond_tag.insert(0, content.extract()) correspond_tag.insert(0, content.extract())
self.tag_inline_style.append(correspond_tag) self.tag_inline_style.append(correspond_tag)
@@ -184,75 +182,37 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_tag_to_save_style_attrs(initial_tag): def wrap_span_in_tag_to_save_style_attrs(initial_tag):
"""Function designed to save style attrs that cannot be in tag.name -> span""" """Function designed to save style attrs that cannot be in tag.name -> span"""
dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG))
if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'): if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"):
styles_can_be_in_tag = [style styles_can_be_in_tag = [style
for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items()
if re.match(tag, initial_tag.name) if re.match(tag, initial_tag.name)
for style in styles] for style in styles]
styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS
if attr not in styles_can_be_in_tag] if attr not in styles_can_be_in_tag]
span_style = initial_tag.attrs['style'] span_style = initial_tag.attrs["style"]
# here check that this style is exactly the same. # here check that this style is exactly the same.
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top' # Not "align" when we have "text-align", or "border" when we have "border-top"
styles_to_be_saved_in_span = [((attr + ':') in span_style) & ( styles_to_be_saved_in_span = [((attr + ":") in span_style) & (
'-' + attr not in span_style) for attr in styles_cant_be_in_tag] "-" + attr not in span_style) for attr in styles_cant_be_in_tag]
if any(styles_to_be_saved_in_span): if any(styles_to_be_saved_in_span):
# if we find styles that cannot be in <tag.name> -> wrap them in span # if we find styles that cannot be in <tag.name> -> wrap them in span
tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}') tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}")
style = '' style = ""
possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag] possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag]
for possible_attr_regexp in possible_attrs_regexp: for possible_attr_regexp in possible_attrs_regexp:
has_style_attrs = re.search( has_style_attrs = re.search(
possible_attr_regexp, span_style) possible_attr_regexp, span_style)
if has_style_attrs and has_style_attrs.group(1): if has_style_attrs and has_style_attrs.group(1):
style += has_style_attrs.group(1) style += has_style_attrs.group(1)
span_style = span_style.replace( span_style = span_style.replace(
has_style_attrs.group(1), '') has_style_attrs.group(1), "")
tag.attrs['style'] = style tag.attrs["style"] = style
initial_tag.name = 'span' initial_tag.name = "span"
initial_tag.attrs['style'] = span_style initial_tag.attrs["style"] = span_style
initial_tag.wrap(tag) initial_tag.wrap(tag)
def convert_initial_tag(self): def convert_initial_tag(self):
self.change_attrs_with_corresponding_tags() self.change_attrs_with_corresponding_tags()
self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style)
return self.tag_inline_style return self.tag_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
"""
Function adds styles from .css to inline style.
Parameters
----------
html_soup: BeautifulSoup
html page with inline style
css_text: str
css content from css file
Returns
-------
inline_soup: BeautifulSoup
soup with styles from css
"""
# remove this specification because it causes problems
css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '')
# here we add css styles to inline style
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
remove_classes=False,
external_styles=False,
allow_network=False,
disable_validation=True,
)
# soup with converted styles from css
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp,
attrs={'style': re.compile('.*')})
# go through the tags with inline style + style parsed from css file
for tag_inline_style in tags_with_inline_style:
style_converter = TagStyleConverter(tag_inline_style)
style_converter.convert_initial_tag()
return inline_soup

View File

@@ -9,12 +9,12 @@ class LiveCartaConfig:
HEADERS_LEVELS = {"h1", "h2", "h3", HEADERS_LEVELS = {"h1", "h2", "h3",
"h4", "h5", "h6", "h7", "h8", "h9"} "h4", "h5", "h6", "h7", "h8", "h9"}
DEFAULT_ALIGN_STYLE = 'left' DEFAULT_ALIGN_STYLE = "left"
ALIGN_STYLES = ['justify', 'right', 'center', 'left'] ALIGN_STYLES = ["justify", "right", "center", "left"]
# Main constant values # Main constant values
DEFAULT_FONT_NAME = 'Times New Roman' DEFAULT_FONT_NAME = "Times New Roman"
WORD_DEFAULT_FONT_SIZE = 11 WORD_DEFAULT_FONT_SIZE = 11
@@ -23,80 +23,56 @@ class LiveCartaConfig:
FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\ FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\
WORD_DEFAULT_FONT_SIZE WORD_DEFAULT_FONT_SIZE
FONT_CORRESPONDANCE_TABLE = {
"Arial": "arial,helvetica,sans-serif",
"Comic Sans MS": "comic sans ms,cursive",
"Courier New": "courier new,courier,monospace",
"Georgia": "georgia,serif",
"Lucida Sans Unicode": "lucida sans unicode,lucida grande,sans-serif",
"Tahoma": "tahoma,geneva,sans-serif",
"Times New Roman": "times new roman,times,serif",
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
"Verdana": "verdana,geneva,sans-serif",
"monospace": "courier new,courier,monospace",
"sans-serif": "arial,helvetica,sans-serif"
}
COLORS_MAP = { COLORS_MAP = {
'#ffff00': 'yellow', "#ffff00": "yellow",
'#00ff00': 'darkYellow', "#00ff00": "darkYellow",
'#00ffff': 'cyan', "#00ffff": "cyan",
'#ff00ff': 'magenta', "#ff00ff": "magenta",
'#0000ff': 'blue', "#0000ff": "blue",
'#ff0000': 'red', "#ff0000": "red",
'#000080': 'darkBlue', "#000080": "darkBlue",
'#008080': 'darkCyan', "#008080": "darkCyan",
'#008000': 'green', "#008000": "green",
'#800080': 'darkMagenta', "#800080": "darkMagenta",
'#808000': 'darkGreen', "#808000": "darkGreen",
'#c0c0c0': 'lightGray', "#c0c0c0": "lightGray",
'#ffffff': 'white', "#ffffff": "white",
'#800000': '#800000', "#800000": "#800000",
'#808080': '#808080' "#808080": "#808080"
} }
HTML42LIVECARTA_COLORS = { HTML42LIVECARTA_COLORS = {
'yellow': 'yellow', "yellow": "yellow",
'lime': 'green', "lime": "green",
'aqua': 'cyan', "aqua": "cyan",
'fuchsia': 'magenta', "fuchsia": "magenta",
'blue': 'blue', "blue": "blue",
'red': 'red', "red": "red",
'navy': 'darkBlue', "navy": "darkBlue",
'teal': 'darkCyan', "teal": "darkCyan",
'green': 'darkGreen', "green": "darkGreen",
'purple': 'darkMagenta', "purple": "darkMagenta",
'olive': 'darkYellow', "olive": "darkYellow",
'silver': 'lightGray', "silver": "lightGray",
'white': 'white', "white": "white",
'maroon': 'darkRed', # '#800000', "maroon": "darkRed", # "#800000",
'gray': 'darkGray', "gray": "darkGray",
'grey': 'darkGray', "grey": "darkGray",
} }
INDENT = '30px' INDENT = "30px"
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, list_types = ["circle", "disc", "armenian", "decimal",
1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin",
1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"]
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px',
'19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px',
'30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px',
'41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
list_types = ['circle', 'disc', 'armenian', 'decimal',
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
structural_tags_names = [ structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', "div", "section", "article", "main", "body", "html", "aside",
'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' "canvas", "data", "figure", "footer", "iframe", "span", "p"
] ]
could_have_style_in_livecarta_regexp = re.compile( could_have_style_in_livecarta_regexp = re.compile(
'(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)")
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
@@ -104,23 +80,60 @@ class LiveCartaConfig:
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p> <p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong', ("font-weight", "bold"): "strong",
('font-weight', '600'): 'strong', ("font-weight", "600"): "strong",
('font-weight', '700'): 'strong', ("font-weight", "700"): "strong",
('font-weight', '800'): 'strong', ("font-weight", "800"): "strong",
('font-weight', '900'): 'strong', ("font-weight", "900"): "strong",
('font-style', 'italic'): 'i', ("font-style", "italic"): "i",
('text-decoration', 'underline'): 'u', ("text-decoration", "underline"): "u",
('text-decoration', 'line-through'): 's', ("text-decoration", "line-through"): "s",
('text-decoration-line', 'underline'): 'u', ("text-decoration-line", "underline"): "u",
('text-decoration-line', 'line-through'): 's', ("text-decoration-line", "line-through"): "s",
('vertical-align', 'super'): 'sup' ("vertical-align", "super"): "sup"
} }
LIVECARTA_STYLES_CANT_BE_IN_TAG = { LIVECARTA_STYLES_CAN_BE_IN_TAG = {
'p': ['text-align', 'text-indent', 'border-bottom', 'border-top'], "p": ["text-align", "text-indent", "border-bottom", "border-top"],
'li': ['text-align', 'list-style-type'], "li": ["text-align", "list-style-type"],
'ul': ['list-style-type'], "ul": ["list-style-type"],
'ol': ['list-style-type'], "ol": ["list-style-type"],
'(^h[1-9]$)': ['list-style-type'] r"(^h[1-9]$)": ["list-style-type"]
}
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit LiveCarta css style convention.
If property has empty list, it means that any value can be converted.
If property has not empty list, it means that only certain property-value combinations can be transformed.
"""
LIVECARTA_STYLE_ATTRS = {
"text-indent": [],
"font-variant": ["small-caps"],
"text-align": [x for x in ["justify", "right", "center", "left"] if x != "left"],
"align": [],
"font": [],
"font-family": [],
"font-size": [],
"font-weight": ["bold", "600", "700", "800", "900"], # <strong>
"font-style": ["italic"], # <i>
"text-decoration": ["underline", "line-through"], # <u> , <s>
"text-decoration-line": ["underline", "line-through"], # <u> , <s>
"vertical-align": ["super"], # <sup>
"color": [],
"background-color": [],
"background": [],
"width": [],
"border": [],
"border-top-width": [],
"border-right-width": [],
"border-left-width": [],
"border-bottom-width": [],
"border-top": [],
"border-bottom": [],
"list-style-type": [],
"list-style-image": [],
"margin-left": [],
"margin-top": [],
"margin": [],
} }

View File

@@ -96,13 +96,13 @@ def str2hex(s: str):
if '#' in s and (len(s) <= 7): if '#' in s and (len(s) <= 7):
return s.lower() return s.lower()
if ('rgb' in s) and ('%' in s): if ('rgb' in s.lower()) and ('%' in s):
match = re.search(r'rgba*\(((\d+)%, *(\d+)%, *(\d+)%(, \d\.\d+)*)\)', s) match = re.search(r'rgba*\(((\d+)%, *(\d+)%, *(\d+)%(, \d\.\d+)*)\)', s)
if match: if match:
r, g, b = int(match.group(2)), int(match.group(3)), int(match.group(4)) r, g, b = int(match.group(2)), int(match.group(3)), int(match.group(4))
return rgb_percent_to_hex((r, g, b)) return rgb_percent_to_hex((r, g, b))
if 'rgb' in s: if 'rgb' in s.lower():
rgba = re.findall('([0-9] *\.?[0-9]+)', s) rgba = re.findall('([0-9] *\.?[0-9]+)', s)
r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2]) r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2])
if len(rgba) == 4: if len(rgba) == 4:
@@ -110,7 +110,7 @@ def str2hex(s: str):
r, g, b = rgba2rgb(r, g, b, alpha) r, g, b = rgba2rgb(r, g, b, alpha)
return rgb_to_hex((r, g, b)) return rgb_to_hex((r, g, b))
if 'hsl' in s: if 'hsl' in s.lower():
# hsl(hue in {0,360}, saturation [0, 100%], lightness [0, 100%]) # hsl(hue in {0,360}, saturation [0, 100%], lightness [0, 100%])
match = re.search(r'hsla*\(((\d+), *(\d+)%, *(\d+)%, (\d\.\d+)*)\)', s) match = re.search(r'hsla*\(((\d+), *(\d+)%, *(\d+)%, (\d\.\d+)*)\)', s)
if match: if match: