forked from LiveCarta/BookConverter
Modify local consumer.py
This commit is contained in:
46
consumer.py
46
consumer.py
@@ -23,36 +23,48 @@ def configure_file_logger(name, filename='logs/converter.log', filemode='w+',
|
|||||||
file_handler = logging.FileHandler(file_path, mode=filemode)
|
file_handler = logging.FileHandler(file_path, mode=filemode)
|
||||||
logger.addHandler(file_handler)
|
logger.addHandler(file_handler)
|
||||||
|
|
||||||
file_format = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s [%(filename)s:%(lineno)d in %(funcName)s]')
|
file_format = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s '
|
||||||
|
'[%(filename)s:%(lineno)d in %(funcName)s]')
|
||||||
file_handler.setFormatter(file_format)
|
file_handler.setFormatter(file_format)
|
||||||
logger.setLevel(logging_level)
|
logger.setLevel(logging_level)
|
||||||
return logger
|
return logger
|
||||||
|
|
||||||
def convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict,):
|
|
||||||
logger.info(f'Start processing book-{book_id}.')
|
|
||||||
|
|
||||||
|
def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
|
||||||
|
logger.info(f'Start processing book-{book_id}.')
|
||||||
|
try:
|
||||||
|
json_file_path = 'json/9781614382264.json'
|
||||||
|
book = book_type(book_id=book_id, main_logger=logger, **params)
|
||||||
|
book.conversion_local(json_file_path)
|
||||||
|
except Exception as exc:
|
||||||
|
raise exc
|
||||||
|
logger.info(f'Book-{book_id} has been proceeded.')
|
||||||
|
|
||||||
|
|
||||||
|
def convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
|
||||||
|
logger.info(f'Start processing book-{book_id}.')
|
||||||
try:
|
try:
|
||||||
book = book_type(book_id=book_id, main_logger=logger, **params)
|
book = book_type(book_id=book_id, main_logger=logger, **params)
|
||||||
# book.conversion_local('9781641051217')
|
|
||||||
book.conversion()
|
book.conversion()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
logger.info(f'Book-{book_id} has been proceeded.')
|
logger.info(f'Book-{book_id} has been proceeded.')
|
||||||
|
|
||||||
def callback(ch, method, properties, body, logger, libra_locker):
|
|
||||||
|
def callback(ch, method, properties, body, logger, libre_locker):
|
||||||
print(f'Message: {body}.')
|
print(f'Message: {body}.')
|
||||||
logger.info(f'Message: {body}.')
|
logger.info(f'Message: {body}.')
|
||||||
try:
|
try:
|
||||||
data = json.loads(body)
|
data = json.loads(body)
|
||||||
assert 'apiURL' in data, 'No apiURL field in received message.'
|
assert 'apiURL' in data, 'No apiURL field in received message.'
|
||||||
assert data.get('fileExtension') in ['epub', 'docx'], 'Wrong book type received.'
|
assert data.get('fileExtension') in [
|
||||||
|
'epub', 'docx'], 'Wrong book type received.'
|
||||||
|
|
||||||
book_params = {
|
book_params = {
|
||||||
'access': Access(url=data['apiURL']),
|
'access': Access(url=data['apiURL']),
|
||||||
}
|
}
|
||||||
if data.get('fileExtension') == 'docx':
|
if data.get('fileExtension') == 'docx':
|
||||||
book_params.update({'libra_locker': libra_locker})
|
book_params.update({'libre_locker': libre_locker})
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
'book_type': EpubBook if data.get('fileExtension') == 'epub' else DocxBook,
|
'book_type': EpubBook if data.get('fileExtension') == 'epub' else DocxBook,
|
||||||
@@ -75,6 +87,7 @@ def callback(ch, method, properties, body, logger, libra_locker):
|
|||||||
finally:
|
finally:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def server_run():
|
def server_run():
|
||||||
logger = configure_file_logger('consumer')
|
logger = configure_file_logger('consumer')
|
||||||
|
|
||||||
@@ -87,25 +100,30 @@ def server_run():
|
|||||||
port = conf_param.get('port') or pika.ConnectionParameters().DEFAULT_PORT
|
port = conf_param.get('port') or pika.ConnectionParameters().DEFAULT_PORT
|
||||||
channel = None
|
channel = None
|
||||||
try:
|
try:
|
||||||
credentials = pika.PlainCredentials(username=conf_param['username'], password=conf_param['password'])
|
credentials = pika.PlainCredentials(
|
||||||
parameters = pika.ConnectionParameters(host=host, port=port, credentials=credentials)
|
username=conf_param['username'], password=conf_param['password'])
|
||||||
|
parameters = pika.ConnectionParameters(
|
||||||
|
host=host, port=port, credentials=credentials)
|
||||||
connection = pika.BlockingConnection(parameters)
|
connection = pika.BlockingConnection(parameters)
|
||||||
channel = connection.channel()
|
channel = connection.channel()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.log(logging.ERROR, f'Problems with queue connection.\n' + str(exc))
|
logger.log(logging.ERROR,
|
||||||
|
f'Problems with queue connection.\n' + str(exc))
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
try:
|
try:
|
||||||
channel.queue_declare(queue=conf_param['queue'], durable=True, arguments={'x-max-priority': 10})
|
channel.queue_declare(queue=conf_param['queue'], durable=True, arguments={
|
||||||
|
'x-max-priority': 10})
|
||||||
except ValueError as exc:
|
except ValueError as exc:
|
||||||
logger.log(logging.ERROR, f'Queue {conf_param["queue"]} is not declared.')
|
logger.log(logging.ERROR,
|
||||||
|
f'Queue {conf_param["queue"]} is not declared.')
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
locker = Event()
|
locker = Event()
|
||||||
locker.set()
|
locker.set()
|
||||||
channel.basic_consume(queue=conf_param['queue'],
|
channel.basic_consume(queue=conf_param['queue'],
|
||||||
auto_ack=True,
|
auto_ack=True,
|
||||||
on_message_callback=partial(callback, logger=logger, libra_locker=locker))
|
on_message_callback=partial(callback, logger=logger, libre_locker=locker))
|
||||||
logger.info('Connection has been established.')
|
logger.info('Connection has been established.')
|
||||||
print('Waiting for messages...')
|
print('Waiting for messages...')
|
||||||
logger.info('Waiting for messages...')
|
logger.info('Waiting for messages...')
|
||||||
|
|||||||
@@ -18,16 +18,16 @@ from src.util.helpers import BookLogger
|
|||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
|
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
|
||||||
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
|
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
|
||||||
update_images_src_links, preprocess_footnotes
|
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
|
||||||
|
|
||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, file, access=None, logger=None):
|
def __init__(self, file_path, access=None, logger=None):
|
||||||
self.file = file
|
self.file_path = file_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(file)
|
self.ebooklib_book = epub.read_epub(file_path)
|
||||||
|
|
||||||
# main container for all epub .xhtml files
|
# main container for all epub .xhtml files
|
||||||
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}
|
||||||
@@ -66,6 +66,7 @@ class EpubConverter:
|
|||||||
self.logger.log('HTML files reading.')
|
self.logger.log('HTML files reading.')
|
||||||
self.html_href2html_body_soup: Dict[str,
|
self.html_href2html_body_soup: Dict[str,
|
||||||
BeautifulSoup] = self.build_href2soup_content()
|
BeautifulSoup] = self.build_href2soup_content()
|
||||||
|
# TODO Presets
|
||||||
|
|
||||||
self.logger.log('CSS files processing.')
|
self.logger.log('CSS files processing.')
|
||||||
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
|
||||||
@@ -122,18 +123,25 @@ class EpubConverter:
|
|||||||
join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
||||||
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
||||||
if "@import" in str(css_obj.content):
|
if "@import" in str(css_obj.content):
|
||||||
path_to_css_from_root = "css/" + re.search('"(.*)"', str(css_obj.content)).group(1)
|
path_to_css_from_root = "css/" + \
|
||||||
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
|
re.search('"(.*)"', str(css_obj.content)).group(1)
|
||||||
|
css_obj = self.ebooklib_book.get_item_with_href(
|
||||||
|
path_to_css_from_root)
|
||||||
assert css_obj, f'Css style {css_href} was not in manifest.'
|
assert css_obj, f'Css style {css_href} was not in manifest.'
|
||||||
css_content: str = css_obj.get_content().decode()
|
css_content: str = css_obj.get_content().decode()
|
||||||
return css_content
|
return css_content
|
||||||
|
|
||||||
def build_html_and_css_relations(self):
|
def build_html_and_css_relations(self) -> tuple[dict, dict]:
|
||||||
"""
|
"""
|
||||||
This function is designed to get 2 dictionaries:
|
Function is designed to get 2 dictionaries:
|
||||||
The first is css_href2css_content. It is created to connect href of css to content of css
|
The first is css_href2css_content. It is created to connect href of css to content of css
|
||||||
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
|
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them
|
||||||
|
) which are used on this html
|
||||||
...2... = key2value
|
...2... = key2value
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
html_href2css_href, css_href2css_content: tuple[dict, dict]
|
||||||
|
dictionary: href of html to related css files, dictionary: css files to related css content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# dictionary: href of html to related css files
|
# dictionary: href of html to related css files
|
||||||
@@ -160,8 +168,7 @@ class EpubConverter:
|
|||||||
html_href2css_href[html_href].append(f'href{i}')
|
html_href2css_href[html_href].append(f'href{i}')
|
||||||
css_href2css_content[f'href{i}'] = build_css_content(
|
css_href2css_content[f'href{i}'] = build_css_content(
|
||||||
css_content)
|
css_content)
|
||||||
|
return html_href2css_href, css_href2css_content
|
||||||
return html_href2css_href, css_href2css_content,
|
|
||||||
|
|
||||||
def add_css_styles_to_html_soup(self):
|
def add_css_styles_to_html_soup(self):
|
||||||
"""
|
"""
|
||||||
@@ -178,22 +185,24 @@ class EpubConverter:
|
|||||||
content = convert_html_soup_with_css_style(content, css)
|
content = convert_html_soup_with_css_style(content, css)
|
||||||
self.html_href2html_body_soup[html_href] = content
|
self.html_href2html_body_soup[html_href] = content
|
||||||
|
|
||||||
def build_manifest_id2html_href(self):
|
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
|
||||||
links = dict()
|
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
|
||||||
links[item.id] = item.file_name
|
|
||||||
|
|
||||||
return links
|
|
||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
|
||||||
"""
|
"""
|
||||||
|
Function
|
||||||
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
|
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
|
||||||
|
|
||||||
key = -1 if root(top chapters),
|
key = -1 if root(top chapters),
|
||||||
value = None if leaf(least chapters)
|
value = None if leaf(least chapters)
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
element: [Link, tuple, list]
|
||||||
|
element that appears in TOC(usually parsed from nav.ncx)
|
||||||
|
lvl: int
|
||||||
|
level of node
|
||||||
|
|
||||||
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
|
Returns
|
||||||
:param lvl: level of depth
|
----------
|
||||||
|
None
|
||||||
|
built adjacency list
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if isinstance(element, Link):
|
if isinstance(element, Link):
|
||||||
@@ -250,6 +259,12 @@ class EpubConverter:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def build_manifest_id2html_href(self) -> dict:
|
||||||
|
links = dict()
|
||||||
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
|
links[item.id] = item.file_name
|
||||||
|
return links
|
||||||
|
|
||||||
def build_adjacency_list_from_spine(self):
|
def build_adjacency_list_from_spine(self):
|
||||||
manifest_id2html_href = self.build_manifest_id2html_href()
|
manifest_id2html_href = self.build_manifest_id2html_href()
|
||||||
self.adjacency_list = {
|
self.adjacency_list = {
|
||||||
@@ -316,7 +331,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
full_path[0]: s
|
full_path[0]: str
|
||||||
prepared content
|
prepared content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -453,6 +468,8 @@ class EpubConverter:
|
|||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
None
|
None
|
||||||
|
built chapter
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
soup = self.html_href2html_body_soup[nav_point.href]
|
soup = self.html_href2html_body_soup[nav_point.href]
|
||||||
@@ -487,7 +504,7 @@ class EpubConverter:
|
|||||||
path_to_html=nav_point.href,
|
path_to_html=nav_point.href,
|
||||||
access=self.access,
|
access=self.access,
|
||||||
path2aws_path=self.book_image_src_path2aws_path,
|
path2aws_path=self.book_image_src_path2aws_path,
|
||||||
book_id=self.file.stem if hasattr(self.file, 'stem') else 'book_id')
|
book_id=self.file_path.stem if hasattr(self.file_path, 'stem') else 'book_id')
|
||||||
|
|
||||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
title_preprocessed = prepare_title(title)
|
title_preprocessed = prepare_title(title)
|
||||||
@@ -525,12 +542,12 @@ class EpubConverter:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
filename = '9781614382264'
|
epub_file_path = '../../epub/9781614382264.epub'
|
||||||
logger_object = BookLogger(name='epub', book_id=filename)
|
logger_object = BookLogger(
|
||||||
|
name='epub', book_id=epub_file_path.split('/')[-1])
|
||||||
|
|
||||||
json_converter = EpubConverter(f'../../epub/{filename}.epub',
|
json_converter = EpubConverter(epub_file_path, logger=logger_object)
|
||||||
logger=logger_object)
|
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
with codecs.open(f'../../json/{filename}.json', 'w', encoding='utf-8') as f:
|
with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json:
|
||||||
json.dump(content_dict, f, ensure_ascii=False)
|
json.dump(content_dict, f_json, ensure_ascii=False)
|
||||||
|
|||||||
Reference in New Issue
Block a user