From 5039417a0f28f2d74ee59aa1500ab44e093e945c Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 1 Jun 2022 16:18:12 +0300 Subject: [PATCH] Modify local consumer.py --- consumer.py | 48 +++++++++++------ src/epub_converter/epub_converter.py | 77 +++++++++++++++++----------- 2 files changed, 80 insertions(+), 45 deletions(-) diff --git a/consumer.py b/consumer.py index d1cacc1..4c67d6e 100644 --- a/consumer.py +++ b/consumer.py @@ -23,36 +23,48 @@ def configure_file_logger(name, filename='logs/converter.log', filemode='w+', file_handler = logging.FileHandler(file_path, mode=filemode) logger.addHandler(file_handler) - file_format = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s [%(filename)s:%(lineno)d in %(funcName)s]') + file_format = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s ' + '[%(filename)s:%(lineno)d in %(funcName)s]') file_handler.setFormatter(file_format) logger.setLevel(logging_level) return logger -def convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict,): - logger.info(f'Start processing book-{book_id}.') +def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict): + logger.info(f'Start processing book-{book_id}.') + try: + json_file_path = 'json/9781614382264.json' + book = book_type(book_id=book_id, main_logger=logger, **params) + book.conversion_local(json_file_path) + except Exception as exc: + raise exc + logger.info(f'Book-{book_id} has been proceeded.') + + +def convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict): + logger.info(f'Start processing book-{book_id}.') try: book = book_type(book_id=book_id, main_logger=logger, **params) - # book.conversion_local('9781641051217') book.conversion() except Exception as exc: raise exc - logger.info(f'Book-{book_id} has been proceeded.') -def callback(ch, method, properties, body, logger, libra_locker): + +def callback(ch, method, properties, body, logger, libre_locker): print(f'Message: {body}.') logger.info(f'Message: {body}.') try: data = json.loads(body) assert 'apiURL' in data, 'No apiURL field in received message.' - assert data.get('fileExtension') in ['epub', 'docx'], 'Wrong book type received.' + assert data.get('fileExtension') in [ + 'epub', 'docx'], 'Wrong book type received.' book_params = { 'access': Access(url=data['apiURL']), } if data.get('fileExtension') == 'docx': - book_params.update({'libra_locker': libra_locker}) + book_params.update({'libre_locker': libre_locker}) params = { 'book_type': EpubBook if data.get('fileExtension') == 'epub' else DocxBook, @@ -75,6 +87,7 @@ def callback(ch, method, properties, body, logger, libra_locker): finally: pass + def server_run(): logger = configure_file_logger('consumer') @@ -87,25 +100,30 @@ def server_run(): port = conf_param.get('port') or pika.ConnectionParameters().DEFAULT_PORT channel = None try: - credentials = pika.PlainCredentials(username=conf_param['username'], password=conf_param['password']) - parameters = pika.ConnectionParameters(host=host, port=port, credentials=credentials) + credentials = pika.PlainCredentials( + username=conf_param['username'], password=conf_param['password']) + parameters = pika.ConnectionParameters( + host=host, port=port, credentials=credentials) connection = pika.BlockingConnection(parameters) channel = connection.channel() except Exception as exc: - logger.log(logging.ERROR, f'Problems with queue connection.\n' + str(exc)) + logger.log(logging.ERROR, + f'Problems with queue connection.\n' + str(exc)) raise exc try: - channel.queue_declare(queue=conf_param['queue'], durable=True, arguments={'x-max-priority': 10}) + channel.queue_declare(queue=conf_param['queue'], durable=True, arguments={ + 'x-max-priority': 10}) except ValueError as exc: - logger.log(logging.ERROR, f'Queue {conf_param["queue"]} is not declared.') + logger.log(logging.ERROR, + f'Queue {conf_param["queue"]} is not declared.') raise exc locker = Event() locker.set() channel.basic_consume(queue=conf_param['queue'], auto_ack=True, - on_message_callback=partial(callback, logger=logger, libra_locker=locker)) + on_message_callback=partial(callback, logger=logger, libre_locker=locker)) logger.info('Connection has been established.') print('Waiting for messages...') logger.info('Waiting for messages...') @@ -114,4 +132,4 @@ def server_run(): if __name__ == '__main__': - server_run() \ No newline at end of file + server_run() diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 17f41a2..7e5e389 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -18,16 +18,16 @@ from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \ - update_images_src_links, preprocess_footnotes +from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ + prepare_title, prepare_content, update_images_src_links, preprocess_footnotes class EpubConverter: - def __init__(self, file, access=None, logger=None): - self.file = file + def __init__(self, file_path, access=None, logger=None): + self.file_path = file_path self.access = access self.logger: BookLogger = logger - self.ebooklib_book = epub.read_epub(file) + self.ebooklib_book = epub.read_epub(file_path) # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} @@ -66,6 +66,7 @@ class EpubConverter: self.logger.log('HTML files reading.') self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() + # TODO Presets self.logger.log('CSS files processing.') self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() @@ -122,18 +123,25 @@ class EpubConverter: join(html_folder, path_to_css_from_html)).replace('\\', '/') css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) if "@import" in str(css_obj.content): - path_to_css_from_root = "css/" + re.search('"(.*)"', str(css_obj.content)).group(1) - css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) + path_to_css_from_root = "css/" + \ + re.search('"(.*)"', str(css_obj.content)).group(1) + css_obj = self.ebooklib_book.get_item_with_href( + path_to_css_from_root) assert css_obj, f'Css style {css_href} was not in manifest.' css_content: str = css_obj.get_content().decode() return css_content - def build_html_and_css_relations(self): + def build_html_and_css_relations(self) -> tuple[dict, dict]: """ - This function is designed to get 2 dictionaries: + Function is designed to get 2 dictionaries: The first is css_href2css_content. It is created to connect href of css to content of css - The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html + The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them + ) which are used on this html ...2... = key2value + Returns + ---------- + html_href2css_href, css_href2css_content: tuple[dict, dict] + dictionary: href of html to related css files, dictionary: css files to related css content """ # dictionary: href of html to related css files @@ -160,8 +168,7 @@ class EpubConverter: html_href2css_href[html_href].append(f'href{i}') css_href2css_content[f'href{i}'] = build_css_content( css_content) - - return html_href2css_href, css_href2css_content, + return html_href2css_href, css_href2css_content def add_css_styles_to_html_soup(self): """ @@ -178,22 +185,24 @@ class EpubConverter: content = convert_html_soup_with_css_style(content, css) self.html_href2html_body_soup[html_href] = content - def build_manifest_id2html_href(self): - links = dict() - for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - links[item.id] = item.file_name - - return links - - def build_adjacency_list_from_toc(self, element, lvl=0): + def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): """ + Function self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc key = -1 if root(top chapters), value = None if leaf(least chapters) + Parameters + ---------- + element: [Link, tuple, list] + element that appears in TOC(usually parsed from nav.ncx) + lvl: int + level of node - :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx) - :param lvl: level of depth + Returns + ---------- + None + built adjacency list """ if isinstance(element, Link): @@ -250,6 +259,12 @@ class EpubConverter: return True return False + def build_manifest_id2html_href(self) -> dict: + links = dict() + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + links[item.id] = item.file_name + return links + def build_adjacency_list_from_spine(self): manifest_id2html_href = self.build_manifest_id2html_href() self.adjacency_list = { @@ -316,7 +331,7 @@ class EpubConverter: Returns ------- - full_path[0]: s + full_path[0]: str prepared content """ @@ -453,6 +468,8 @@ class EpubConverter: Returns ------- None + built chapter + """ if nav_point.id: soup = self.html_href2html_body_soup[nav_point.href] @@ -487,7 +504,7 @@ class EpubConverter: path_to_html=nav_point.href, access=self.access, path2aws_path=self.book_image_src_path2aws_path, - book_id=self.file.stem if hasattr(self.file, 'stem') else 'book_id') + book_id=self.file_path.stem if hasattr(self.file_path, 'stem') else 'book_id') is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed = prepare_title(title) @@ -525,12 +542,12 @@ class EpubConverter: if __name__ == "__main__": - filename = '9781614382264' - logger_object = BookLogger(name='epub', book_id=filename) + epub_file_path = '../../epub/9781614382264.epub' + logger_object = BookLogger( + name='epub', book_id=epub_file_path.split('/')[-1]) - json_converter = EpubConverter(f'../../epub/{filename}.epub', - logger=logger_object) + json_converter = EpubConverter(epub_file_path, logger=logger_object) content_dict = json_converter.convert_to_dict() - with codecs.open(f'../../json/{filename}.json', 'w', encoding='utf-8') as f: - json.dump(content_dict, f, ensure_ascii=False) \ No newline at end of file + with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json: + json.dump(content_dict, f_json, ensure_ascii=False)