diff --git a/docx/.gitignore b/books/docx/.gitignore similarity index 100% rename from docx/.gitignore rename to books/docx/.gitignore diff --git a/epub/.gitignore b/books/epub/.gitignore similarity index 100% rename from epub/.gitignore rename to books/epub/.gitignore diff --git a/html/.gitignore b/books/html/.gitignore similarity index 100% rename from html/.gitignore rename to books/html/.gitignore diff --git a/json/.gitignore b/books/json/.gitignore similarity index 100% rename from json/.gitignore rename to books/json/.gitignore diff --git a/consumer.py b/consumer.py index 2ea307c..dfa0b16 100644 --- a/consumer.py +++ b/consumer.py @@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+", def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict): logger.info(f"Start processing book-{book_id}.") try: - json_file_path = "json/9781614382264.json" + json_file_path = "books/json/9781614382264.json" book = book_type(book_id=book_id, main_logger=logger, **params) book.conversion_local(json_file_path) except Exception as exc: @@ -77,7 +77,6 @@ def callback(ch, method, properties, body, logger, libre_locker): thread.start() logging.log(logging.INFO, f"Active threads: {active_count()}.") # print(f"Active threads: {active_count()}.") - except Exception as exc: if hasattr(exc, "message"): logger.error(f"{sys.exc_info()[0]}: {exc.message}") @@ -90,15 +89,18 @@ def callback(ch, method, properties, body, logger, libre_locker): def server_run(): logger = configure_file_logger("consumer") + channel = None try: folder_path = os.path.dirname(os.path.abspath(__file__)) - config_path = Path(os.path.join(folder_path, "config/queue_config.json")) + config_path = Path(os.path.join( + folder_path, "config/queue_config.json")) with open(config_path, "r") as f: conf_param = json.load(f) - host = conf_param.get("host") or pika.ConnectionParameters().DEFAULT_HOST - port = conf_param.get("port") or 
pika.ConnectionParameters().DEFAULT_PORT - channel = None + host = conf_param.get( + "host") or pika.ConnectionParameters().DEFAULT_HOST + port = conf_param.get( + "port") or pika.ConnectionParameters().DEFAULT_PORT credentials = pika.PlainCredentials( username=conf_param["username"], password=conf_param["password"]) parameters = pika.ConnectionParameters( @@ -113,7 +115,6 @@ def server_run(): logger.log(logging.ERROR, f"Queue {conf_param['queue']} is not declared.") raise exc - locker = Event() locker.set() channel.basic_consume(queue=conf_param["queue"], diff --git a/doc/style_config b/docs/style_config similarity index 100% rename from doc/style_config rename to docs/style_config diff --git a/presets/presets.json b/presets/presets.json new file mode 100644 index 0000000..7272038 --- /dev/null +++ b/presets/presets.json @@ -0,0 +1,113 @@ +[ + { + "preset_name": "table_wrapper", + "rules": [ + { + "tags": ["div"], + "attrs": [ + { + "name": "width", + "value": ".*" + }, + { + "name": "border", + "value": ".*" + }, + { + "name": "bgcolor", + "value": ".*" + } + ] + }, + { + "tags": ["section", "blockquote"], + "attrs": [ + { + "name": "class", + "value": "feature[1234]" + } + ] + } + ] + }, + { + "preset_name": "replacer", + "rules": [ + { + "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"], + "condition": null, + "tag_to_replace": "p" + }, + { + "tags": ["^aside$"], + "condition": null, + "tag_to_replace": "blockquote" + }, + { + "tags": ["^header$", "^footer$"], + "condition": null, + "tag_to_replace": "span" + }, + { + "tags": ["^code$", "^kbd$", "^var$"], + "condition": { + "parent_tags": ":not(pre)", + "child_tags": null, + "attrs": null + }, + "tag_to_replace": "span" + }, + { + "tags": ["^b$"], + "condition": null, + "tag_to_replace": "strong" + }, + { + "tags": ["^image$"], + "condition": null, + "tag_to_replace": "img" + } + ] + }, + { + "preset_name": "attr_replacer", + "rules": [ + { + "attr": "xlink:href", + "condition": { + "tags": ["img"] + }, + 
"attr_to_replace": "src" + } + ] + }, + { + "preset_name": "unwrapper", + "rules": { + "tags": [ + "section", + "article", + "figcaption", + "main", + "body", + "html", + "svg", + "li > p" + ] + } + }, + { + "preset_name": "inserter", + "rules": [ + { + "tags": ["pre"], + "condition": { + "parent_tags": null, + "child_tags": ":not(code, kbd, var)", + "attrs": null + }, + "tag_to_insert": "code" + } + ] + } +] \ No newline at end of file diff --git a/src/access.py b/src/access.py index 4367c33..6d22202 100644 --- a/src/access.py +++ b/src/access.py @@ -8,49 +8,30 @@ from io import BytesIO class Access: """Class accessing our platform""" - - PENDING = 1 - PROCESS = 2 - GENERATE = 3 - FINISH = 4 - ERROR = 5 - - url = None - username = None - password = None - - token = None - refresh = None - refresh_time = None - headers = None - refreshing = Event() - - def __init__(self, url): + def __init__(self, url=None): """ :param url: str, url received from queue message, if field apiURL exists else None """ + self.PENDING = 1 + self.PROCESS = 2 + self.GENERATE = 3 + self.FINISH = 4 + self.ERROR = 5 + + self.username = None + self.password = None + + self.token = None + self.refresh = None + self.refresh_time = None + self.headers = None + self.refreshing = Event() self.set_credentials(url) self.get_token() self.refreshing.set() - def sleep(timeout: float, retry=3): - def decorator(function): - """Decorator sleeping timeout sec and makes 3 retries""" - def wrapper(*args, **kwargs): - retries = 0 - while retries < retry: - try: - value = function(*args, **kwargs) - if value is not None: - return value - except: - time.sleep(timeout) - retries += 1 - return wrapper - return decorator - def set_credentials(self, url): folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) @@ -60,8 +41,8 @@ class Access: self.refreshing.clear() self.url = url - self.username = params['username'] - self.password = params['password'] + self.username = params["username"] + 
self.password = params["password"] self.refreshing.set() def format_header(self): @@ -123,14 +104,14 @@ class Access: else: raise Exception(f'{response.status_code}') - def get_book(self, book_id): - """Function downloads the book from site""" + def get_file(self, file_path): + """Function downloads the file[book, preset] from site""" if self.is_time_for_refreshing(): self.refresh_token() self.refreshing.wait() response = requests.get( - f'{self.url}/doc-convert/{book_id}/file', headers=self.headers, + file_path, headers=self.headers, # auth=('kiryl.miatselitsa', 'iK4yXCvdyHFEEOvG2v3F') ) @@ -139,11 +120,26 @@ class Access: elif response.status_code == 200: content = response.content else: - raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{book_id}/file, ' + raise Exception(f'Error in getting preset from url: {file_path}, ' f'status code:{response.status_code}') - return content + def sleep(timeout: float, retry=3): + def decorator(function): + """Decorator sleeping timeout sec and makes 3 retries""" + def wrapper(*args, **kwargs): + retries = 0 + while retries < retry: + try: + value = function(*args, **kwargs) + if value is not None: + return value + except: + time.sleep(timeout) + retries += 1 + return wrapper + return decorator + @sleep(3) def send_image(self, img_path, doc_id, img_content: bytes = None): """Function sends images to site""" diff --git a/src/book_solver.py b/src/book_solver.py index c45af0f..a7625d5 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -24,9 +24,10 @@ class BookSolver: self.book_type = None self.book_id = book_id self.access = access - self.file_path = None # path to book file, appears after downloading from server - self.output_path = None # path to json file - self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', + self.preset_path = None + self.book_path = None # path to book file, appears after downloading from server + self.book_output_path = None # path to json file + 
self.logger_object = BookLogger(name=f"{__name__}_{self.book_id}", book_id=book_id, main_logger=main_logger) self.status_wrapper = BookStatusWrapper( @@ -35,9 +36,9 @@ class BookSolver: assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." - def save_book_file(self, content: bytes): + def save_file(self, content: bytes, path_to_save, file_type): """ - Function saves binary content of file to .docx/.epub + Function saves binary content of file to folder(path_to_save) Parameters ---------- content: bytes str @@ -47,80 +48,100 @@ class BookSolver: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.join( - folder_path, f'{self.book_type}/{self.book_id}') + folder_path, path_to_save) pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) file_path = os.path.join( - folder_path, f'{self.book_id}.{self.book_type}') + folder_path, f"{self.book_id}.{file_type}") try: - with open(file_path, 'wb+') as file: + with open(file_path, "wb+") as file: file.write(content) - self.logger_object.log(f'File was saved to folder: {folder_path}.') + self.logger_object.log( + f"File was saved to folder: {folder_path}.") except Exception as exc: self.logger_object.log( f"Error in writing {self.book_type} file.", logging.ERROR) self.logger_object.log_error_to_main_log() raise exc + return file_path - self.file_path = pathlib.Path(file_path) + def get_preset_file(self): + """Method for getting and saving preset from server""" + try: + self.logger_object.log(f"Start receiving preset file from server. 
URL:" + f" {self.access.url}/doc-convert/{self.book_id}/presets") + content = self.access.get_file( + file_path=f"{self.access.url}/doc-convert/{self.book_id}/presets") + self.logger_object.log("Preset file was received from server.") + self.preset_path = pathlib.Path( + str(self.save_file(content, path_to_save="presets", file_type="json"))) + except FileNotFoundError as f_err: + self.logger_object.log( + "Can't get preset file from server.", logging.ERROR) + self.logger_object.log_error_to_main_log() + raise f_err + except Exception as exc: + raise exc def get_book_file(self): """Method for getting and saving book from server""" try: - self.logger_object.log(f'Start receiving file from server. URL:' - f' {self.access.url}/doc-convert/{self.book_id}/file') - content = self.access.get_book(self.book_id) - self.logger_object.log('File was received from server.') - self.save_book_file(content) + self.logger_object.log(f"Start receiving book file from server. URL:" + f" {self.access.url}/doc-convert/{self.book_id}/file") + content = self.access.get_file( + file_path=f"{self.access.url}/doc-convert/{self.book_id}/file") + self.logger_object.log("Book file was received from server.") + self.book_path = pathlib.Path(self.save_file( + content, path_to_save=f"books/{self.book_type}", file_type=self.book_type)) except FileNotFoundError as f_err: self.logger_object.log( - "Can't get file from server.", logging.ERROR) + "Can't get book file from server.", logging.ERROR) self.logger_object.log_error_to_main_log() raise f_err except Exception as exc: raise exc def check_output_directory(self): - if self.output_path is None: + if self.book_output_path is None: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) output_path = os.path.join( - folder_path, f'json/{self.book_id}.json') - self.output_path = output_path + folder_path, f"books/json/{self.book_id}.json") + self.book_output_path = output_path - self.output_path = pathlib.Path(self.output_path) - 
self.logger_object.log(f'Output file path: {self.output_path}') + self.book_output_path = pathlib.Path(self.book_output_path) + self.logger_object.log(f"Output file path: {self.book_output_path}") - pathlib.Path(self.output_path).parent.mkdir( + pathlib.Path(self.book_output_path).parent.mkdir( parents=True, exist_ok=True) - self.output_path.touch(exist_ok=True) + self.book_output_path.touch(exist_ok=True) def write_to_json(self, content: dict): self.check_output_directory() try: - with codecs.open(self.output_path, 'w', encoding='utf-8') as f: + with codecs.open(self.book_output_path, "w", encoding="utf-8") as f: json.dump(content, f, ensure_ascii=False) self.logger_object.log( - f'Data has been saved to .json file: {self.output_path}') + f"Data has been saved to .json file: {self.book_output_path}") except Exception as exc: self.logger_object.log( - 'Error has occurred while writing .json file.' + str(exc), logging.ERROR) + "Error has occurred while writing .json file." + str(exc), logging.ERROR) def send_json_content_to_server(self, content: dict): """Function sends json_content to site""" try: self.access.send_book(self.book_id, content) - self.logger_object.log(f'JSON data has been sent to server.') + self.logger_object.log(f"JSON data has been sent to server.") except Exception as exc: self.logger_object.log( - 'Error has occurred while sending json content.', logging.ERROR) + "Error has occurred while sending json content.", logging.ERROR) self.logger_object.log_error_to_main_log() self.status_wrapper.set_error() raise exc @abstractmethod def get_converted_book(self): - self.logger_object.log('Beginning of processing .json output.') + self.logger_object.log("Beginning of processing .json output.") self.status_wrapper.set_generating() return {} @@ -132,20 +153,23 @@ class BookSolver: """ try: - self.logger_object.log( - f'Beginning of conversion from .{self.book_type} to .json.') + self.get_preset_file() self.get_book_file() + self.logger_object.log( + 
f"Beginning of conversion from .{self.book_type} to .json.") self.status_wrapper.set_processing() content_dict = self.get_converted_book() + [os.remove(path) for path in [self.preset_path, self.book_path]] + self.logger_object.log("Beginning of processing .json output.") self.status_wrapper.set_generating() self.write_to_json(content_dict) self.send_json_content_to_server(content_dict) self.logger_object.log( - f'End of the conversion to LiveCarta format. Check {self.output_path}.') + f"End of the conversion to LiveCarta format. Check {self.book_output_path}.") except Exception as exc: self.status_wrapper.set_error() self.logger_object.log( - 'Error has occurred while conversion.', logging.ERROR) + "Error has occurred while conversion.", logging.ERROR) self.logger_object.log_error_to_main_log(str(exc)) raise exc @@ -158,15 +182,16 @@ class BookSolver: """ try: self.logger_object.log( - f'Data has been downloaded from {file_path} file') + f"Data has been downloaded from {file_path} file") self.status_wrapper.set_processing() - with codecs.open(file_path, 'r', encoding='utf-8') as f_json: + with codecs.open(file_path, "r", encoding="utf-8") as f_json: content_dict = json.load(f_json) + self.logger_object.log("Beginning of processing .json output.") self.status_wrapper.set_generating() self.send_json_content_to_server(content_dict) - self.logger_object.log(f'Sent a file to server. Check LiveCarta.') + self.logger_object.log(f"Sent a file to server. Check LiveCarta.") except Exception as exc: self.status_wrapper.set_error() self.logger_object.log( - 'Error has occurred while reading json file.' + str(exc), logging.ERROR) + "Error has occurred while reading json file." 
+ str(exc), logging.ERROR) self.logger_object.log_error_to_main_log(str(exc)) diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py index 889aa25..56fe2f7 100644 --- a/src/docx_converter/docx2libre_html.py +++ b/src/docx_converter/docx2libre_html.py @@ -10,12 +10,12 @@ from src.util.helpers import BookLogger class Docx2LibreHTML: - def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None): - self.book_id = book_id + def __init__(self, book_id=0, file_path=None, access=None, logger=None, libre_locker=None): + self.book_id = book_id if book_id != 0 else pathlib.Path( + file_path).stem self.file_path = file_path self.access = access self.logger_object: BookLogger = logger - self.status_wrapper: status_wrapper = status_wrapper # critical section for occupying libreoffice by one thread self.libre_locker: Event() = libre_locker @@ -24,15 +24,15 @@ class Docx2LibreHTML: self.html_soup = self.read_html(self.html_path) def _libre_run(self, out_dir_path): - command = ['libreoffice', '--headless', - '--convert-to', 'html', f'{str(self.file_path)}', - '--outdir', f'{out_dir_path}'] + command = ["libreoffice", "--headless", + "--convert-to", "html", f"{str(self.file_path)}", + "--outdir", f"{out_dir_path}"] print(command) result = subprocess.run(command, stdout=PIPE, stderr=PIPE) - self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:' - f' {result.returncode}, {result.stdout}', logging.DEBUG) - self.logger_object.log(f'Any error while libre conversion for book_' - f'{self.book_id}: {result.stderr}', logging.DEBUG) + self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:" + f" {result.returncode}, {result.stdout}", logging.DEBUG) + self.logger_object.log(f"Any error while libre conversion for book_" + f"{self.book_id}: {result.stderr}", logging.DEBUG) def convert_docx_to_html(self): """ @@ -48,82 +48,73 @@ class Docx2LibreHTML: path to html 
file, file appears after libre-conversion """ - self.logger_object.log(f'File - {self.file_path}.') - print(f'{self.file_path}') - self.logger_object.log('Beginning of conversion from .docx to .html.') + def get_and_clear_flag(out_dir_path: str): + self.libre_locker.clear() + self.logger_object.log(f"Got flag!", logging.DEBUG) + self._libre_run(out_dir_path) + self.libre_locker.set() + self.logger_object.log("Cleared flag...", logging.DEBUG) - try: - f = open(self.file_path) - f.close() - except FileNotFoundError as error: - self.logger_object.log( - 'Invalid path to input data.', logging.ERROR) - self.status_wrapper.set_error() - raise error + def check_file_exists(path, error_string: str): + try: + f = open(path) + f.close() + except FileNotFoundError as error: + self.logger_object.log( + error_string, logging.ERROR) + self.logger_object.log_error_to_main_log() + raise error + + self.logger_object.log(f"File - {self.file_path}.") + self.logger_object.log("Beginning of conversion from .docx to .html.") + + check_file_exists( + self.file_path, error_string="Invalid path to input data.") folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}') + out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}") pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) - is_book_converted = False try: if self.libre_locker.isSet(): - self.libre_locker.clear() - self.logger_object.log('Got flag...', logging.DEBUG) - self._libre_run(out_dir_path) - self.libre_locker.set() - self.logger_object.log('Cleared flag...', logging.DEBUG) - + get_and_clear_flag(out_dir_path) else: - while not self.libre_locker.isSet() and not is_book_converted: + while not self.libre_locker.isSet(): self.logger_object.log( - 'Waiting for libre...', logging.DEBUG) + "Waiting for libre...", logging.DEBUG) flag = self.libre_locker.wait(50) if flag: if self.libre_locker.isSet(): - 
self.libre_locker.clear() - self.logger_object.log(f'Got flag!', logging.DEBUG) - self._libre_run(out_dir_path) - self.libre_locker.set() + get_and_clear_flag(out_dir_path) break - except Exception as exc: self.logger_object.log( "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR) self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() raise exc - out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html') + out_dir_path = os.path.join(out_dir_path, f"{self.book_id}.html") html_path = pathlib.Path(out_dir_path) - try: - f = open(html_path) - f.close() - except FileNotFoundError as exc: - self.logger_object.log( - "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR) - self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() - raise exc + check_file_exists( + html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.") - self.logger_object.log('End of conversion from .docx to .html.') + self.logger_object.log("End of conversion from .docx to .html.") self.logger_object.log( - f'Input file path after conversion: {html_path}.') + f"Input file path after conversion: {html_path}.") return html_path def read_html(self, html_path): """Method for reading .html file into beautiful soup tag.""" try: - html_text = open(html_path, 'r', encoding='utf8').read() - self.logger_object.log('HTML for book has been loaded.') + html_text = open(html_path, "r", encoding="utf8").read() + self.logger_object.log("HTML for book has been loaded.") except FileNotFoundError as exc: - self.logger_object.log('There is no html to process.' - 'Conversion went wrong or you specified wrong paths.', logging.ERROR) + self.logger_object.log("There is no html to process." 
+ "Conversion went wrong or you specified wrong paths.", logging.ERROR) self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() raise exc - html_soup = BeautifulSoup(html_text, features='lxml') + html_soup = BeautifulSoup(html_text, features="lxml") return html_soup diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index b4aa9b3..5edeb46 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -14,7 +14,7 @@ class DocxBook(BookSolver): def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None): super().__init__(book_id, access, main_logger) - self.book_type = 'docx' + self.book_type = "docx" # critical section for occupying libreoffice by one thread self.libre_locker: Event() = libre_locker @@ -34,9 +34,9 @@ class DocxBook(BookSolver): """ # 1. Converts docx to html with LibreOffice - html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, - self.logger_object, self.status_wrapper, self.libre_locker) - # TODO presets + html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access, + self.logger_object, self.libre_locker) + # todo presets # 2. Parses and cleans html, gets list of tags, gets footnotes parser = HTMLDocxPreprocessor( @@ -46,26 +46,29 @@ class DocxBook(BookSolver): # 3. 
Parses from line structure to nested structure with JSONConverter json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers, - self.logger_object, self.status_wrapper) + self.logger_object) content_dict = json_converter.convert_to_dict() return content_dict if __name__ == "__main__": - docx_file_path = '../../docx/music_inquiry.docx' + docx_file_path = "../../books/docx/music_inquiry.docx" logger_object = BookLogger( - name='docx', book_id=docx_file_path.split('/')[-1]) + name="docx", book_id=docx_file_path.split("/")[-1]) + locker = Event() + locker.set() - html_converter = Docx2LibreHTML(file_path=docx_file_path) + html_converter = Docx2LibreHTML(file_path=docx_file_path, + logger=logger_object, libre_locker=locker) parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) content, footnotes, top_level_headers = parser.process_html( - html_converter.html_path) + html_path=html_converter.html_path, book_id=html_converter.book_id) json_converter = LibreHTML2JSONConverter( content, footnotes, top_level_headers, logger_object) content_dict = json_converter.convert_to_dict() - with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f: + with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f: json.dump(content_dict, f, ensure_ascii=False) diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py new file mode 100644 index 0000000..bda6733 --- /dev/null +++ b/src/docx_converter/footnotes_processing.py @@ -0,0 +1,73 @@ +import re +from bs4 import BeautifulSoup, NavigableString + + +def _clean_footnote_content(content): + content = content.strip() + return content.strip() + + +def process_footnotes(body_tag): + """Function returns list of footnotes and delete them from html_soup.""" + footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc") + footnote_content = body_tag.find_all( + "div", id=re.compile(r"^sdfootnote\d+$")) + 
footnote_amt = len(footnote_anchors) + + assert footnote_amt == len(footnote_content), \ + "Something went wrong with footnotes after libre conversion" + + footnotes = [] + + for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): + true_a_tag = cont_tag.find_all( + "a", class_=re.compile(r"^sdfootnote.+$"))[0] + + if true_a_tag.attrs.get("href") is None: + cont_tag.a.decompose() + continue + + assert anc_tag["name"] == true_a_tag["href"][1:], \ + "Something went wrong with footnotes after libre conversion" + + new_tag = BeautifulSoup(features="lxml").new_tag("sup") + new_tag["class"] = "footnote-element" + new_tag["data-id"] = i + 1 + new_tag["id"] = f"footnote-{i + 1}" + new_tag.string = "*" + anc_tag.replace_with(new_tag) + + # extra digits in footnotes from documents downloaded from livecarta + a_text = true_a_tag.text + if len(cont_tag.find_all("p")): + sup = cont_tag.find_all("p")[0].find("sup") + if sup and sup.text == a_text: + sup.decompose() + + for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}): + tag_a.decompose() + + # remove font-size + for span in cont_tag.find_all("span", {"style": re.compile("font-size")}): + style = span.get("style") + style = re.sub(r"font-size: \d+px", "", style) + if style == "": + del span.attrs["style"] + else: + span.attrs["style"] = style + + unicode_string = "" + for child in cont_tag.children: + if type(child) is NavigableString: + continue + if child.name == "blockquote": + unicode_string += str(child) + else: + unicode_string += child.decode_contents() + + content = _clean_footnote_content(unicode_string) + cont_tag.decompose() + + footnotes.append(content) + + return footnotes diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index e9683f4..a44df01 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -1,18 +1,17 @@ -import os import re import logging -import pathlib 
from typing import List -from shutil import copyfile from bs4 import BeautifulSoup, NavigableString, Tag from src.livecarta_config import LiveCartaConfig from src.util.helpers import BookLogger, BookStatusWrapper +from src.docx_converter.footnotes_processing import process_footnotes +from src.docx_converter.image_processing import process_images class HTMLDocxPreprocessor: - + def __init__(self, html_soup, logger_object, status_wrapper=None): self.body_tag = html_soup.body self.html_soup = html_soup @@ -21,7 +20,40 @@ class HTMLDocxPreprocessor: self.top_level_headers = None self.content = list() + def _process_toc_links(self): + def _check_parent_link_exist_in_toc(tag_with_link): + toc_links = [] + for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}): + link_name = a_tag.attrs["name"] + toc_item = self.body_tag.find("a", {"href": "#" + link_name}) + if toc_item: + toc_links.append(toc_item) + return len(toc_links) > 0 + """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" + toc_links = self.body_tag.find_all( + "a", {"name": re.compile(r"^_Toc\d+")}) + headers = [link.parent for link in toc_links] + outline_level = "1" # All the unknown outlines will be predicted as

+ for h_tag in headers: + if re.search(r"^h\d$", h_tag.name): + h_tag.a.unwrap() + # outline_level = tag.name[-1] # TODO: add prediction of the outline level + elif h_tag.name == "p": + exist_in_toc = _check_parent_link_exist_in_toc(h_tag) + if h_tag in self.body_tag.find_all("p") and exist_in_toc: + new_tag = BeautifulSoup( + features="lxml").new_tag("h" + outline_level) + text = h_tag.text + h_tag.replaceWith(new_tag) + new_tag.string = text + else: + # rethink document structure when you have toc_links, other cases? + self.logger_object.log(f"Something went wrong in processing toc_links." + f" Check the structure of the file. " + f"Tag name: {h_tag.name}") + def _clean_tag(self, tag: str, attr_name: str, attr_value: re): + # todo regex """ Function to clean tags by its name and attribute value. Parameters @@ -44,15 +76,16 @@ class HTMLDocxPreprocessor: tag.unwrap() def _clean_underline_links(self): + # todo regex """Function cleans meaningless tags before links.""" underlines = self.body_tag.find_all("u") for u in underlines: - if u.find_all('a'): + if u.find_all("a"): u.unwrap() - links = self.body_tag.find_all('a') + links = self.body_tag.find_all("a") for link in links: - u = link.find_all('u') + u = link.find_all("u") if u and len(u) == 1: u[0].unwrap() @@ -80,16 +113,12 @@ class HTMLDocxPreprocessor: """ size = re.search(r"font-size: (\d{1,3})pt", style) - if size is None: return style - size = size.group(1) new_size = cls.convert_pt_to_px(size) - if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: return "" - return re.sub(size + "pt", str(new_size) + "px", style) def _font_to_span(self): @@ -99,27 +128,18 @@ class HTMLDocxPreprocessor: """ fonts = self.body_tag.find_all("font") for font in fonts: - face = font.get("face") - style = font.get("style") - color = font.get("color") + face, style, color =\ + font.get("face"), font.get("style"), font.get("color") - font.attrs = {} - font.name = "span" + font.attrs, font.name = {}, "span" if style: style 
= self.convert_font_pt_to_px(style) if style != "": if color and color in LiveCartaConfig.COLORS_MAP: - style += f'; color: {color};' + style += f"; color: {color};" font.attrs["style"] = style elif color and color in LiveCartaConfig.COLORS_MAP: - font.attrs["style"] = f'color: {color};' - - if face is not None: - face = re.sub(r",[\w,\- ]*$", "", face) - if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(face): - font.attrs["face"] = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE[face] - else: - font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME + font.attrs["style"] = f"color: {color};" if len(font.attrs) == 0: font.unwrap() @@ -127,24 +147,18 @@ class HTMLDocxPreprocessor: # on this step there should be no more tags assert len(self.body_tag.find_all("font")) == 0 - def delete_content_before_toc(self): - # remove all tag upper the only in content !!! body tag is not updated - toc_tag = self.html_soup.new_tag('TOC') - if toc_tag in self.content: - ind = self.content.index(toc_tag) + 1 - self.content = self.content[ind:] - def clean_trash(self): - """Function to remove all styles and tags we don't need.""" - self._clean_tag('span', 'style', re.compile( - r'^background: #[\da-fA-F]{6}$')) + # todo make it regex dict + """Function to remove all styles and tags we don"t need.""" + self._clean_tag("span", "style", re.compile( + r"^background: #[\da-fA-F]{6}$")) # todo: check for another languages - self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) - self._clean_tag('span', 'style', re.compile( - '^letter-spacing: -?[\d.]+pt$')) + self._clean_tag("span", "lang", re.compile(r"^ru-RU$")) + self._clean_tag("span", "style", re.compile( + "^letter-spacing: -?[\d.]+pt$")) - self._clean_tag('font', 'face', re.compile( - r'^Times New Roman[\w, ]+$')) + self._clean_tag("font", "face", re.compile( + r"^Times New Roman[\w, ]+$")) self._clean_tag("a", "name", "_GoBack") self._clean_underline_links() @@ -153,63 +167,68 @@ class 
HTMLDocxPreprocessor: # replace toc with empty tag tables = self.body_tag.find_all( - "div", id=re.compile(r'^Table of Contents\d+')) + "div", id=re.compile(r"^Table of Contents\d+")) for table in tables: table.wrap(self.html_soup.new_tag("TOC")) table.decompose() + def _preprocessing_headings(self): + # todo regex + """Function to convert all lower level headings to p tags""" + pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$" + header_tags = self.body_tag.find_all(re.compile(pattern)) + for tag in header_tags: + tag.name = "p" + def _process_paragraph(self): """Function to process

tags (text-align and text-indent value).""" - paragraphs = self.body_tag.find_all('p') + paragraphs = self.body_tag.find_all("p") for p in paragraphs: # libre converts some \n into

with 2
# there we remove 1 unnecessary
- brs = p.find_all('br') + brs = p.find_all("br") text = p.text - if brs and text == '\n\n' and len(brs) == 2: + if brs and text == "\n\n" and len(brs) == 2: brs[0].decompose() indent_should_be_added = False - if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')): + if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")): indent_should_be_added = True - align = p.get('align') - style = p.get('style') + align = p.get("align") + style = p.get("style") if style: - indent = re.search(r'text-indent: ([\d.]{1,4})in', style) - margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style) + indent = re.search(r"text-indent: ([\d.]{1,4})in", style) + margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style) margin_right = re.search( - r'margin-right: ([\d.]{1,4})in', style) - margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style) + r"margin-right: ([\d.]{1,4})in", style) + margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style) margin_bottom = re.search( - r'margin-bottom: ([\d.]{1,4})in', style) + r"margin-bottom: ([\d.]{1,4})in", style) else: - indent = None - margin_left = None - margin_right = None - margin_top = None - margin_bottom = None + indent = margin_left = margin_right = \ + margin_top = margin_bottom = None if margin_left and margin_right and margin_top and margin_bottom and \ - margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ - margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11': - p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) + margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \ + margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11": + p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote")) p.attrs = {} - style = '' + style = "" if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: - style += f'text-align: {align};' + style += f"text-align: {align};" if indent is not None or indent_should_be_added: # indent = indent.group(1) - 
style += f'text-indent: {LiveCartaConfig.INDENT};' + style += f"text-indent: {LiveCartaConfig.INDENT};" if style: - p.attrs['style'] = style + p.attrs["style"] = style def _process_two_columns(self): """Function to process paragraphs which has two columns layout.""" @@ -220,41 +239,6 @@ class HTMLDocxPreprocessor: child["class"] = "columns2" div.unwrap() - def _process_tables(self): - """Function to process tables. Set "border" attribute.""" - - tables = self.body_tag.find_all("table") - for table in tables: - tds = table.find_all("td") - - sizes = [] - for td in tds: - style = td.get('style') - - if style: - match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) - - if match: - size = match.group(1) - units = match.group(2) - - if units == "pt": - size = self.convert_pt_to_px(size) - - sizes.append(float(size)) - - width = td.get('width') - - td.attrs = {} - if width: - td.attrs['width'] = width - - if sizes: - border_size = sum(sizes) / len(sizes) - table.attrs['border'] = f'{border_size:.2}' - - self.tables_amount = len(tables) - def _process_quotes(self): """ Function to process block quotes. @@ -277,9 +261,9 @@ class HTMLDocxPreprocessor: for table in tables: trs = table.find_all("tr") tds = table.find_all("td") - if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600': + if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600": td = tds[0] - is_zero_border = 'border: none;' in td.get('style') + is_zero_border = "border: none;" in td.get("style") paragraphs = td.find_all("p") has_i_tag_or_br = [(p.i, p.br) for p in paragraphs] has_i_tag_or_br = [x[0] is not None or x[1] is not None @@ -287,231 +271,79 @@ class HTMLDocxPreprocessor: if all(has_i_tag_or_br) and is_zero_border: new_div = BeautifulSoup( - features='lxml').new_tag('blockquote') + features="lxml").new_tag("blockquote") for p in paragraphs: new_div.append(p) table.replaceWith(new_div) + def _process_tables(self): + """Function to process tables. 
Set "border" attribute.""" + tables = self.body_tag.find_all("table") + for table in tables: + tds = table.find_all("td") + + sizes = [] + for td in tds: + style = td.get("style") + + if style: + match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) + + if match: + size = match.group(1) + units = match.group(2) + + if units == "pt": + size = self.convert_pt_to_px(size) + + sizes.append(float(size)) + + width = td.get("width") + + td.attrs = {} + if width: + td.attrs["width"] = width + + if sizes: + border_size = sum(sizes) / len(sizes) + table.attrs["border"] = f"{border_size:.2}" + + self.tables_amount = len(tables) + def _process_hrefs(self): a_tags_with_href = self.body_tag.find_all( - 'a', {'href': re.compile('^.*http.+')}) + "a", {"href": re.compile("^.*http.+")}) # remove char=end of file for some editors for tag in a_tags_with_href: - tag.string = tag.text.replace('\u200c', '') - tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + tag.string = tag.text.replace("\u200c", "") + tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") a_tags_with_href = self.body_tag.find_all( - 'a', {'href': re.compile('^(?!#sdfootnote)')}) + "a", {"href": re.compile("^(?!#sdfootnote)")}) for tag in a_tags_with_href: - tag.string = tag.text.replace('\u200c', '') - tag.string = tag.text.replace('\u200b', '') # zero-width-space - tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') - - @staticmethod - def _clean_footnote_content(content): - content = content.strip() - return content.strip() - - def _process_footnotes(self): - """Function returns list of footnotes and delete them from html_soup.""" - footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') - footnote_content = self.body_tag.find_all( - 'div', id=re.compile(r'^sdfootnote\d+$')) - footnote_amt = len(footnote_anchors) - - assert footnote_amt == len(footnote_content), \ - 'Something went wrong with footnotes after libre conversion' - - footnotes = [] - - for i, (anc_tag, 
cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): - true_a_tag = cont_tag.find_all( - 'a', class_=re.compile(r'^sdfootnote.+$'))[0] - - if true_a_tag.attrs.get('href') is None: - cont_tag.a.decompose() - continue - - assert anc_tag['name'] == true_a_tag['href'][1:], \ - 'Something went wrong with footnotes after libre conversion' - - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' - anc_tag.replace_with(new_tag) - - # extra digits in footnotes from documents downloaded from livecarta - a_text = true_a_tag.text - if len(cont_tag.find_all('p')): - sup = cont_tag.find_all('p')[0].find('sup') - if sup and sup.text == a_text: - sup.decompose() - - for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): - tag_a.decompose() - - # remove font-size - for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): - style = span.get('style') - style = re.sub(r"font-size: \d+px", "", style) - if style == '': - del span.attrs['style'] - else: - span.attrs['style'] = style - - unicode_string = '' - for child in cont_tag.children: - if type(child) is NavigableString: - continue - if child.name == 'blockquote': - unicode_string += str(child) - else: - unicode_string += child.decode_contents() - - content = self._clean_footnote_content(unicode_string) - cont_tag.decompose() - - footnotes.append(content) - - self.footnotes = footnotes - - def _process_images(self, access, html_path, book_id): - """ - Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. - For now images are moved to one folder. 
- """ - img_tags = self.body_tag.find_all('img') - - if len(img_tags): - if access is None: - folder_path = os.path.dirname( - os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - for img in img_tags: - img_name = img.attrs.get('src') - # quick fix for bad links - if (len(img_name) >= 3) and img_name[:3] == '../': - img_name = img_name[3:] - - img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') - - if access is not None: - link = access.send_image(img_path, doc_id=book_id) - img.attrs['src'] = link - self.logger_object.log( - f'{img_name} successfully uploaded.') - else: - img_size = os.path.getsize(img_path) - self.logger_object.log( - f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG) - new_img_path = new_path / img_name - copyfile(img_path, new_img_path) - img.attrs["src"] = str(new_img_path) - - self.images = img_tags + tag.string = tag.text.replace("\u200c", "") + tag.string = tag.text.replace("\u200b", "") # zero-width-space + tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") def _process_footer(self): + # todo regex """ Function to process

tags. All the tags will be deleted from file. """ - divs = self.body_tag.find_all('div', {'title': 'footer'}) + divs = self.body_tag.find_all("div", {"title": "footer"}) for div in divs: div.decompose() def _process_div(self): + # todo regex """Function to process
tags. All the tags will be deleted from file, all content of the tags will stay.""" divs = self.body_tag.find_all("div") - for div in divs: div.unwrap() - def _check_parent_link_exist_in_toc(self, tag_with_link): - toc_links = [] - for a_tag in tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')}): - link_name = a_tag.attrs['name'] - toc_item = self.body_tag.find("a", {'href': '#' + link_name}) - if toc_item: - toc_links.append(toc_item) - - return len(toc_links) > 0 - - def _process_toc_links(self): - """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" - toc_links = self.body_tag.find_all( - "a", {'name': re.compile(r'^_Toc\d+')}) - headers = [link.parent for link in toc_links] - outline_level = "1" # All the unknown outlines will be predicted as

- for tag in headers: - if re.search(r"^h\d$", tag.name): - tag.a.unwrap() - # outline_level = tag.name[-1] # TODO: add prediction of the outline level - elif tag.name == "p": - exist_in_toc = self._check_parent_link_exist_in_toc(tag) - if tag in self.body_tag.find_all("p") and exist_in_toc: - new_tag = BeautifulSoup( - features="lxml").new_tag("h" + outline_level) - text = tag.text - tag.replaceWith(new_tag) - new_tag.string = text - else: - # rethink document structure when you have toc_links, other cases? - self.logger_object.log(f'Something went wrong in processing toc_links.' - f' Check the structure of the file. ' - f'Tag name: {tag.name}') - - @staticmethod - def clean_title_from_numbering(title: str): - """Function to remove digits from headers.""" - title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. 
?)+', '', title) # delete chapter I, (ABC) from the title - return title - - @staticmethod - def clean_tag_from_tabs(tag: NavigableString): - cleaned = re.sub(r'(\s+)+', ' ', tag) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - tag.replace_with(this) - # print('input: ', repr(tag)) - # print('test: ', repr(cleaned)) - - def clean_tag_from_numbering(self, tag): - cleaned = self.clean_title_from_numbering(tag) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - tag.replace_with(this) - # print('input: ', repr(tag)) - # print('test: ', repr(cleaned)) - - def apply_func_to_last_child(self, tag, func=None): - """ - works only with constructions like (((child to work with))) - where child is object of NavigableString - """ - if type(tag) is NavigableString: - func(tag) - else: - children = list(tag.children) - if children: - self.apply_func_to_last_child(children[0], func) - - def _preprocessing_headings(self): - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = self.body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - def _get_top_level_headers(self): """ Function for gathering info about top-level chapters. @@ -539,27 +371,26 @@ class HTMLDocxPreprocessor: tag.parent.unwrap() title = tag.text - title = re.sub(r'\s+', ' ', title).strip() - number = re.match(r'^(?:\.?\d+\.? ?)+', title) + title = re.sub(r"\s+", " ", title).strip() + number = re.match(r"^(?:\.?\d+\.? 
?)+", title) is_numbered = number is not None - cleaned_title = self.clean_title_from_numbering(tag.text) - is_introduction = cleaned_title.lower() == 'introduction' + cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text) + is_introduction = cleaned_title.lower() == "introduction" headers_info.append({ - 'title': cleaned_title, - 'is_numbered': is_numbered, - 'is_introduction': is_introduction}) - + "title": cleaned_title, + "is_numbered": is_numbered, + "is_introduction": is_introduction}) return headers_info def _mark_introduction_headers(self): """ Function to find out: - what header shouldn't be numbered and can be treated as introduction chapter + what header shouldn"t be numbered and can be treated as introduction chapter Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header - 2. it is first header from the top level list, and it equals to 'introduction' + 2. it is first header from the top level list, and it equals to "introduction" Returns ------- @@ -567,9 +398,9 @@ class HTMLDocxPreprocessor: mark each top-level header with flag should_be_numbered = true/false """ - is_numbered_header = [header['is_numbered'] + is_numbered_header = [header["is_numbered"] for header in self.top_level_headers] - is_title = [header['is_introduction'] + is_title = [header["is_introduction"] for header in self.top_level_headers] first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 @@ -577,14 +408,34 @@ class HTMLDocxPreprocessor: first_header_is_introduction = is_title and is_title[0] if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction: - self.top_level_headers[0]['should_be_numbered'] = False + self.top_level_headers[0]["should_be_numbered"] = False for i in range(1, len(self.top_level_headers)): - self.top_level_headers[i]['should_be_numbered'] = True + self.top_level_headers[i]["should_be_numbered"] = True else: for i in range(0, len(self.top_level_headers)): - 
self.top_level_headers[i]['should_be_numbered'] = True + self.top_level_headers[i]["should_be_numbered"] = True + + @staticmethod + def clean_title_from_tabs(tag: NavigableString): + cleaned = re.sub(r"[\s\xa0]", " ", tag) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) + tag.replace_with(this) + + def apply_func_to_last_child(self, tag, func=None): + """ + works only with constructions like (((child to work with))) + where child is object of NavigableString + """ + if type(tag) is NavigableString: + func(tag) + else: + children = list(tag.children) + if children: + self.apply_func_to_last_child(children[0], func) def _process_headings(self): + # todo regex """ Function to process tags . Steps @@ -621,46 +472,36 @@ class HTMLDocxPreprocessor: while tag.parent.name == "ol": tag.parent.unwrap() - title = tag.text - title = self.clean_title_from_numbering(title) - if title == "": + cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text) + if cleaned_title == "": tag.unwrap() else: assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ - f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' + f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings." 
content = list(tag.children) - # do not take into account rubbish empty tags like , but don't remove them + # do not take into account rubbish empty tags like , but don"t remove them content = [item for item in content if - (type(item) is not NavigableString and item.text != '') + (type(item) is not NavigableString and item.text != "") or (type(item) is NavigableString)] + content[0] = "" if content[0] == " " else content[0] + content = [item for item in content if item != ""] + for i, item in enumerate(content): if type(content[i]) is NavigableString: - cleaned = re.sub(r'(\s+)+', ' ', content[i]) + cleaned = re.sub(r"(\s+)+", " ", content[i]) this = BeautifulSoup.new_string(BeautifulSoup( features="lxml"), cleaned, NavigableString) content[i].replace_with(this) content[i] = this else: self.apply_func_to_last_child( - content[i], self.clean_tag_from_tabs) - - content[0] = '' if content[0] == ' ' else content[0] - content = [item for item in content if item != ''] - - if type(content[0]) is NavigableString: - cleaned = self.clean_title_from_numbering(content[0]) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - content[0].replace_with(this) - content[0] = this - else: - self.apply_func_to_last_child( - content[0], self.clean_tag_from_numbering) + content[i], self.clean_title_from_tabs) def _process_lists(self): + # todo regex """ Function - process tags
  • . @@ -672,74 +513,76 @@ class HTMLDocxPreprocessor: uwrap

    tag with li """ - li_tags = self.body_tag.find_all("li") - for li_tag in li_tags: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() - def process_html(self, access=None, html_path='', book_id='local'): + def delete_content_before_toc(self): + # remove all tag upper the only in content !!! body tag is not updated + toc_tag = self.html_soup.new_tag("TOC") + self.content: List[Tag] = self.body_tag.find_all(recursive=False) + if toc_tag in self.content: + ind = self.content.index(toc_tag) + 1 + self.content = self.content[ind:] + + def process_html(self, access=None, html_path="", book_id=0): """Process html code to satisfy LiveCarta formatting.""" - self.logger_object.log('Beginning of processing .html file.') + self.logger_object.log("Beginning of processing .html file.") try: - self.logger_object.log(f'Processing TOC and headers.') + self.logger_object.log(f"Processing TOC and headers.") self._process_toc_links() self.clean_trash() # process main elements of the .html doc - self.logger_object.log(f'Processing main elements of html.') + self.logger_object.log(f"Processing main elements of html.") self._preprocessing_headings() self._process_paragraph() self._process_two_columns() - self.logger_object.log('Block quotes processing.') + self.logger_object.log("Block quotes processing.") self._process_quotes() - self.logger_object.log('Tables processing.') + self.logger_object.log("Tables processing.") self._process_tables() self.logger_object.log( - f'{self.tables_amount} tables have been processed.') + f"{self.tables_amount} tables have been processed.") - self.logger_object.log('Hrefs processing.') + self.logger_object.log("Hrefs processing.") self._process_hrefs() - self.logger_object.log('Footnotes processing.') - self._process_footnotes() + self.logger_object.log("Footnotes processing.") + self.footnotes = process_footnotes(self.body_tag) self.logger_object.log( - f'{len(self.footnotes)} footnotes have been processed.') + f"{len(self.footnotes)} footnotes 
have been processed.") - self.logger_object.log('Image processing.') - self._process_images( - access=access, html_path=html_path, book_id=book_id) + self.logger_object.log("Image processing.") + self.images = process_images(access=access, html_path=html_path, + book_id=book_id, body_tag=self.body_tag) self.logger_object.log( - f'{len(self.images)} images have been processed.') + f"{len(self.images)} images have been processed.") self._process_footer() self._process_div() - self.content = self.body_tag.find_all(recursive=False) - self.top_level_headers = self._get_top_level_headers() self._mark_introduction_headers() self._process_headings() - self.content: List[Tag] = self.body_tag.find_all(recursive=False) - self._process_lists() # delete text before table of content if exists self.delete_content_before_toc() except Exception as exc: self.logger_object.log( - 'Error has occurred while processing html.', logging.ERROR) + "Error has occurred while processing html.", logging.ERROR) self.logger_object.log_error_to_main_log() if self.status_wrapper: self.status_wrapper.set_error() raise exc - self.logger_object.log('End of processing .html file.') + self.logger_object.log("End of processing .html file.") return self.content, self.footnotes, self.top_level_headers diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py new file mode 100644 index 0000000..9c5fdab --- /dev/null +++ b/src/docx_converter/image_processing.py @@ -0,0 +1,34 @@ +import os +import pathlib +from shutil import copyfile + + +def process_images(access, html_path, book_id, body_tag): + """ + Function to process tag. + Img should be sent Amazon S3 and then return new tag with valid link. + For now images are moved to one folder. 
+ + """ + img_tags = body_tag.find_all("img") + for img in img_tags: + img_name = img.attrs.get("src") + # quick fix for bad links + if (len(img_name) >= 3) and img_name[:3] == "../": + img_name = img_name[3:] + img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}") + + if access is not None: + link = access.send_image(img_path, doc_id=book_id) + img.attrs["src"] = link + else: + if img_tags.index(img) == 0: + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f"../books/json/img_{book_id}/")) + new_path.mkdir(exist_ok=True) + new_img_path = new_path / img_name + copyfile(img_path, new_img_path) + img.attrs["src"] = str(new_img_path) + return img_tags diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py index 45522da..eb5f0a2 100644 --- a/src/docx_converter/libre_html2json_converter.py +++ b/src/docx_converter/libre_html2json_converter.py @@ -29,7 +29,7 @@ class LibreHTML2JSONConverter: cleaned text """ - new_text = re.sub(r'([\n\t])', ' ', html_text) + new_text = re.sub(r"([\n\t])", " ", html_text) return new_text # TODO: rethink the function structure without indexes. 
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter: """ if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: title = str(self.content[ind]) - title = title.replace(f'<{self.content[ind].name}>', '') - title = title.replace(f'</{self.content[ind].name}>', '') - title = re.sub(r'^\n', '', title) + title = title.replace(f"<{self.content[ind].name}>", "") + title = title.replace(f"</{self.content[ind].name}>", "") + title = re.sub(r"^\n", "", title) # extract outline from tag curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) result = { - 'title': f'{title}', - 'contents': [], - 'sub_items': [] + "title": f"{title}", + "contents": [], + "sub_items": [] } ch_content = [] ind += 1 @@ -71,9 +71,9 @@ class LibreHTML2JSONConverter: header_dict, ind = self.header_to_livecarta_chapter_item( ind) if ch_content: - result['contents'].append("".join(ch_content)) + result["contents"].append("".join(ch_content)) ch_content = [] - result['sub_items'].append(header_dict) + result["sub_items"].append(header_dict) # - current h_i <= h_initial, end of recursion else: # return result, ind @@ -85,21 +85,21 @@ ind += 1 if ch_content: - result['contents'].append("".join(ch_content)) + result["contents"].append("".join(ch_content)) return result, ind - return '' + return "" @staticmethod def _is_empty_p_tag(tag): - if tag.name != 'p': + if tag.name != "p": return False temp_tag = copy(tag) - brs = temp_tag.find_all('br') + brs = temp_tag.find_all("br") for br in brs: br.decompose() - text = re.sub(r'\s+', '', temp_tag.text) + text = re.sub(r"\s+", "", temp_tag.text) if text: return False @@ -107,10 +107,7 @@ def convert_to_dict(self): """Function which convert list of html nodes to appropriate json structure.""" - json_strc = [] - ind = 0 - ch_num = 0 - ch_amt = 0 + json_strc, ind, ch_num, ch_amt = [], 0, 0, 0 try: while ind < len(self.content): @@ -120,7 +117,7 @@ res, ind = self.header_to_livecarta_chapter_item(ind) else: - 
chapter_title = f'Untitled chapter {ch_num}' + chapter_title = f"Untitled chapter {ch_num}" chapter = [] while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: if not self._is_empty_p_tag(self.content[ind]): @@ -129,9 +126,9 @@ class LibreHTML2JSONConverter: ind += 1 if chapter: res = { - 'title': chapter_title, - 'contents': ["".join(chapter)], - 'sub_items': [] + "title": chapter_title, + "contents": ["".join(chapter)], + "sub_items": [] } ch_num += 1 @@ -139,10 +136,10 @@ class LibreHTML2JSONConverter: json_strc.append(res) ch_amt += 1 self.logger_object.log( - f'Chapter {ch_amt} has been added to structure.') + f"Chapter {ch_amt} has been added to structure.") except Exception as exc: self.logger_object.log( - 'Error has occurred while making json structure.', logging.ERROR) + "Error has occurred while making json structure.", logging.ERROR) self.logger_object.log_error_to_main_log() if self.book_api_status: self.book_api_status.set_error() @@ -151,10 +148,10 @@ class LibreHTML2JSONConverter: # Add is_introduction field to json structure # after deleting content before toc, some chapters can be deleted if self.top_level_headers: - same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title'] - is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] + same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"] + is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"] - json_strc[0]['is_introduction'] = is_first_header_introduction + json_strc[0]["is_introduction"] = is_first_header_introduction self.content_dict = { "content": json_strc, diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py deleted file mode 100644 index 2212bd5..0000000 --- a/src/epub_converter/css_preprocessing.py +++ /dev/null @@ -1,238 +0,0 @@ -import re -import cssutils - -from ebooklib import epub -from bs4 import 
BeautifulSoup -from itertools import takewhile - -from src.util.color_reader import str2hex -from src.livecarta_config import LiveCartaConfig - - -def get_text_color(x): - color = str2hex(x) - color = color if color not in ['#000000', '#000', 'black'] else '' - return color - - -def get_bg_color(x): - color = str2hex(x) - color = color if color not in ['#ffffff', '#fff', 'white'] else '' - return color - - -def convert_tag_style_values(size_value: str) -> str: - """ - Function - - converts values of tags from em/%/pt to px - - find closest font-size px - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - - """ - def find_closest_size(style_value): - possible_sizes = list( - takewhile(lambda x: style_value >= x, LiveCartaConfig.sizes_pr)) - last_possible_size_index = LiveCartaConfig.sizes_pr.index( - possible_sizes[-1]) - return LiveCartaConfig.sizes_px[last_possible_size_index] - - font_size_regexp = re.compile( - r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') - has_style_attrs = re.search(font_size_regexp, size_value) - if has_style_attrs: - if has_style_attrs.group(1): - size_value = float(size_value.replace('%', '')) / 100.0 - return find_closest_size(size_value) - elif has_style_attrs.group(3): - size_value = float(size_value.replace('em', '')) - return find_closest_size(size_value) - elif has_style_attrs.group(5): - return size_value.replace('pt', 'px') - else: - return '' - return size_value - - -def convert_indents_tag_values(size_value: str) -> str: - """ - Function converts values of ['text-indent', 'margin-left', 'margin'] - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - - """ - if len(size_value.split(' ')) == 3: - size_value = convert_tag_style_values(size_value.split( - ' ')[-2]) # returns middle value - else: - size_value = convert_tag_style_values(size_value.split( - ' ')[-1]) # returns last value - return size_value - - -""" -Dictionary LIVECARTA_STYLE_ATTRS = { 
css property: value } -Style properties that can be used to fit livecarta css style convention. -If property has empty list, it means that any value can be converted. -If property has not empty list, it means that only certain property-value combinations can be transformed. -""" -LIVECARTA_STYLE_ATTRS = { - 'text-indent': [], - 'font-variant': ['small-caps'], - 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - 'align': [], - 'font': [], - 'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() - if x != LiveCartaConfig.DEFAULT_FONT_NAME], - 'font-size': [], - 'font-weight': ['bold', '600', '700', '800', '900'], # - 'font-style': ['italic'], # - 'text-decoration': ['underline', 'line-through'], # , - 'text-decoration-line': ['underline', 'line-through'], # , - 'vertical-align': ['super'], # - 'color': [], - 'background-color': [], - 'background': [], - 'width': [], - 'border': [], - 'border-top-width': [], - 'border-right-width': [], - 'border-left-width': [], - 'border-bottom-width': [], - 'border-top': [], - 'border-bottom': [], - 'list-style-type': [], - 'list-style-image': [], - 'margin-left': [], - 'margin-top': [], - 'margin': [], -} - -""" -Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } - -Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated -to suit livecarta style convention. 
-""" -LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': convert_indents_tag_values, - 'font-variant': lambda x: x, - 'text-align': lambda x: x, - 'font': lambda x: '', - 'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) - or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)), - 'font-size': convert_tag_style_values, - 'color': get_text_color, - 'background-color': get_bg_color, - 'background': get_bg_color, - 'border': lambda x: x if x != '0' else '', - 'border-top-width': lambda x: x if x != '0' else '', - 'border-right-width': lambda x: x if x != '0' else '', - 'border-left-width': lambda x: x if x != '0' else '', - 'border-bottom-width': lambda x: x if x != '0' else '', - 'border-top': lambda x: x if x != '0' else '', - 'border-bottom': lambda x: x if x != '0' else '', - 'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc', - 'list-style-image': lambda x: 'disc', - 'margin-left': convert_indents_tag_values, - 'margin-top': convert_tag_style_values, - 'margin': convert_indents_tag_values -} - - -def update_inline_styles_to_livecarta_convention(split_style: list): - for i, style in enumerate(split_style): - style_name, style_value = style.split(":") - if style_name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = '' - return split_style - - cleaned_value = style_value.replace('\"', '').split()[-1] - constraints_on_value = LIVECARTA_STYLE_ATTRS.get( - style_name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_name] - if constraints_on_value and value_not_in_possible_values_list: - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = '' - else: - if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name] - style_value = 
func(cleaned_value) - split_style[i] = style_name + ":" + style_value - return split_style - - -def build_inline_style_content(style: str) -> str: - """Build inline style with livecarta convention""" - # replace all spaces between '; & letter' to ';' - style = re.sub(r"; *", ";", style) - # when we split style by ';', last element of the list is '' - None - # remove it - split_style: list = list(filter(None, style.split(';'))) - # replace all spaces between ': & letter' to ':' - split_style = [el.replace( - re.search(r'(:\s*)', el).group(1), ':') for el in split_style] - - split_style = update_inline_styles_to_livecarta_convention(split_style) - style = "; ".join(split_style) - return style - - -def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule, - style_type: cssutils.css.property.Property): - if style_type.name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' - return - - cleaned_value = style_type.value.replace('\"', '') - constraints_on_value = LIVECARTA_STYLE_ATTRS.get( - style_type.name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_type.name] - if constraints_on_value and value_not_in_possible_values_list: - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' - else: - if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] - css_rule.style[style_type.name] = func(cleaned_value) - - -def build_css_file_content(css_content: str) -> str: - """Build css content with livecarta convention""" - sheet = cssutils.parseString(css_content.lower(), validate=False) - - for css_rule in sheet: - if css_rule.type == css_rule.STYLE_RULE: - for style_type in css_rule.style: - update_css_styles_to_livecarta_convention( - css_rule, style_type) - - css_text: str = 
sheet._getCssText().decode() - return css_text - - -if __name__ == '__main__': - file = '../../epub/9781627222174.epub' - ebooklib_book = epub.read_epub(file) - css_ = ebooklib_book.get_item_with_href('css/epub.css') - css_ = css_.get_content().decode() - css_cleaned = build_css_file_content(css_) - html_ = ebooklib_book.get_item_with_href( - 'pr01s05.xhtml').get_body_content().decode() - html_soup = BeautifulSoup(html_, features='lxml') diff --git a/src/epub_converter/css_processor.py b/src/epub_converter/css_processor.py new file mode 100644 index 0000000..2be0dab --- /dev/null +++ b/src/epub_converter/css_processor.py @@ -0,0 +1,216 @@ +import re +import cssutils +from bs4 import BeautifulSoup +from os.path import dirname, normpath, join + +from src.util.color_reader import str2hex +from src.livecarta_config import LiveCartaConfig + + +class CSSPreprocessor: + def __init__(self): + """ + Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } + + Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated + to suit LiveCarta style convention. 
+ """ + self.LIVECARTA_STYLE_ATTRS_MAPPING = { + "text-indent": self.convert_indents_tag_values, + "font-variant": lambda x: x, + "text-align": lambda x: x, + "font": lambda x: "", + "font-family": lambda x: x, + "font-size": self.convert_tag_style_values, + "color": self.get_text_color, + "background-color": self.get_bg_color, + "background": self.get_bg_color, + "border": lambda x: x if x != "0" else "", + "border-top-width": lambda x: x if x != "0" else "", + "border-right-width": lambda x: x if x != "0" else "", + "border-left-width": lambda x: x if x != "0" else "", + "border-bottom-width": lambda x: x if x != "0" else "", + "border-top": lambda x: x if x != "0" else "", + "border-bottom": lambda x: x if x != "0" else "", + "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", + "list-style-image": lambda x: "disc", + "margin-left": self.convert_indents_tag_values, + "margin-top": self.convert_tag_style_values, + "margin": self.convert_indents_tag_values, + "width": self.convert_tag_style_values, + } + + @staticmethod + def get_text_color(x): + color = str2hex(x) + color = color if color not in ["#000000", "#000", "black"] else "" + return color + + @staticmethod + def get_bg_color(x): + color = str2hex(x) + color = color if color not in ["#ffffff", "#fff", "white"] else "" + return color + + @staticmethod + def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str: + """ + Function + - converts values of tags from em/%/pt to px + - find closest font-size px + Parameters + ---------- + size_value: str + + is_indent: bool + + Returns + ------- + size_value: str + converted value size + """ + size_regexp = re.compile( + r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") + has_style_attrs = re.search(size_regexp, size_value) + if has_style_attrs: + if has_style_attrs.group(1): + multiplier = 5.76 if is_indent else 0.16 + size_value = float(size_value.replace("%", "")) * multiplier + return 
str(size_value)+'px' + elif has_style_attrs.group(3): + multiplier = 18 if is_indent else 16 + size_value = float(size_value.replace("em", "")) * multiplier + return str(size_value)+'px' + elif has_style_attrs.group(5): + size_value = float(size_value.replace("pt", "")) * 4/3 + return str(size_value)+'px' + else: + return "" + return size_value + + def convert_indents_tag_values(self, size_value: str) -> str: + """ + Function converts values of ["text-indent", "margin-left", "margin"] + Parameters + ---------- + size_value: str + + Returns + ------- + size_value: str + + """ + size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\ + else self.convert_tag_style_values(size_value.split(" ")[-1], True) + return size_value + + @staticmethod + def clean_value(style_value: str, style_name: str): + cleaned_value = style_value.replace("\"", "") + if style_name == 'font-family': + for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]: + cleaned_value = re.sub( + re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value) + return cleaned_value + + @staticmethod + def style_conditions(style_value: str, style_name: str) -> tuple[bool, bool]: + constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( + style_name) + value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ + style_name] + return constraints_on_value, value_not_in_possible_values_list + + def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: + for i, style in enumerate(split_style): + style_name, style_value = style.split(":") + if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = "" + return split_style + + cleaned_value = self.clean_value(style_value, style_name) + if all(self.style_conditions(cleaned_value, style_name)): + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove 
from css file + split_style[i] = "" + else: + if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name] + style_value = func(cleaned_value) + split_style[i] = style_name + ":" + style_value + return split_style + + def build_inline_style_content(self, style: str) -> str: + """Build inline style with LiveCarta convention""" + # replace all spaces between "; & letter" to ";" + style = re.sub(r"; *", ";", style) + # when we split style by ";", last element of the list is "" - None (we remove it) + split_style: list = list(filter(None, style.split(";"))) + # replace all spaces between ": & letter" to ":" + split_style = [el.replace( + re.search(r"(:\s*)", el).group(1), ":") for el in split_style] + + split_style = self.update_inline_styles_to_livecarta_convention( + split_style) + style = "; ".join(split_style) + return style + + def process_inline_styles_in_html_soup(self, html_href2html_body_soup: dict): + """This function is designed to convert inline html styles""" + for html_href in html_href2html_body_soup: + html_content: BeautifulSoup = html_href2html_body_soup[html_href] + tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + for tag_initial_inline_style in tags_with_inline_style: + inline_style = tag_initial_inline_style.attrs["style"] + tag_initial_inline_style.attrs["style"] = \ + self.build_inline_style_content(inline_style) + + @staticmethod + def get_css_content(css_href, html_href, ebooklib_book): + path_to_css_from_html = css_href + html_folder = dirname(html_href) + path_to_css_from_root = normpath( + join(html_folder, path_to_css_from_html)).replace("\\", "/") + css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root) + # if in css file we import another css + if "@import" in str(css_obj.content): + path_to_css_from_root = "css/" + \ + re.search('"(.*)"', 
str(css_obj.content)).group(1) + css_obj = ebooklib_book.get_item_with_href( + path_to_css_from_root) + assert css_obj, f"Css style {css_href} was not in manifest." + css_content: str = css_obj.get_content().decode() + return css_content + + def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, + style_type: cssutils.css.property.Property): + if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = "" + return + + cleaned_value = self.clean_value(style_type.value, style_type.name) + if all(self.style_conditions(cleaned_value, style_type.name)): + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = "" + else: + if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] + css_rule.style[style_type.name] = func(cleaned_value) + + def build_css_file_content(self, css_content: str) -> str: + """Build css content with LiveCarta convention""" + sheet = cssutils.parseString(css_content, validate=False) + + for css_rule in sheet: + if css_rule.type == css_rule.STYLE_RULE: + for style_type in css_rule.style: + self.update_css_styles_to_livecarta_convention( + css_rule, style_type) + + css_text: str = sheet._getCssText().decode() + return css_text diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index dc8d3a2..fb3b786 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -1,39 +1,40 @@ import re import json import codecs -import os -from os.path import dirname, normpath, join -from itertools import chain -from collections import defaultdict -from typing import Dict, Union, List - - import ebooklib from ebooklib import epub from ebooklib.epub import Link, Section -from bs4 import BeautifulSoup, 
Tag - +from os import path +from pathlib import Path +from itertools import chain +from premailer import transform +from collections import defaultdict +from typing import Dict, Union, List +from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger +from src.epub_converter.css_processor import CSSPreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint -from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content -from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ - prepare_title, prepare_content, update_images_src_links, preprocess_footnotes +from src.epub_converter.image_processing import update_images_src_links +from src.epub_converter.footnotes_processing import preprocess_footnotes +from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor class EpubConverter: - def __init__(self, file_path, access=None, logger=None): - self.file_path = file_path + def __init__(self, book_path, access=None, logger=None, css_processor=None, html_processor=None): + self.book_path = book_path self.access = access self.logger: BookLogger = logger - self.ebooklib_book = epub.read_epub(file_path) + self.ebooklib_book = epub.read_epub(book_path) + self.css_processor = css_processor + self.html_processor = html_processor # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # enumerate all subchapter id for each file - self.html_href2subchapter_ids = defaultdict(list) + self.html_href2subchapters_ids = defaultdict(list) self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC # toc tree structure stored as adj.list (NavPoint to list of 
NavPoints) @@ -57,55 +58,51 @@ class EpubConverter: self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote - self.logger.log('Image processing.') + self.logger.log("Image processing.") for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): file_name = x.file_name content = x.content self.img_href2img_bytes[file_name] = content - self.logger.log('HTML files reading.') + self.logger.log("HTML files reading.") self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() - # TODO Presets - self.logger.log('Process CSS inline styles.') - self.process_inline_styles_in_html_soup() - self.logger.log('CSS files processing.') + self.logger.log("CSS inline style processing.") + self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup) + self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log('CSS styles adding.') + self.logger.log("CSS styles fusion(inline+file).") self.add_css_styles_to_html_soup() - self.logger.log('Footnotes processing.') + self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: - content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], - self.html_href2html_body_soup) - self.footnotes_contents.extend(content) - self.noterefs.extend(noterefs) - self.footnotes.extend(footnotes_tags) + self.footnotes_contents, self.noterefs, self.footnotes =\ + preprocess_footnotes( + self.html_href2html_body_soup[href], self.html_href2html_body_soup) + self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") - for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): - noteref.attrs['data-id'] = i + 1 - noteref.attrs['id'] = f'footnote-{i + 1}' - footnote.attrs['href'] = f'#footnote-{i + 1}' - - 
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.') - self.logger.log('TOC processing.') + self.logger.log("TOC processing.") self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed if self.is_toc_empty(): self.build_adjacency_list_from_spine() not_added = [ x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] - self.logger.log(f'Html documents not added to TOC: {not_added}.') + self.logger.log(f"Html documents not added to TOC: {not_added}.") + self.logger.log(f"Add documents not added to TOC.") self.add_not_added_files_to_adjacency_list(not_added) - self.logger.log(f'Html internal links and structure processing.') - self.label_chapters_ids_with_tmp_id() - # used only after parsed toc, ids from toc needed - self.process_html_soup_structure_to_line() + self.logger.log(f"Label subchapters with converter tag.") + self.label_subchapters_with_lc_tag() + self.logger.log(f"Process html internal links.") self.process_internal_links() - self.logger.log(f'Building chapters content.') - self.define_chapters_content() + self.logger.log( + f"Check if converter-chapter-marks are on the same level.") + self.chapter_marks_are_same_level() + self.logger.log(f"Define chapters content.") + self.define_chapters_with_content() + self.logger.log(f"Converting html_nodes to LiveCarta chapter items.") def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: # using EpubElements @@ -115,38 +112,10 @@ class EpubConverter: for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_body_text = item.get_body_content() # html.parser closes tags if needed - soup = BeautifulSoup(html_body_text, features='html.parser') + soup = BeautifulSoup(html_body_text, features="html.parser") nodes[item.file_name] = soup return nodes - def get_css_content(self, css_href, html_href): - path_to_css_from_html = css_href - html_folder = dirname(html_href) - path_to_css_from_root = normpath( - 
join(html_folder, path_to_css_from_html)).replace('\\', '/') - css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) - # if in css file we import another css - if "@import" in str(css_obj.content): - path_to_css_from_root = "css/" + \ - re.search('"(.*)"', str(css_obj.content)).group(1) - css_obj = self.ebooklib_book.get_item_with_href( - path_to_css_from_root) - assert css_obj, f'Css style {css_href} was not in manifest.' - css_content: str = css_obj.get_content().decode() - return css_content - - def process_inline_styles_in_html_soup(self): - """This function is designed to convert inline html styles""" - for html_href in self.html_href2html_body_soup: - html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={'style': re.compile('.*')}) - - for tag_initial_inline_style in tags_with_inline_style: - inline_style = tag_initial_inline_style.attrs['style'] - tag_initial_inline_style.attrs['style'] = \ - build_inline_style_content(inline_style) - def build_html_and_css_relations(self) -> tuple[dict, dict]: """ Function is designed to get 2 dictionaries: @@ -167,39 +136,81 @@ class EpubConverter: for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_content = item.content html_href = item.file_name - soup_html_content = BeautifulSoup(html_content, features='lxml') + soup_html_content = BeautifulSoup(html_content, features="lxml") # check if file links to css file - for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): + for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}): # alternate page of original page (e.g. 
another language) - if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): + if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]): continue - css_href = tag.attrs.get('href') + css_href = tag.attrs.get("href") html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict - css_href2css_content[css_href] = build_css_file_content( - self.get_css_content(css_href, html_href)) + css_href2css_content[css_href] = self.css_processor.build_css_file_content( + self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book)) - for i, tag in enumerate(soup_html_content.find_all('style')): + for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string - html_href2css_href[html_href].append(f'href{i}') - css_href2css_content[f'href{i}'] = build_css_file_content( + html_href2css_href[html_href].append(f"href{i}") + css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content( css_content) return html_href2css_href, css_href2css_content + @staticmethod + def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: + """ + Function adds styles from .css to inline style. 
+ Parameters + ---------- + html_soup: BeautifulSoup + html page with inline style + css_text: str + css content from css file + Returns + ------- + inline_soup: BeautifulSoup + soup with styles from css + + """ + # remove this specification because it causes problems + css_text = css_text.replace( + '@namespace epub "http://www.idpf.org/2007/ops";', '') + # here we add css styles to inline style + html_with_css_styles: str = transform(str(html_soup), css_text=css_text, + remove_classes=False, + external_styles=False, + allow_network=False, + disable_validation=True, + ) + # soup with converted styles from css + inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") + + tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + # go through the tags with inline style + style parsed from css file + for tag_inline_style in tags_with_inline_style: + style_converter = TagInlineStyleProcessor(tag_inline_style) + style_converter.convert_initial_tag() + return inline_soup + def add_css_styles_to_html_soup(self): """ This function is designed to update html_href2html_body_soup - add to html_inline_style css_style_content - + Returns + ------- + None + updated soups with styles from css """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): - css = '' + css = "" for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - html_content = convert_html_soup_with_css_style(html_content, css) + html_content = self.modify_html_soup_with_css_styles( + html_content, css) self.html_href2html_body_soup[html_href] = html_content def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): @@ -226,7 +237,7 @@ class EpubConverter: nav_point = NavPoint(element) if nav_point.id: self.id_anchor_exist_in_nav_points = True - 
self.html_href2subchapter_ids[nav_point.href].append( + self.html_href2subchapters_ids[nav_point.href].append( nav_point.id) self.adjacency_list[nav_point] = None self.hrefs_added_to_toc.add(nav_point.href) @@ -238,12 +249,12 @@ class EpubConverter: nav_point = NavPoint(first) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.html_href2subchapter_ids[nav_point.href].append( + self.html_href2subchapters_ids[nav_point.href].append( nav_point.id) sub_nodes = [] for elem in second: - if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: + if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1: self.offset_sub_nodes.append( self.build_adjacency_list_from_toc(elem, lvl)) else: @@ -267,7 +278,7 @@ class EpubConverter: self.adjacency_list[-1] = nodes else: - assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' + assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}" def is_toc_empty(self) -> bool: """Function checks is toc empty""" @@ -276,14 +287,14 @@ class EpubConverter: return True return False - def build_manifest_id2html_href(self) -> dict: - links = dict() - for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - links[item.id] = item.file_name - return links - def build_adjacency_list_from_spine(self): - manifest_id2html_href = self.build_manifest_id2html_href() + def build_manifest_id2html_href() -> dict: + links = dict() + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + links[item.id] = item.file_name + return links + + manifest_id2html_href = build_manifest_id2html_href() self.adjacency_list = { -1: [] } @@ -293,42 +304,49 @@ class EpubConverter: self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(nav_point.href) - def add_not_added_files_to_adjacency_list(self, not_added): + def add_not_added_files_to_adjacency_list(self, not_added: list): """Function add files that not added to 
adjacency list""" for i, file in enumerate(not_added): nav_point = NavPoint( - Section(f'To check #{i}, filename: {file}', file)) + Section(f"To check #{i}, filename: {file}", file)) self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(file) - def label_chapters_ids_with_tmp_id(self): + def label_subchapters_with_lc_tag(self): for html_href in self.html_href2html_body_soup: - ids = self.html_href2subchapter_ids[html_href] + ids, soup = self.html_href2subchapters_ids[html_href], \ + self.html_href2html_body_soup[html_href] for i in ids: - soup = self.html_href2html_body_soup[html_href] tag = soup.find(id=i) - new_h = soup.new_tag('tmp') - new_h.attrs['class'] = 'converter-chapter-mark' - new_h.attrs['id'] = i - tag.insert_before(new_h) + tmp_tag = soup.new_tag("lc_tmp") + tmp_tag.attrs["class"] = "converter-chapter-mark" + tmp_tag.attrs["id"] = i + tag.insert_before(tmp_tag) - def process_html_soup_structure_to_line(self): - # go to line structure + def chapter_marks_are_same_level(self): + """ + Function checks that marks for pointing a start of a chapter are placed on one level in html tree. + Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed. + This tag must have a chapter_tag as a parent. + Otherwise, it is wrapped with some tags. Like: +

    + + """ for html_href in self.html_href2html_body_soup: - soup = self.html_href2html_body_soup[html_href] - self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) + chapter_tag = self.html_href2html_body_soup[html_href] + # check marks for chapter starting are on the same level - 1st + marks = chapter_tag.find_all( + attrs={"class": "converter-chapter-mark"}) + + # fix marks to be on 1 level + for mark in marks: + while mark.parent != chapter_tag: + # todo warning! could reflect on formatting/internal links in some cases + mark.parent.unwrap() @staticmethod def create_unique_id(href, id_): - return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) - - @staticmethod - def create_new_anchor_span(soup, id_): - new_anchor_span = soup.new_tag("span") - new_anchor_span.attrs['id'] = id_ - new_anchor_span.attrs['class'] = 'link-anchor' - new_anchor_span.string = "\xa0" - return new_anchor_span + return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_) def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]: """ @@ -351,23 +369,31 @@ class EpubConverter: prepared content """ - dir_name = os.path.dirname(cur_file_path) - normed_path = os.path.normpath(os.path.join( - dir_name, href_in_link)).replace('\\', '/') + dir_name = path.dirname(cur_file_path) + normed_path = path.normpath(path.join( + dir_name, href_in_link)).replace("\\", "/") full_path = [ path for path in self.hrefs_added_to_toc if normed_path in path] if not full_path: - self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' - f'While processing href in {internal_link_tag}.') - internal_link_tag.attrs['converter-mark'] = 'bad-link' + self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. 
" + f"While processing href in {internal_link_tag}.") + internal_link_tag.attrs["converter-mark"] = "bad-link" return None if len(full_path) > 1: - self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' - f' while {internal_link_tag} processing. The first one will be chosen.') + self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}" + f" while {internal_link_tag} processing. The first one will be chosen.") return full_path[0] + @staticmethod + def create_new_anchor_span(soup, id_): + new_anchor_span = soup.new_tag("span") + new_anchor_span.attrs["id"] = id_ + new_anchor_span.attrs["class"] = "link-anchor" + new_anchor_span.string = "\xa0" + return new_anchor_span + def process_internal_links(self): """ Function @@ -376,8 +402,8 @@ class EpubConverter: Steps ---------- 1. rebuild ids to be unique in all documents - 2a. process anchor which is a whole xhtml file - 2b. process anchor which is an element in xhtml file + 2a. process anchor which is a whole htm|html|xhtml file + 2b. process anchor which is an element in htm|html|xhtml file Returns ------- @@ -385,99 +411,128 @@ class EpubConverter: process links in html """ - # 1. 
rebuild ids to be unique in all documents - for toc_href in self.hrefs_added_to_toc: - for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): - if tag.attrs.get('class') == 'converter-chapter-mark': - continue + def make_ids_unique(): + for toc_href in self.hrefs_added_to_toc: + for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): + if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]: + new_id = self.create_unique_id(toc_href, tag.attrs["id"]) + tag.attrs["id"] = new_id - if tag.attrs.get('class') == 'footnote-element': - continue + def process_file_anchor(): + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] + for internal_link_tag in soup.find_all("a", + {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}): + a_tag_href = internal_link_tag.attrs["href"] + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) + if a_tag_href_matched_to_toc: + new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + if new_id not in self.internal_anchors: + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] + new_anchor_span = self.create_new_anchor_span(soup, new_id) + # insert a new span to the beginning of the file + anchor_soup.insert(0, new_anchor_span) + self.internal_anchors.add(new_id) + del internal_link_tag.attrs["href"] - new_id = self.create_unique_id(toc_href, tag.attrs['id']) - tag.attrs['id'] = new_id + def process_file_element_anchor(): + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] + # process_file_element_anchor + for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}): + a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#") + 
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) if a_tag_href \ + else path.normpath(toc_href).replace("\\", "/") + if a_tag_href_matched_to_toc: + new_id = self.create_unique_id( + a_tag_href_matched_to_toc, a_tag_id) - # 2a. process anchor which is a whole xhtml file - internal_link_reg1 = re.compile( - r'(^(?!https?://).+\.(htm|html|xhtml)$)') - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): - a_tag_href = internal_link_tag.attrs['href'] - # find full path - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( - toc_href, a_tag_href, internal_link_tag) - if not a_tag_href_matched_to_toc: - continue - new_id = self.create_unique_id(a_tag_href_matched_to_toc, '') - internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' - if new_id not in self.internal_anchors: - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - new_anchor_span = self.create_new_anchor_span(soup, new_id) - # insert a new span to the beginning of the file - anchor_soup.insert(0, new_anchor_span) - self.internal_anchors.add(new_id) + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] + anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \ + anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote + if anchor_tags: + if len(anchor_tags) > 1: + self.logger.log(f"Warning in {toc_href}: multiple anchors:" + f"{len(anchor_tags)} found.\n" + f"{anchor_tags}\n" + f"While processing {internal_link_tag}") - del internal_link_tag.attrs['href'] + anchor_tag = anchor_tags[0] + assert anchor_tag.attrs["id"] in [new_id, a_tag_id] + # if anchor is found we could add placeholder for link creation on server side. 
+ internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + # create span to have cyclic links, link has 1 type of class, anchor another + if anchor_tag.attrs["id"] not in self.internal_anchors: + new_anchor_span = self.create_new_anchor_span( + soup, new_id) + anchor_tag.insert_before(new_anchor_span) + self.internal_anchors.add(new_id) + del anchor_tag.attrs["id"] + del internal_link_tag.attrs["href"] + else: + internal_link_tag.attrs["converter-mark"] = "bad-link" + self.logger.log(f"Error in {toc_href}." + f" While processing {internal_link_tag} no anchor found." + f" Should be anchor with new id={new_id} in" + f" {a_tag_href_matched_to_toc} file." + f" Old id={a_tag_id}") + # 1. make ids to be unique in all documents + make_ids_unique() + # 2a. process anchor which is a whole htm|html|xhtml file + process_file_anchor() + # 2b. process anchor which is an element in htm|html|xhtml file + process_file_element_anchor() - # 2b. process anchor which is an element in xhtml file - internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)') - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): - a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split( - '#') - # find full path - if a_tag_href: - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, - internal_link_tag) - else: - a_tag_href_matched_to_toc = os.path.normpath( - toc_href).replace('\\', '/') + @staticmethod + def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: + """ + Get tags between LiveCarta chapter marks + Parameters + ---------- + first_id: str + Id that point where a chapter starts. 
A Tag with class: "converter-chapter-mark" + href: str + Name of current chapters file + html_soup: Tag + Soup object of current file - if not a_tag_href_matched_to_toc: - continue + Returns + ------- + tags: list [Tag, NavigableString] + Chapter's tags - new_id = self.create_unique_id( - a_tag_href_matched_to_toc, a_tag_id) + """ + marked_tags = html_soup.find( + attrs={"id": first_id, "class": "converter-chapter-mark"}) + if marked_tags: + next_tag = marked_tags.next_sibling + tags = [] + while next_tag: + if not isinstance(next_tag, NavigableString) and \ + (next_tag.attrs.get("class") == "converter-chapter-mark"): + break + tags.append(next_tag) + next_tag = next_tag.next_sibling - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={'id': new_id, }) - anchor_tags = anchor_tags or anchor_soup.find_all( - attrs={'id': a_tag_id}) # if link is a footnote + # remove tags between first_id and next found id + # save them in list for next steps + tags = [tag.extract() for tag in tags] + html_soup.smooth() + else: + assert 0, f"Warning: no match for {first_id, href}" - if anchor_tags: - if len(anchor_tags) > 1: - self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n' - f'{anchor_tags}\n' - f' While processing {internal_link_tag}') + return tags - anchor_tag = anchor_tags[0] - assert anchor_tag.attrs['id'] in [new_id, a_tag_id] - # if anchor is found we could add placeholder for link creation on server side. 
- internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' - # create span to have cyclic links, link has 1 type of class, anchor another - if anchor_tag.attrs['id'] not in self.internal_anchors: - new_anchor_span = self.create_new_anchor_span( - soup, new_id) - anchor_tag.insert_before(new_anchor_span) - self.internal_anchors.add(new_id) - del anchor_tag.attrs['id'] - del internal_link_tag.attrs['href'] - - else: - internal_link_tag.attrs['converter-mark'] = 'bad-link' - self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.' - f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' - f' Old id={a_tag_id}') - - def build_one_chapter(self, nav_point: NavPoint): + def detect_one_chapter(self, nav_point: NavPoint): """ Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) 3 cases: id wraps all chapter content, - id wraps chapter's content + subchapters' content + id wraps chapter"s content + subchapters" content id points to the start of title of a chapter In all cases we know where chapter starts. 
Therefore, chapter is all tags between chapter's id @@ -494,68 +549,82 @@ class EpubConverter: """ if nav_point.id: soup = self.html_href2html_body_soup[nav_point.href] - chapter_tags = get_tags_between_chapter_marks( + subchapter_tags = self.get_tags_between_chapter_marks( first_id=nav_point.id, href=nav_point.href, html_soup=soup) - new_tree = BeautifulSoup('', 'html.parser') - for tag in chapter_tags: - new_tree.append(tag) + new_tree = BeautifulSoup("", "html.parser") + for subchapter_tag in subchapter_tags: + new_tree.append(subchapter_tag) self.href_chapter_id2soup_html[( nav_point.href, nav_point.id)] = new_tree if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - self.build_one_chapter(sub_node) + self.detect_one_chapter(sub_node) - def define_chapters_content(self): + def define_chapters_with_content(self): """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: - for point in top_level_nav_points: - self.build_one_chapter(point) + for tl_nav_point in top_level_nav_points: + self.detect_one_chapter(tl_nav_point) - def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + """ + Function prepare style, tags to json structure + Parameters + ---------- + nav_point: NavPoint + + lvl: int + level of chapter + + Returns + ------- + ChapterItem + built chapter + + """ title = nav_point.title - if nav_point.id: - content: BeautifulSoup = self.href_chapter_id2soup_html[( - nav_point.href, nav_point.id)] - else: - content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] - self.book_image_src_path2aws_path = update_images_src_links(content, + content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \ + if nav_point.id else self.html_href2html_body_soup[nav_point.href] + + 
indent = " " * lvl + self.logger.log(indent + f"Chapter: {title} is processing.") + is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS + self.logger.log(indent + "Process title.") + title_preprocessed = self.html_processor.prepare_title(title) + self.logger.log(indent + "Process content.") + content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content, + remove_title_from_chapter=is_chapter) + + self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed, self.img_href2img_bytes, path_to_html=nav_point.href, access=self.access, path2aws_path=self.book_image_src_path2aws_path, - book_id=self.file_path.stem - if hasattr(self.file_path, 'stem') else 'book_id') - - is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS - title_preprocessed = prepare_title(title) - content_preprocessed = prepare_content(title_preprocessed, content, - remove_title_from_chapter=is_chapter) + book_id=Path(self.book_path).stem) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter + # if it doesn't have subchapters if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - sub_chapter_item = self.node_to_livecarta_chapter_item( + sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) - - if self.logger: - indent = ' ' * lvl - self.logger.log(f'{indent}Chapter: {title} is prepared.') - return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) + return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes) def convert_to_dict(self) -> dict: """Function which convert list of html nodes to appropriate json structure""" top_level_nav_points = self.adjacency_list[-1] top_level_chapters = [] - for nav_point in top_level_nav_points: - chapter = self.node_to_livecarta_chapter_item(nav_point) + # loop through to level chapters + for tl_nav_point in top_level_nav_points: + chapter = 
self.html_node_to_livecarta_chapter_item(tl_nav_point) top_level_chapters.append(chapter) top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] - self.logger.log(f'Anchors found: {len(self.internal_anchors)}.') - self.logger.log('End conversion.') + self.logger.log(f"Anchors found: {len(self.internal_anchors)}.") + self.logger.log("End conversion.") return { "content": top_level_dict_chapters, @@ -564,12 +633,16 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = '../../epub/9781614382264.epub' + epub_file_path = "../../books/epub/9780763774134.epub" logger_object = BookLogger( - name='epub', book_id=epub_file_path.split('/')[-1]) + name="epub", book_id=epub_file_path.split("/")[-1]) - json_converter = EpubConverter(epub_file_path, logger=logger_object) + css_processor = CSSPreprocessor() + html_processor = HtmlEpubPreprocessor(logger=logger_object) + + json_converter = EpubConverter(epub_file_path, logger=logger_object, + css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() - with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json: + with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: json.dump(content_dict, f_json, ensure_ascii=False) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index cb6e080..9131eda 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,4 +1,6 @@ from src.book_solver import BookSolver +from src.epub_converter.css_processor import CSSPreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.epub_converter.epub_converter import EpubConverter @@ -7,15 +9,17 @@ class EpubBook(BookSolver): def __init__(self, book_id=0, access=None, main_logger=None): super().__init__(book_id, access, main_logger) - self.book_type = 'epub' + self.book_type = "epub" def get_converted_book(self): """ 
Function Steps ---------- - 1. Converts .epub to .html - 2. Parses from line structure to nested structure + 1. Gets data from preset structure + 2. Add preset to html preprocessor + 3. Converts .epub to .html + 4. Parses from line structure to nested structure Returns ---------- @@ -23,7 +27,10 @@ class EpubBook(BookSolver): json for LiveCarta platform """ + css_processor = CSSPreprocessor() + html_processor = HtmlEpubPreprocessor(self.preset_path, logger=self.logger_object) json_converter = EpubConverter( - self.file_path, access=self.access, logger=self.logger_object) + self.book_path, access=self.access, logger=self.logger_object, + css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() return content_dict diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py new file mode 100644 index 0000000..34cd1fb --- /dev/null +++ b/src/epub_converter/footnotes_processing.py @@ -0,0 +1,91 @@ +import re +from typing import Tuple +from bs4 import BeautifulSoup, Tag + + +def _replace_with_livecarta_anchor_tag(anchor, i): + """Function replace noteref_tag(anchor) with new livecarta tag""" + new_tag = BeautifulSoup(features="lxml").new_tag("sup") + new_tag["class"] = "footnote-element" + new_tag["data-id"] = i + 1 + new_tag["id"] = f"footnote-{i + 1}" + new_tag.string = "*" + if anchor.parent.name == "sup": + anchor.parent.unwrap() + anchor.replace_with(new_tag) + return new_tag + + +def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \ + -> Tuple[list, list, list]: + """ + This function preprocessing footnotes + This function should be earlier that adding fonts in pipeline. + +

    Here is an example footnote1

    + + + """ + footnotes, new_noterefs_tags, new_footnotes_tags = [], [], [] + noterefs_tags = source_html_tag.find_all( + attrs={noteref_attr_name: "noteref"}) + bad_noterefs_tags = set( + [tag for tag in noterefs_tags if not tag.attrs.get("href")]) + noterefs_tags = [ + tag for tag in noterefs_tags if tag not in bad_noterefs_tags] + [tag.decompose() for tag in bad_noterefs_tags] + + def parse_a_tag_href(s: str) -> Tuple[str, str]: + """Returns name of file & id of an anchor""" + assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id." + f, id_ = s.split("#") + return f, id_ + + def verify_footnote_tag(tags: list): + """Function verifies is tag - footnote""" + assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}" + if len(tags) == 0: + anchored_tags = list(target_html_tag.find_all(id=element_id)) + if len(anchored_tags): + print( + f"Warning. Href for tag is detected as footnote:\n{noteref_tag}") + return anchored_tags + else: + assert 0, f"Error, No element with id: {href} found." + return tags + + for i, noteref_tag in enumerate(noterefs_tags): + href = noteref_tag.attrs["href"] + file, element_id = parse_a_tag_href(href) + if not file: + target_html_tag = source_html_tag + else: + target_html_tag = href2soup_html.get(file) + if not target_html_tag: + print( + f"Error while footnotes processing. 
For {noteref_tag} invalid path: {file}.") + continue + + possible_footnote = "note|footnote|endnote|rearenote" + expected_footnote_tags = list(target_html_tag.find_all(id=element_id, + attrs={"epub:type": re.compile(possible_footnote)})) + + expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) + footnote_tag = expected_footnote_tags[0] + if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "docs-endnote": + footnote_tag = footnote_tag.parent + new_noterefs_tags.append( + _replace_with_livecarta_anchor_tag(noteref_tag, i)) + content = footnote_tag.text + # footnote_tag.decompose() + footnotes.append(content) + footnote_tag = footnote_tag.find( + attrs={"role": "docs-backlink"}) or footnote_tag + new_footnotes_tags.append(footnote_tag) + + for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)): + noteref.attrs["data-id"] = i + 1 + noteref.attrs["id"] = f"footnote-{i + 1}" + footnote.attrs["href"] = f"#footnote-{i + 1}" + + return footnotes, new_noterefs_tags, new_footnotes_tags diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py deleted file mode 100644 index d94c43a..0000000 --- a/src/epub_converter/html_epub_preprocessor.py +++ /dev/null @@ -1,666 +0,0 @@ -import os -import re -import pathlib -from typing import Tuple - -from bs4 import BeautifulSoup, NavigableString, Tag, Comment - -from src.access import Access -from src.livecarta_config import LiveCartaConfig - - -def _replace_with_livecarta_anchor_tag(anchor, i): - """Function replace noteref_tag(anchor) with new livecarta tag""" - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' - if anchor.parent.name == 'sup': - anchor.parent.unwrap() - anchor.replace_with(new_tag) - return new_tag - - -def preprocess_footnotes(source_html_tag: Tag, 
href2soup_html: dict = None, noteref_attr_name='epub:type') \ - -> Tuple[list, list, list]: - """ - This function preprocessing footnotes - This function should be earlier that adding fonts in pipeline. - -

    Here is an example footnote1

    - - - """ - footnotes = [] - noterefs_tags = source_html_tag.find_all( - attrs={noteref_attr_name: 'noteref'}) - bad_noterefs_tags = set( - [tag for tag in noterefs_tags if not tag.attrs.get('href')]) - noterefs_tags = [ - tag for tag in noterefs_tags if tag not in bad_noterefs_tags] - new_noterefs_tags = [] - new_footnotes_tags = [] - [tag.decompose() for tag in bad_noterefs_tags] - - def parse_a_tag_href(s: str) -> Tuple[str, str]: - """Returns name of file & id of an anchor""" - assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' - f, id_ = s.split('#') - return f, id_ - - def verify_footnote_tag(tags: list): - """Function verifies is tag - footnote""" - assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' - if len(tags) == 0: - anchored_tags = list(target_html_tag.find_all(id=element_id)) - if len(anchored_tags): - print( - f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') - return anchored_tags - else: - assert 0, f'Error, No element with id: {href} found.' - - return tags - - for i, noteref_tag in enumerate(noterefs_tags): - href = noteref_tag.attrs['href'] - file, element_id = parse_a_tag_href(href) - if not file: - target_html_tag = source_html_tag - else: - target_html_tag = href2soup_html.get(file) - if not target_html_tag: - print( - f'Error while footnotes processing. 
For {noteref_tag} invalid path: {file}.') - continue - - possible_footnote = 'note|footnote|endnote|rearenote' - expected_footnote_tags = list(target_html_tag.find_all(id=element_id, - attrs={'epub:type': re.compile(possible_footnote)})) - - expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) - footnote_tag = expected_footnote_tags[0] - if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': - footnote_tag = footnote_tag.parent - new_noterefs_tags.append( - _replace_with_livecarta_anchor_tag(noteref_tag, i)) - content = footnote_tag.text - # footnote_tag.decompose() - footnotes.append(content) - footnote_tag = footnote_tag.find( - attrs={'role': 'doc-backlink'}) or footnote_tag - new_footnotes_tags.append(footnote_tag) - - return footnotes, new_noterefs_tags, new_footnotes_tags - - -def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup: - """ - Main function that works with structure of html. Make changes inplace. - Parameters - ---------- - body_tag: Tag, soup object - - Steps - ---------- - 1. Extracts tags that are not needed - 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. - Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. - This tag must have a body_tag as a parent. - Otherwise, it is wrapped with some tags. Like: -

    - 3. Headings that are not supported by livecarta converts to

    - 4. Wrapping NavigableString - - Returns - ------- - body_tag: Tag, BeautifulSoup - adjusted body_tag - - """ - def _preserve_class_in_aside_tag(tag_): - """to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" - # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance( - tag_.attrs['class'], list) else tag_.attrs['class'][0] - if tag_.parent.name == 'aside': - if not tag_.parent.attrs.get('class'): - tag_.parent.attrs['class'] = tag_class - - def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: - """ - Function saves css style inherited from class, copies class to child

    - returns True, if

    could be unwrapped - Parameters - ---------- - tag_: Tag, soup object - - Returns - ------- - bool - - """ - # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance( - tag_.attrs['class'], list) else tag_.attrs['class'][0] - if 'feature' not in tag_class: - return True - child_p_tags = tag_.find_all("p") - if len(child_p_tags) == 1: - child_p_tag = child_p_tags[0] - if not child_p_tag.attrs.get('class'): - child_p_tag.attrs['class'] = tag_class - return True - - elif len(child_p_tags) > 1: - tag_.name = 'p' - return False - else: - return True - - def _add_span_to_save_ids_for_links(tag_to_be_removed): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, - id_=tag_to_be_removed.attrs['id'], - class_=tag_to_be_removed.attrs.get('class')) - - def _replace_div_tag_with_table(): - """ - Function replace
    with : - 1. Convert div with certain classes to tables - 2. Add background color to div with background-color - - """ - for div in body_tag.find_all("div"): - if div.attrs.get('class'): - div_class = div.attrs['class'] if not isinstance( - div.attrs['class'], list) else div.attrs['class'][0] - if div_class in ['C409', 'C409a']: - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9') - - elif div_class in ['C441', 'C816']: - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8') - - if div.attrs.get('style'): - if 'background-color' in div.attrs['style']: - end_index = div.attrs['style'].find( - 'background-color') + len('background-color') - start_index_of_color = end_index + 2 - bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7] - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='', bg_color=bg_color) - elif div.attrs.get('style') == '': - del div.attrs['style'] - - structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', - 'figure', 'footer', 'iframe', 'span', 'p' - ] - - if div.contents: - is_not_struct_tag = [ - child.name not in structural_tags_names for child in div.contents] - if all(is_not_struct_tag): - div.name = 'p' - continue - _add_span_to_save_ids_for_links(div) - div.unwrap() - - def _heading_tag_to_p_tag(body_tag): - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - - # comments removal - for tag in body_tag.find_all(): - for element in tag(text=lambda text: isinstance(text, Comment)): - element.extract() - - _replace_div_tag_with_table() - - for s in body_tag.find_all("section"): - could_be_unwrapped = True - if s.attrs.get('class'): - could_be_unwrapped = 
_preserve_class_in_section_tag(s) - _add_span_to_save_ids_for_links(s) - if could_be_unwrapped: - s.unwrap() - - for s in body_tag.find_all("article"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("figure"): - s.name = 'p' - # to center image inside this tag - s.attrs['style'] = "text-align: center;" - - for s in body_tag.find_all("figcaption"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("aside"): - s.name = 'blockquote' - - for s in body_tag.find_all("main"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("body"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("html"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("header"): - s.name = 'span' - - # check marks for chapter starting are on the same 1 level - marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'}) - parents_marks_are_body = [x.parent == body_tag for x in marks] - - # fix marks to be on 1 level - if not all(parents_marks_are_body): - for x in marks: - while x.parent != body_tag: - x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - - parents_marks_are_body = [x.parent == body_tag for x in marks] - assert all( - parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' - - _heading_tag_to_p_tag(body_tag) - - # wrap NavigableString with

    - for node in body_tag: - if isinstance(node, NavigableString): - content = str(node) - content = re.sub(r'([\n\t\xa0])', ' ', content) - content = content.strip() - if content: - tag = body_tag.new_tag('p') - tag.append(str(node)) - node.replace_with(tag) - return body_tag - - -def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: - """After processing on a first_id that corresponds to current chapter, - from initial html_soup all tags from current chapter are extracted - Parameters - ---------- - first_id: - Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' - href: - Name of current chapter's file - html_soup: Tag - Soup object of current file - - Returns - ------- - tags: list [Tag, NavigableString] - Chapter's tags - - """ - marked_tags = html_soup.find( - attrs={'id': first_id, 'class': 'converter-chapter-mark'}) - if marked_tags: - next_tag = marked_tags.next_sibling - tags = [] - while next_tag: - if not isinstance(next_tag, NavigableString) and\ - (next_tag.attrs.get('class') == 'converter-chapter-mark'): - break - tags.append(next_tag) - next_tag = next_tag.next_sibling - - # remove tags between first_id and next found id - # save them in list for next steps - tags = [tag.extract() for tag in tags] - html_soup.smooth() - - else: - assert 0, f'Warning: no match for {first_id, href}' - - return tags - - -def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images to Amazon web service""" - link_path = access.send_image( - img_file_path, doc_id=book_id, img_content=img_content) - return link_path - - -def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images locally""" - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'../json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - 
new_img_path = new_path / os.path.basename(img_file_path) - f = open(new_img_path, 'wb+') - f.write(img_content) - f.close() - - return new_img_path - - -def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, - path_to_html: str, - access=None, - path2aws_path: dict = None, - book_id: str = None) -> dict: - """Function makes dictionary image_src_path -> Amazon web service_path""" - img_tags = body_tag.find_all('img') - - for img in img_tags: - path_to_img_from_html = img.attrs.get('src') - html_folder = os.path.dirname(path_to_html) - path_to_img_from_root = os.path.normpath(os.path.join( - html_folder, path_to_img_from_html)).replace('\\', '/') - - assert path_to_img_from_root in href2img_content, \ - f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' - - img_content = href2img_content[path_to_img_from_root] - if access is not None: - if path_to_img_from_root in path2aws_path: - new_folder = path2aws_path[path_to_img_from_root] - else: - new_folder = save_image_to_aws( - access, path_to_img_from_root, img_content, book_id) - path2aws_path[path_to_img_from_root] = new_folder - else: - new_folder = save_image_locally( - path_to_img_from_root, img_content, 'book_id') - - img.attrs['src'] = str(new_folder) - if img.attrs.get('width'): - del img.attrs['width'] - if img.attrs.get('height'): - del img.attrs['height'] - if img.attrs.get('style'): - del img.attrs['style'] - return path2aws_path - - -def _clean_title_from_numbering(title: str): - """Function removes numbering from titles""" - title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. 
?)+', '', title) # delete chapter I, (ABC) from the title - return title - - -def prepare_title(title_of_chapter: str) -> str: - """Function finalise processing/cleaning title""" - title_str = BeautifulSoup(title_of_chapter, features='lxml').string - title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) - title_str = re.sub(r' +', ' ', title_str).rstrip() - title_str = _clean_title_from_numbering(title_str) - return title_str - - -def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): - """Function inserts span before tag aren't supported by livecarta""" - new_tag = main_tag.new_tag("span") - new_tag.attrs['id'] = id_ or '' - new_tag.attrs['class'] = class_ or '' - new_tag.string = "\xa0" - tag.insert_before(new_tag) - - -def _clean_headings_content(content: BeautifulSoup, title: str): - def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=tag_to_be_removed.attrs.get( - 'id'), - class_=tag_to_be_removed.attrs.get('class')) - - for sub_tag in tag_to_be_removed.find_all(): - if sub_tag.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=sub_tag.attrs['id'], - class_=sub_tag.attrs.get('class')) - - title = title.lower() - for child in content.contents: - if isinstance(child, NavigableString): - text = child - else: - text = child.text - if text and re.sub(r'([\n\t\xa0])', '', text): - text = re.sub(r'([\n\t\xa0])', ' ', text) - text = re.sub(r' +', ' ', text).strip() - text = text.lower() - if title == text: - add_span_to_save_ids_for_links(child, content) - child.extract() - elif (title in text) and (child.name in ['h1', 'h2', 'h3']): - add_span_to_save_ids_for_links(child, content) - child.extract() - break - - -def _process_lists(body_tag: BeautifulSoup): - """ - Function - - process tags

  • . - - unwrap

    tags. - Parameters - ---------- - body_tag: Tag, soup object - - Returns - ------- - None - - """ - li_tags = body_tag.find_all("li") - for li_tag in li_tags: - if li_tag.p: - li_tag.attrs.update(li_tag.p.attrs) - li_tag.p.unwrap() - - -def _preprocess_table(body_tag: BeautifulSoup): - """Function to preprocess tables and tags(td|th|tr): style""" - tables = body_tag.find_all("table") - for table in tables: - t_tags = table.find_all(re.compile("td|th|tr")) - for t_tag in t_tags: - style = t_tag.get('style') - width = '' - if style: - width_match = re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) - if width_match: - size = width_match.group(1) - width = size + 'px' - - t_tag.attrs['width'] = t_tag.get('width') or width - - if t_tag.attrs.get('style'): - t_tag.attrs['style'] = t_tag.attrs['style'].replace( - 'border:0;', '') - - elif t_tag.attrs.get('style') == '': - del t_tag.attrs['style'] - - if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']: - table.attrs['border'] = '1' - - -def _preprocess_code_tags(chapter_tag: BeautifulSoup): - """ - Function - - transform , , tags into span - - add code style to this tags - Parameters - ---------- - chapter_tag: Tag, soup object - - Returns - ------- - None - - """ - for code in chapter_tag.find_all(re.compile("code|kbd|var")): - if not code.parent.name == "pre": - code.name = "span" - continue - # if tag isn't in pre and doesn't have style - if not code.attrs.get('style'): - code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;' - - -def _prepare_formatted(text: str) -> str: - """Function replaces special symbols with their Unicode representation""" - text = text.replace("<", "\x3C") - text = text.replace(">", "\x3E") - text = text.replace('\t', "\xa0 \xa0 ") #     - text = text.replace(' ', "\xa0") - text = text.replace('𝑓', "\xf0\x9d\x91\x93") - return text - - -def _preprocess_pre_tags(chapter_tag: BeautifulSoup): - """ - Function preprocessing

     tags
    -    Wrap string of the tag with  if it's necessary
    -    Parameters
    -    ----------
    -    chapter_tag: Tag, soup object
    -
    -    Returns
    -    ----------
    -    None
    -        Modified chapter tag
    -
    -    """
    -    for pre in chapter_tag.find_all("pre"):
    -        if pre.find_all("code|kbd|var"):
    -            continue
    -        else:
    -            code = chapter_tag.new_tag("code")
    -            # insert all items that was in pre to code and remove from pre
    -            for content in reversed(pre.contents):
    -                code.insert(0, content.extract())
    -            # wrap code with items
    -            pre.append(code)
    -
    -
    -def _clean_wiley_block(block):
    -    hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
    -    for hr in hrs:
    -        hr.extract()
    -    h = block.find(re.compile("h[1-9]"))
    -    if h:
    -        h.name = "p"
    -        h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
    -
    -
    -def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    -    """Function wraps  with 
  • """ - table = main_tag.new_tag("table") - table.attrs['border'] = border - table.attrs['align'] = 'center' - table.attrs['style'] = f'width:{width}%;' - tbody = main_tag.new_tag("tbody") - tr = main_tag.new_tag("tr") - td = main_tag.new_tag("td") - # td.attrs['border-radius'] = '8px' - if bg_color: - td.attrs['bgcolor'] = bg_color - old_tag.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features='lxml').new_tag("br")) - return table - - -def _preprocess_block_tags(chapter_tag: Tag): - """Function preprocessing tags""" - for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(block) - color = '#DDDDDD' if block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, block, bg_color=color) - block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) - block.unwrap() - - for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(future_block) - color = '#DDDDDD' if future_block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if future_block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) - - -def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: - """ - Function finalise processing/cleaning content - Parameters - ---------- - title_str: str - - content_tag: Tag, soup object - - remove_title_from_chapter: bool - - Steps - ---------- - 1. find \n - 2. heading removal - 3. processing tags - 4. class removal - - Returns - ------- - content_tag: str - prepared content - - """ - # 1. 
find \n - to_remove = [] - for child in content_tag.contents: - if isinstance(child, NavigableString): - s = re.sub(r'([\n\t])', '', child.string) - if s == '': - to_remove.append(child) - - # 2. heading removal - if remove_title_from_chapter: - _clean_headings_content(content_tag, title_str) - - # 3. processing tags (
  • ,
  • , ,
    , )
    -    _process_lists(content_tag)
    -    _preprocess_table(content_tag)
    -    _preprocess_code_tags(content_tag)
    -    _preprocess_pre_tags(content_tag)
    -    _preprocess_block_tags(content_tag)
    -
    -    # 4. class removal
    -    for tag in content_tag.find_all(recursive=True):
    -        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
    -                                                                                                'footnote-element']):
    -            del tag.attrs['class']
    -    return str(content_tag)
    diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py
    new file mode 100644
    index 0000000..da2a6c0
    --- /dev/null
    +++ b/src/epub_converter/html_epub_processor.py
    @@ -0,0 +1,426 @@
    +import re
    +import json
    +from bs4 import BeautifulSoup, NavigableString, Comment, Tag
    +
    +from src.util.helpers import BookLogger
    +
    +
    +class HtmlEpubPreprocessor:
    +    def __init__(self, preset_path="../../presets/presets.json", logger=None):
    +        self.preset = json.load(open(preset_path))
    +        self.logger: BookLogger = logger
    +        self.name2function = {
    +            "table_wrapper": self._wrap_tags_with_table,
    +            "replacer": self._tags_to_correspond_livecarta_tag,
    +            "attr_replacer": self._replace_attrs_in_tags,
    +            "unwrapper": self._unwrap_tags,
    +            "inserter": self._insert_tags_into_correspond_tags
    +        }
    +
    +    @staticmethod
    +    def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup):
    +        """
    +        Function adds span with id from tag_to_be_removed
    +        because this tag will be removed(unwrapped/extract)
    +        Parameters
    +        ----------
    +        tag_to_be_removed: Soup object
    +        chapter_tag: BeautifulSoup
    +
    +        Returns
    +        -------
    +        None
    +            updated body tag
    +
    +        """
    +
    +        def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str,
    +                                               class_: list):
    +            """Function inserts span before tag aren't supported by LiveCarta"""
    +            new_tag = chapter_tag.new_tag("span")
    +            new_tag.attrs["id"] = id_ or ""
    +            new_tag.attrs["class"] = class_ or ""
    +            new_tag.string = "\xa0"
    +            tag_to_be_removed.insert_before(new_tag)
    +
    +        if tag_to_be_removed.attrs.get("id"):
    +            _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed,
    +                                               id_=tag_to_be_removed.attrs["id"],
    +                                               class_=tag_to_be_removed.attrs.get("class"))
    +
    +    @staticmethod
    +    def prepare_title(title_of_chapter: str) -> str:
    +        """
    +        Function finalise processing/cleaning title
    +        Parameters
    +        ----------
    +        title_of_chapter: str
    +
    +        Returns
    +        -------
    +        title: str
    +            cleaned title
    +
    +        """
    +        title = BeautifulSoup(title_of_chapter, features="lxml").string
    +        # clean extra whitespace characters ([\r\n\t\f\v ])
    +        title = re.sub(r"[\s\xa0]", " ", title).strip()
    +        return title
    +
    +    @staticmethod
    +    def _remove_comments(chapter_tag: BeautifulSoup):
    +        """
    +        Function remove comments
    +        Parameters
    +        ----------
    +        chapter_tag: BeautifulSoup
    +            Tag & contents of the chapter tag
    +
    +        Returns
    +        -------
    +        None
    +            Chapter Tag without comments
    +
    +        """
    +        for tag in chapter_tag.find_all():
    +            for element in tag(text=lambda text: isinstance(text, Comment)):
    +                element.extract()
    +
    +    @staticmethod
    +    def _wrap_strings_with_p(chapter_tag: BeautifulSoup):
    +        """
    +        Function converts headings that aren't supported by LiveCarta with 

    + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with wrapped NavigableStrings + + """ + for node in chapter_tag: + if isinstance(node, NavigableString): + content = str(node) + content = re.sub(r"([\s\xa0])", " ", content).strip() + if content: + p_tag = chapter_tag.new_tag("p") + p_tag.append(str(node)) + node.replace_with(p_tag) + + def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list): + """ + Function wraps with

    + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with wrapped certain tags with
    + + """ + + def _wrap_tag_with_table(width="100", border="", bg_color=None): + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_wrap.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + + def process_tag_using_table(): + _wrap_tag_with_table( + width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", + border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, + bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) + self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) + tag_to_wrap.unwrap() + + for rule in rules: + tags = rule["tags"] + for attr in rule["attrs"]: + for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + process_tag_using_table() + + @staticmethod + def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list): + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags + + """ + for rule in rules: + tags = rule["tags"] + tag_to_replace = rule["tag_to_replace"] + if rule["condition"]: + for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): + if condition_on_tag[0] == 'parent_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if tag.parent.select(condition_on_tag[1]): + tag.name = tag_to_replace + elif condition_on_tag[0] == 'child_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if 
not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): + tag.name = tag_to_replace + elif condition_on_tag[0] == "attrs": + for attr in rule["condition"]["attrs"]: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + tag.name = tag_to_replace + else: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + # todo can cause appearance of \n

    <p>...</p>  ->  \n<p>...</p>\n

    (section) + tag.name = tag_to_replace + + @staticmethod + def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list): + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags + + """ + for rule in rules: + attr = rule["attr"] + tags = rule["condition"]["tags"] + attr_to_replace = rule["attr_to_replace"] + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr: re.compile(r".*")}): + tag[attr_to_replace] = tag[attr] + del tag[attr] + + def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict): + """ + Function unwrap tags and moves id to span + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with unwrapped certain tags + + """ + for tag_name in rules["tags"]: + for tag in chapter_tag.select(tag_name): + # if tag is a subtag + if ">" in tag_name: + tag.parent.attrs.update(tag.attrs) + self._add_span_to_save_ids_for_links(tag, chapter_tag) + tag.unwrap() + + @staticmethod + def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list): + """ + Function inserts tags into correspond tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with inserted tags + + """ + def insert(tag): + tag_to_insert = \ + chapter_tag.new_tag(rule["tag_to_insert"]) + # insert all items that was in tag to subtag and remove from tag + for content in reversed(tag.contents): + tag_to_insert.insert(0, content.extract()) + # wrap subtag with items + tag.append(tag_to_insert) + + for rule in rules: + tags = rule["tags"] + if rule["condition"]: + for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): + if condition_on_tag[0] == 'parent_tags': + for tag in 
chapter_tag.find_all([re.compile(tag) for tag in tags]): + if tag.parent.select(condition_on_tag[1]): + insert(tag) + elif condition_on_tag[0] == 'child_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): + insert(tag) + elif condition_on_tag[0] == "attrs": + for attr in rule["condition"]["attrs"]: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + insert(tag) + else: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + insert(tag) + + def _remove_headings_content(self, chapter_tag, title_of_chapter: str): + """ + Function + - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content + - adds span with id in order to + Parameters + ---------- + chapter_tag: soup object + Tag of the page + title_of_chapter: str + Chapter title + + Returns + ------- + None + clean/remove headings & add span with id + + """ + title_of_chapter = title_of_chapter.lower() + if title_of_chapter == "chapter 1": + pass + for tag in chapter_tag.contents: + text = tag if isinstance(tag, NavigableString) else tag.text + if re.sub(r"[\s\xa0]", "", text): + text = re.sub(r"[\s\xa0]", " ", text).lower() + text = text.strip() # delete extra spaces + if not isinstance(tag, NavigableString): + if title_of_chapter == text or \ + (title_of_chapter in text and + re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): + self._add_span_to_save_ids_for_links(tag, chapter_tag) + tag.extract() + return + elif not self._remove_headings_content(tag, title_of_chapter): + break + else: + tag.extract() + return + + @staticmethod + def _process_tables(chapter_tag: BeautifulSoup): + """ + Function preprocesses tables and tags(td|th|tr) + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with processed tables + + 
""" + tables = chapter_tag.find_all("table") + for table in tables: + for t_tag in table.find_all(re.compile("td|th|tr")): + width = "" + if t_tag.get("style"): + width_match = re.search( + r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) + if width_match: + size = width_match.group(1) + width = size + "px" + + t_tag.attrs["width"] = t_tag.get("width") or width + + if t_tag.attrs.get("style"): + t_tag.attrs["style"] = t_tag.attrs["style"].replace( + "border:0;", "") + if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": + del t_tag.attrs["style"] + + if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: + table.attrs["border"] = "1" + + @staticmethod + def _class_removing(chapter_tag: BeautifulSoup): + """ + Function removes classes that aren't created by converter + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without original classes of the book + + """ + for tag in chapter_tag.find_all(recursive=True): + if tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] + + def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: + """ + Function finalise processing/cleaning content + Parameters + ---------- + title_str: str + + content_tag: Tag, soup object + + remove_title_from_chapter: bool + + Steps + ---------- + 1. comments removal + 2. wrap NavigableString with tag

    + 3-6. wrap tags with <table>

    + replace tags with correspond LiveCarta tags + unwrap tags + insert tags into correspond tags + 7. heading removal + 8. process_tables + 9. class removal + + Returns + ------- + content_tag: Tag + prepared content + + """ + # 1. remove comments + self._remove_comments(content_tag) + # 2. + self._wrap_strings_with_p(content_tag) + # 3-6. + for dict in self.preset: + func = self.name2function[dict["preset_name"]] + func(content_tag, dict['rules']) + # 7. + if remove_title_from_chapter: + self._remove_headings_content(content_tag, title_str) + # 8. + self._process_tables(content_tag) + # 9. remove classes that weren't created by converter + self._class_removing(content_tag) + return content_tag diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py new file mode 100644 index 0000000..6f35c3a --- /dev/null +++ b/src/epub_converter/image_processing.py @@ -0,0 +1,65 @@ +import os +import pathlib +from bs4 import BeautifulSoup + +from src.access import Access + + +def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images to Amazon web service""" + link_path = access.send_image( + img_file_path, doc_id=book_id, img_content=img_content) + return link_path + + +def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images locally""" + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f"../books/json/img_{book_id}/")) + new_path.mkdir(exist_ok=True) + + new_img_path = new_path / os.path.basename(img_file_path) + f = open(new_img_path, "wb+") + f.write(img_content) + f.close() + return new_img_path + + +def update_images_src_links(body_tag: BeautifulSoup, + img_href2img_content: dict, + path_to_html: str, + access=None, + path2aws_path: dict = None, + book_id: str = None) -> dict: + """Function makes dictionary image_src_path -> Amazon web 
service_path""" + img_tags = body_tag.find_all("img") + for img in img_tags: + path_to_img_from_html = img.attrs.get("src") + html_folder = os.path.dirname(path_to_html) + path_to_img_from_root = os.path.normpath(os.path.join( + html_folder, path_to_img_from_html)).replace("\\", "/") + + assert path_to_img_from_root in img_href2img_content, \ + f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest." + + img_content = img_href2img_content[path_to_img_from_root] + if access is not None: + if path_to_img_from_root in path2aws_path: + new_folder = path2aws_path[path_to_img_from_root] + else: + new_folder = save_image_to_aws( + access, path_to_img_from_root, img_content, book_id) + path2aws_path[path_to_img_from_root] = new_folder + else: + new_folder = save_image_locally( + path_to_img_from_root, img_content, book_id) + + img.attrs["src"] = str(new_folder) + if img.attrs.get("width"): + del img.attrs["width"] + if img.attrs.get("height"): + del img.attrs["height"] + if img.attrs.get("style"): + del img.attrs["style"] + return path2aws_path diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_inline_style_processor.py similarity index 55% rename from src/epub_converter/tag_css_style_converter.py rename to src/epub_converter/tag_inline_style_processor.py index 37b2672..30d7e50 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_inline_style_processor.py @@ -4,61 +4,62 @@ from typing import List from logging import CRITICAL from bs4 import BeautifulSoup -from premailer import transform from src.livecarta_config import LiveCartaConfig -from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS cssutils.log.setLevel(CRITICAL) -class TagStyleConverter: +class TagInlineStyleProcessor: def __init__(self, tag_inline_style): # tag with inline style + style parsed from css file self.tag_inline_style = tag_inline_style - self.style = self.process_inline_style() + 
self.tag_inline_style.attrs['style'] = self.process_inline_style() @staticmethod def remove_white_if_no_bgcolor(style_, tag): """Function remove text white color if there is no bg color""" - if 'background' in style_: + if "background" in style_: style_ = style_.replace( - 'background:', 'background-color:') + "background:", "background-color:") return style_ # if text color is white, check that we have bg-color - if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): + if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_): # if bg color is inherited, just return style as is for parent_tag in tag.parents: - # white bg color not need to be checked as we do not write 'white bg color' - tag_with_bg = ['span', 'td', 'tr', 'p'] + # white bg color not need to be checked as we do not write "white bg color" + tag_with_bg = ["span", "td", "tr", "p"] tag_will_be_saved = parent_tag.name in tag_with_bg - has_bg = parent_tag.attrs.get('style') and ( - 'background' in parent_tag.attrs.get('style')) + has_bg = parent_tag.attrs.get("style") and ( + "background" in parent_tag.attrs.get("style")) if has_bg and tag_will_be_saved: return style_ children = tag.find_all() for child in children: - if child.attrs.get('style') and ('background' in child.attrs.get('style')): - tmp_style = child.attrs['style'] + '; color:#fff; ' - child.attrs['style'] = tmp_style + if child.attrs.get("style") and ("background" in child.attrs.get("style")): + tmp_style = child.attrs["style"] + "; color:#fff; " + child.attrs["style"] = tmp_style - # for child with bg color we added white text color, so this tag don't need white color - style_ = style_.replace('color:#fff;', '') - style_ = style_.replace('color:#ffffff;', '') - style_ = style_.replace('color:white;', '') + # for child with bg color we added white text color, so this tag don"t need white color + style_ = style_.replace("color:#fff;", "") + style_ = 
style_.replace("color:#ffffff;", "") + style_ = style_.replace("color:white;", "") return style_ - @staticmethod - def duplicate_styles_check(split_style: list) -> list: - style_name2style_value = {} - for list_item in split_style: - key, val = list_item.split(":") - if val not in style_name2style_value.keys(): - style_name2style_value[key] = val - split_style = [k + ":" + v for k, v in style_name2style_value.items()] - return split_style + # @staticmethod + # def duplicate_styles_check(split_style: list) -> list: + # style_name2style_value = {} + # # {key: val for for list_item in split_style} + # splitstrs = (list_item.split(":") for list_item in split_style) + # d = {key: val for key, val in splitstrs} + # for list_item in split_style: + # key, val = list_item.split(":") + # if key not in style_name2style_value.keys(): + # style_name2style_value[key] = val + # split_style = [k + ":" + v for k, v in style_name2style_value.items()] + # return split_style @staticmethod def indents_processing(split_style: list) -> str: @@ -68,7 +69,7 @@ class TagStyleConverter: Parameters ---------- split_style: list - list of styles split by ';' + list of styles split by ";" Returns ---------- @@ -76,12 +77,12 @@ class TagStyleConverter: processed style with counted indent """ - processed_style = ";".join(split_style) + processed_style = ";".join(split_style)+';' margin_left_regexp = re.compile( - r'((margin-left|margin): *(-*\w+);*)') + r"((margin-left|margin): *(-*\w+);*)") text_indent_regexp = re.compile( - r'(text-indent: *(-*\w+);*)') + r"(text-indent: *(-*\w+);*)") has_margin = re.search(margin_left_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style) @@ -92,21 +93,21 @@ class TagStyleConverter: if has_text_indent: num_ti = abs(int("0" + "".join( filter(str.isdigit, str(has_text_indent.group(2)))))) - processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(num_m - num_ti)) + 'px; ') + processed_style 
= processed_style.replace(has_text_indent.group(1), "text-indent: " + + str(abs(num_m - num_ti)) + "px; ") processed_style = processed_style.replace( - has_margin.group(1), '') + has_margin.group(1), "") return processed_style - processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' + - str(abs(num_m)) + 'px; ') + processed_style = processed_style.replace(has_margin.group(1), "text-indent: " + + str(abs(num_m)) + "px; ") return processed_style elif has_text_indent: - processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(int("0" + "".join( filter(str.isdigit, str(has_text_indent.group(2))))))) - + 'px; ') + + "px; ") return processed_style return processed_style @@ -126,23 +127,20 @@ class TagStyleConverter: processed inline style """ - inline_style = self.tag_inline_style.attrs.get('style') + ';' - # 1. Remove white color if tag doesn't have background color in style + inline_style = self.tag_inline_style.attrs.get("style") + ";" + # 1. Remove white color if tag doesn"t have background color in style inline_style = self.remove_white_if_no_bgcolor( inline_style, self.tag_inline_style) inline_style = inline_style.replace( - 'list-style-image', 'list-style-type') - + "list-style-image", "list-style-type") # 2. Create list of styles from inline style - # replace all spaces between '; & letter' to ';' + # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", inline_style) - # when we split style by ';', last element of the list is '' - None (remove it) - split_inline_style: list = list(filter(None, style.split(';'))) - + # when we split style by ";", last element of the list is "" - None (remove it) + split_inline_style: list = list(filter(None, style.split(";"))) # 3. 
Duplicate styles check - if the tag had duplicate styles - split_inline_style = self.duplicate_styles_check(split_inline_style) - - # 4. Processing indents# + # split_inline_style = self.duplicate_styles_check(split_inline_style) + # 4. Processing indents inline_style: str = self.indents_processing(split_inline_style) return inline_style @@ -164,19 +162,19 @@ class TagStyleConverter: """ styles_to_remove = [] for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: - if f'{k[0]}:{k[1]}' in style: + if f"{k[0]}:{k[1]}" in style: styles_to_remove.append(k) return styles_to_remove def change_attrs_with_corresponding_tags(self): # adds , , instead of styles - styles_to_remove = self.check_style_to_be_tag(self.style) + styles_to_remove = self.check_style_to_be_tag(self.tag_inline_style.attrs['style']) for i, (attr, value) in enumerate(styles_to_remove): - self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\ - .replace(f'{attr}:{value};', '').strip() + self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\ + .replace(f"{attr}:{value};", "").strip() corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( attr, value)] - correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name) + correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name) for content in reversed(self.tag_inline_style.contents): correspond_tag.insert(0, content.extract()) self.tag_inline_style.append(correspond_tag) @@ -184,75 +182,37 @@ class TagStyleConverter: @staticmethod def wrap_span_in_tag_to_save_style_attrs(initial_tag): """Function designed to save style attrs that cannot be in tag.name -> span""" - dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) - if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'): + dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG)) + if re.findall(dictkeys_pattern, 
initial_tag.name) and initial_tag.attrs.get("style"): styles_can_be_in_tag = [style - for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() + for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items() if re.match(tag, initial_tag.name) for style in styles] - styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS + styles_cant_be_in_tag = [attr for attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS if attr not in styles_can_be_in_tag] - span_style = initial_tag.attrs['style'] + span_style = initial_tag.attrs["style"] # here check that this style is exactly the same. - # Not 'align' when we have 'text-align', or 'border' when we have 'border-top' - styles_to_be_saved_in_span = [((attr + ':') in span_style) & ( - '-' + attr not in span_style) for attr in styles_cant_be_in_tag] + # Not "align" when we have "text-align", or "border" when we have "border-top" + styles_to_be_saved_in_span = [((attr + ":") in span_style) & ( + "-" + attr not in span_style) for attr in styles_cant_be_in_tag] if any(styles_to_be_saved_in_span): # if we find styles that cannot be in -> wrap them in span - tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}') - style = '' - possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag] + tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") + style = "" + possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag] for possible_attr_regexp in possible_attrs_regexp: has_style_attrs = re.search( possible_attr_regexp, span_style) if has_style_attrs and has_style_attrs.group(1): style += has_style_attrs.group(1) span_style = span_style.replace( - has_style_attrs.group(1), '') - tag.attrs['style'] = style - initial_tag.name = 'span' - initial_tag.attrs['style'] = span_style + has_style_attrs.group(1), "") + tag.attrs["style"] = style + initial_tag.name = "span" + initial_tag.attrs["style"] = span_style 
initial_tag.wrap(tag) def convert_initial_tag(self): self.change_attrs_with_corresponding_tags() self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) return self.tag_inline_style - - -def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: - """ - Function adds styles from .css to inline style. - Parameters - ---------- - html_soup: BeautifulSoup - html page with inline style - css_text: str - css content from css file - Returns - ------- - inline_soup: BeautifulSoup - soup with styles from css - - """ - # remove this specification because it causes problems - css_text = css_text.replace( - '@namespace epub "http://www.idpf.org/2007/ops";', '') - # here we add css styles to inline style - html_with_css_styles: str = transform(str(html_soup), css_text=css_text, - remove_classes=False, - external_styles=False, - allow_network=False, - disable_validation=True, - ) - # soup with converted styles from css - inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') - - tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={'style': re.compile('.*')}) - - # go through the tags with inline style + style parsed from css file - for tag_inline_style in tags_with_inline_style: - style_converter = TagStyleConverter(tag_inline_style) - style_converter.convert_initial_tag() - return inline_soup diff --git a/src/livecarta_config.py b/src/livecarta_config.py index e3e63d4..9ae2d40 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -9,12 +9,12 @@ class LiveCartaConfig: HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} - DEFAULT_ALIGN_STYLE = 'left' + DEFAULT_ALIGN_STYLE = "left" - ALIGN_STYLES = ['justify', 'right', 'center', 'left'] + ALIGN_STYLES = ["justify", "right", "center", "left"] # Main constant values - DEFAULT_FONT_NAME = 'Times New Roman' + DEFAULT_FONT_NAME = "Times New Roman" WORD_DEFAULT_FONT_SIZE = 11 @@ -23,80 
+23,56 @@ class LiveCartaConfig: FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\ WORD_DEFAULT_FONT_SIZE - FONT_CORRESPONDANCE_TABLE = { - "Arial": "arial,helvetica,sans-serif", - "Comic Sans MS": "comic sans ms,cursive", - "Courier New": "courier new,courier,monospace", - "Georgia": "georgia,serif", - "Lucida Sans Unicode": "lucida sans unicode,lucida grande,sans-serif", - "Tahoma": "tahoma,geneva,sans-serif", - "Times New Roman": "times new roman,times,serif", - "Trebuchet MS": "trebuchet ms,helvetica,sans-serif", - "Verdana": "verdana,geneva,sans-serif", - "monospace": "courier new,courier,monospace", - "sans-serif": "arial,helvetica,sans-serif" - } - COLORS_MAP = { - '#ffff00': 'yellow', - '#00ff00': 'darkYellow', - '#00ffff': 'cyan', - '#ff00ff': 'magenta', - '#0000ff': 'blue', - '#ff0000': 'red', - '#000080': 'darkBlue', - '#008080': 'darkCyan', - '#008000': 'green', - '#800080': 'darkMagenta', - '#808000': 'darkGreen', - '#c0c0c0': 'lightGray', - '#ffffff': 'white', - '#800000': '#800000', - '#808080': '#808080' + "#ffff00": "yellow", + "#00ff00": "darkYellow", + "#00ffff": "cyan", + "#ff00ff": "magenta", + "#0000ff": "blue", + "#ff0000": "red", + "#000080": "darkBlue", + "#008080": "darkCyan", + "#008000": "green", + "#800080": "darkMagenta", + "#808000": "darkGreen", + "#c0c0c0": "lightGray", + "#ffffff": "white", + "#800000": "#800000", + "#808080": "#808080" } HTML42LIVECARTA_COLORS = { - 'yellow': 'yellow', - 'lime': 'green', - 'aqua': 'cyan', - 'fuchsia': 'magenta', - 'blue': 'blue', - 'red': 'red', - 'navy': 'darkBlue', - 'teal': 'darkCyan', - 'green': 'darkGreen', - 'purple': 'darkMagenta', - 'olive': 'darkYellow', - 'silver': 'lightGray', - 'white': 'white', - 'maroon': 'darkRed', # '#800000', - 'gray': 'darkGray', - 'grey': 'darkGray', + "yellow": "yellow", + "lime": "green", + "aqua": "cyan", + "fuchsia": "magenta", + "blue": "blue", + "red": "red", + "navy": "darkBlue", + "teal": "darkCyan", + "green": "darkGreen", + "purple": "darkMagenta", + 
"olive": "darkYellow", + "silver": "lightGray", + "white": "white", + "maroon": "darkRed", # "#800000", + "gray": "darkGray", + "grey": "darkGray", } - INDENT = '30px' + INDENT = "30px" - sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, - 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, - 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, - 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] - - sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', - '19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', - '30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px', - '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] - - list_types = ['circle', 'disc', 'armenian', 'decimal', - 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', - 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] + list_types = ["circle", "disc", "armenian", "decimal", + "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin", + "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"] structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', - 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' + "div", "section", "article", "main", "body", "html", "aside", + "canvas", "data", "figure", "footer", "iframe", "span", "p" ] could_have_style_in_livecarta_regexp = re.compile( - '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') + "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)") """ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } @@ -104,23 +80,60 @@ class LiveCartaConfig:

    + "font-style": ["italic"], # + "text-decoration": ["underline", "line-through"], # , + "text-decoration-line": ["underline", "line-through"], # , + "vertical-align": ["super"], # + "color": [], + "background-color": [], + "background": [], + "width": [], + "border": [], + "border-top-width": [], + "border-right-width": [], + "border-left-width": [], + "border-bottom-width": [], + "border-top": [], + "border-bottom": [], + "list-style-type": [], + "list-style-image": [], + "margin-left": [], + "margin-top": [], + "margin": [], } diff --git a/src/util/color_reader.py b/src/util/color_reader.py index fe44758..82fb451 100644 --- a/src/util/color_reader.py +++ b/src/util/color_reader.py @@ -96,13 +96,13 @@ def str2hex(s: str): if '#' in s and (len(s) <= 7): return s.lower() - if ('rgb' in s) and ('%' in s): + if ('rgb' in s.lower()) and ('%' in s): match = re.search(r'rgba*\(((\d+)%, *(\d+)%, *(\d+)%(, \d\.\d+)*)\)', s) if match: r, g, b = int(match.group(2)), int(match.group(3)), int(match.group(4)) return rgb_percent_to_hex((r, g, b)) - if 'rgb' in s: + if 'rgb' in s.lower(): rgba = re.findall('([0-9] *\.?[0-9]+)', s) r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2]) if len(rgba) == 4: @@ -110,7 +110,7 @@ def str2hex(s: str): r, g, b = rgba2rgb(r, g, b, alpha) return rgb_to_hex((r, g, b)) - if 'hsl' in s: + if 'hsl' in s.lower(): # hsl(hue in {0,360}, saturation [0, 100%], lightness [0, 100%]) match = re.search(r'hsla*\(((\d+), *(\d+)%, *(\d+)%, (\d\.\d+)*)\)', s) if match: