diff --git a/src/access.py b/src/access.py index 896c115..2bb508b 100644 --- a/src/access.py +++ b/src/access.py @@ -95,19 +95,19 @@ class Access: else: raise Exception(f'{response.status_code}') - def get_doc(self, doc_id): + def get_book(self, book_id): if self.is_time_for_refreshing(): self.refresh_token() self.refreshing.wait() - response = requests.get(f'{self.url}/doc-convert/{doc_id}/file', headers=self.headers) + response = requests.get(f'{self.url}/doc-convert/{book_id}/file', headers=self.headers) if response.status_code == 404: raise FileNotFoundError('404 Not Found: file have not found.') elif response.status_code == 200: content = response.content else: - raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{doc_id}/file, ' + raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{book_id}/file, ' f'status code:{response.status_code}') return content diff --git a/src/book_solver.py b/src/book_solver.py index 177fdf8..da67342 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -5,11 +5,10 @@ In parallel it updates status of a book conversion on admin panel. Finally sends result to server. Result is a json, JSON schema in book_schema.json """ - -import codecs -import json -import logging import os +import json +import codecs +import logging import pathlib from abc import abstractmethod, ABCMeta @@ -61,11 +60,11 @@ class BookSolver: """ try: self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') - content = self.access.get_doc(self.book_id) + content = self.access.get_book(self.book_id) self.logger_object.log('File was received from server.') self.save_book_file(content) except FileNotFoundError as f_err: - self.logger_object.log("Can't get docx from server.", logging.ERROR) + self.logger_object.log("Can't get file from server.", logging.ERROR) self.logger_object.log_error_to_main_log() raise f_err except Exception as exc: @@ -109,8 +108,9 @@ class BookSolver: return {} def test_conversion(self): + '''Function + without sending to server''' self.logger_object.log('Beginning of the test.') - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.join(folder_path, f'{self.book_type}') file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}') @@ -121,6 +121,9 @@ class BookSolver: self.logger_object.log('End of the test.') def conversion(self): + '''Function + with downloading book from server + with sending to server''' try: self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.') self.get_book_file() @@ -137,14 +140,14 @@ class BookSolver: raise exc def conversion_local(self): + '''Function + without downloading book from server (local) + with sending to server''' try: - with open('tmp.json') as f: - d = json.load(f) - self.send_json_content_to_server(d) - self.logger_object.log(f'End of the conversion to LiveCarta format. Check {self.output_path}.') - + self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.output_path}') + with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json: + content_dict = json.load(f_json) + self.send_json_content_to_server(content_dict) except Exception as exc: - self.status_wrapper.set_error() - self.logger_object.log('Error has occurred while conversion.', logging.ERROR) - self.logger_object.log_error_to_main_log(str(exc)) - raise exc \ No newline at end of file + self.logger_object.log('Error has occurred while reading json file.' + str(exc), logging.ERROR) + diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index 93e199f..450dd40 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -8,8 +8,9 @@ from bs4 import BeautifulSoup from premailer import transform from itertools import takewhile -from src.livecarta_config import LiveCartaConfig from src.util.color_reader import str2hex +from src.livecarta_config import LiveCartaConfig + cssutils.log.setLevel(CRITICAL) @@ -211,9 +212,9 @@ def build_css_content(css_content): class TagStyleConverter: - def __init__(self, tag_with_initial_style, tag_with_ultimate_style): - self.tag_with_initial_style = tag_with_initial_style # tag with inline style to be updated with style attribute - self.tag_initial_name = tag_with_initial_style.name + def __init__(self, tag_with_inline_style, tag_with_ultimate_style): + self.tag_with_inline_style = tag_with_inline_style # tag with inline style to be updated with style attribute + self.tag_initial_name = tag_with_inline_style.name self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file self.style = self.preprocess_style() @@ -293,32 +294,39 @@ class TagStyleConverter: ultimate_style = ultimate_style.replace('background:', 'background-color:') ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type') - split_ultimate_style = ultimate_style.split(';') # make for repetition check and convert to px + split_ultimate_style = ultimate_style.replace('; ',';').split(';') - # check for another ; in style string in preprocess_style() + # when we split style by ; and we have at the end ; that's why we have '' in list while '' in split_ultimate_style: split_ultimate_style.remove('') - ultimate_style: str = self.process_indents_to_px(split_ultimate_style) - if self.tag_with_initial_style.attrs.get('style'): + # replace all spaces between ': & letter' to ':' + split_ultimate_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style] - initial_style = self.tag_with_initial_style.attrs['style'] - split_initial_style = initial_style.split(';') + if self.tag_with_inline_style.attrs.get('style'): + inline_style = self.tag_with_inline_style.attrs['style'] - # check for another ; in style string in preprocess_style() - while '' in split_initial_style: - split_initial_style.remove('') + split_inline_style = inline_style.replace('; ',';').split(';') - # repetition check - if tag had already had inline style, add this to style parsed from css - repeat_styles = list(set(split_ultimate_style) & set(split_initial_style)) + # when we split style by ; and we have at the end ; that's why we have '' in list + while '' in split_inline_style: + split_inline_style.remove('') + + # replace all spaces between ': & letter' to ':' + split_inline_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style] + + # repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css + repeat_styles = list(set(split_ultimate_style) & set(split_inline_style)) for item in repeat_styles: - split_initial_style.remove(item) + split_inline_style.remove(item) - if split_initial_style: - # if initial style is not empty - start convert and add to ultimate style + if split_inline_style: + # if inline style is not empty - start convert and add to ultimate style print('we enter repetition check', '\n') - initial_style: str = self.process_indents_to_px(split_initial_style) - ultimate_style += initial_style + inline_style: str = self.process_indents_to_px(split_inline_style) + ultimate_style += inline_style + + ultimate_style: str = self.process_indents_to_px(split_ultimate_style) return ultimate_style def change_attrs_with_corresponding_tags(self): @@ -330,15 +338,15 @@ class TagStyleConverter: self.style = self.style.replace(s, '') self.style = self.style.strip() if i == 0: - self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] - new_tags.append(self.tag_with_initial_style) + self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] + new_tags.append(self.tag_with_inline_style) else: name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] new_tag = BeautifulSoup(features='lxml').new_tag(name) new_tags[-1].wrap(new_tag) new_tags.append(new_tag) - top_tag = self.tag_with_initial_style + top_tag = self.tag_with_inline_style if new_tags: tmp_attrs = top_tag.attrs.copy() @@ -355,10 +363,12 @@ class TagStyleConverter: @staticmethod def wrap_span_in_p_to_save_style_attrs(tag): - styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS - if attr not in ['text-align', 'text-indent', 'border-bottom']] - + '''Function designed to save style attrs that cannot be in p -> span + that cannot be in span -> p''' if tag.name == 'p' and tag.attrs.get('style'): + styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS + if attr not in ['text-align', 'text-indent', 'border-bottom']] + styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p] if any(styles_to_be_saved): tag.name = 'span' @@ -388,83 +398,81 @@ class TagStyleConverter: tag.wrap(p_tag) @staticmethod - def add_span_to_save_style_attrs_in_li(t): - if t.name == 'li' and t.attrs.get('style'): + def wrap_span_in_li_to_save_style_attrs(tag): + if tag.name == 'li' and tag.attrs.get('style'): styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if - attr not in ['text-align', 'list-style-type', 'border-bottom']] + attr not in ['text-align', 'list-style-type']] - check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li] - if any(check): - t.name = 'span' + styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_li] + if any(styles_to_be_saved): + tag.name = 'span' li_tag = BeautifulSoup(features='lxml').new_tag('li') - old_style = t.attrs['style'] - new_style = '' + span_style = tag.attrs['style'] + li_style = '' for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'), re.compile(r'(list-style-type:(\w+);)')]: - has_li_style_attrs = re.search(possible_li_attrs_regexp, old_style) + has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style) if has_li_style_attrs and has_li_style_attrs.group(1): - new_style += has_li_style_attrs.group(1) - old_style = old_style.replace(has_li_style_attrs.group(1), '') + li_style += has_li_style_attrs.group(1) + span_style = span_style.replace(has_li_style_attrs.group(1), '') - li_tag.attrs['style'] = new_style - t.attrs['style'] = old_style - t.wrap(li_tag) + li_tag.attrs['style'] = li_style + tag.attrs['style'] = span_style + tag.wrap(li_tag) @staticmethod - def add_span_to_save_style_attrs_in_ul_ol(t): - if t.name in ['ul', 'ol'] and t.attrs.get('style'): + def wrap_span_in_ul_ol_to_save_style_attrs(tag): + if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] - check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_ul_ol] + check = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_ul_ol] if any(check): - t.name = 'span' + tag.name = 'span' li_tag = BeautifulSoup(features='lxml').new_tag('ul') - old_style = t.attrs['style'] + span_style = tag.attrs['style'] possible_li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_li_style_attrs = re.search(possible_li_attrs_regexp, old_style) + has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style) if has_li_style_attrs and has_li_style_attrs.group(1): - new_style = has_li_style_attrs.group(1) - old_style = old_style.replace(new_style, '') - li_tag.attrs['style'] = new_style - t.attrs['style'] = old_style - t.wrap(li_tag) + oul_style = has_li_style_attrs.group(1) + span_style = span_style.replace(oul_style, '') + li_tag.attrs['style'] = oul_style + tag.attrs['style'] = span_style + tag.wrap(li_tag) @staticmethod - def add_span_to_save_style_attrs(t): - no_style_in_livecarta_regexp = re.compile('(^h[1-9]$)') + def wrap_span_in_h_to_save_style_attrs(tag): + h_regexp = re.compile('(^h[1-9]$)') - if re.search(no_style_in_livecarta_regexp, t.name) and t.attrs.get('style'): - new_tag = BeautifulSoup(features='lxml').new_tag(t.name) - t.name = 'span' - t.wrap(new_tag) - style = t.attrs['style'] + if re.search(h_regexp, tag.name) and tag.attrs.get('style'): + h_tag = BeautifulSoup(features='lxml').new_tag(tag.name) + tag.name = 'span' + tag.wrap(h_tag) + style = tag.attrs['style'] li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') has_li_style_attr = re.search(li_attrs_regexp, style) - t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') + tag.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') def convert_initial_tag(self): - self.tag_with_initial_style = self.change_attrs_with_corresponding_tags() - self.wrap_span_in_p_to_save_style_attrs(self.tag_with_initial_style) - self.add_span_to_save_style_attrs_in_li(self.tag_with_initial_style) - self.add_span_to_save_style_attrs_in_ul_ol(self.tag_with_initial_style) - self.add_span_to_save_style_attrs(self.tag_with_initial_style) - return self.tag_with_initial_style + self.tag_with_inline_style = self.change_attrs_with_corresponding_tags() + self.wrap_span_in_p_to_save_style_attrs(self.tag_with_inline_style) + self.wrap_span_in_li_to_save_style_attrs(self.tag_with_inline_style) + self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_with_inline_style) + self.wrap_span_in_h_to_save_style_attrs(self.tag_with_inline_style) + return self.tag_with_inline_style def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') livecarta_tmp_ids = [] - h_regex = f'(^h[1-9]$)' - could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) + could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp) for i, x in enumerate(tags_with_possible_style_attr): x.attrs['livecarta_id'] = i livecarta_tmp_ids.append(i) # here we add css styles to inline style - # sometimes in html_with_css_styles html_with_css_styles: str = transform(str(html_soup), css_text=css_text, remove_classes=False, external_styles=False, @@ -474,6 +482,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') + # go through tags with possible style attrs for i in livecarta_tmp_ids: tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i}) tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i}) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 646ee5b..6edbbc2 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -1,27 +1,28 @@ -import os import re import json import codecs import logging +import os +from os.path import dirname, normpath, join from itertools import chain from collections import defaultdict from typing import Dict, Union, List -from os.path import dirname, normpath, join + import ebooklib from ebooklib import epub -from bs4 import BeautifulSoup, Tag from ebooklib.epub import Link, Section +from bs4 import BeautifulSoup, Tag + from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \ +from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \ update_src_links_in_images, preprocess_footnotes - class EpubConverter: def __init__(self, file, access=None, logger=None): self.file = file @@ -29,9 +30,9 @@ class EpubConverter: self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file) - self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files - self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file - self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC + self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files + self.html_href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file + self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC # toc tree structure stored as adj.list (NavPoint to list of NavPoints) # key = -1 for top level NavPoints @@ -42,8 +43,8 @@ class EpubConverter: self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {} self.internal_anchors = set() - self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed - self.href2img_bytes = {} # file path to bytes + self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed + self.img_href2img_bytes = {} # file path to bytes self.old_image_path2aws_path = {} # file path from to generated aws path self.footnotes_contents: List[str] = [] # to be sent on server as is self.noterefs: List[Tag] = [] # start of the footnote @@ -54,11 +55,11 @@ class EpubConverter: self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): file_name = x.file_name content = x.content - self.href2img_bytes[file_name] = content + self.img_href2img_bytes[file_name] = content self.logger.log('HTML files reading.') - self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() - + self.html_href2html_body_soup: Dict[str, + BeautifulSoup] = self.build_href2soup_content() self.logger.log('CSS files processing.') self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() @@ -84,12 +85,14 @@ class EpubConverter: # build simple toc from spine if needed if self.is_toc_empty(): self.build_adjacency_list_from_spine() - not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] + not_added = [ + x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] self.logger.log(f'Html documents not added to TOC: {not_added}.') self.add_not_added_files_to_adjacency_list(not_added) self.logger.log(f'Html internal links and structure processing.') self.label_chapters_ids_with_tmp_id() - self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed + # used only after parsed toc, ids from toc needed + self.process_html_soup_structure_to_line() self.process_internal_links() self.logger.log(f'Building chapters content.') self.define_chapters_content() @@ -110,7 +113,8 @@ class EpubConverter: path_to_css_from_html = css_href html_folder = dirname(html_href) - path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/') + path_to_css_from_root = normpath( + join(html_folder, path_to_css_from_html)).replace('\\', '/') css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) assert css_obj, f'Css style {css_href} was not in manifest.' css_content: str = css_obj.get_content().decode() @@ -124,14 +128,16 @@ class EpubConverter: ...2... = key2value ''' - html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files + # dictionary: href of html to related css files + html_href2css_href: defaultdict = defaultdict(list) css_href2css_content: dict = {} for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_content = item.content html_href = item.file_name soup_html_content = BeautifulSoup(html_content, features='lxml') - for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file + # check if file links to css file + for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): continue css_href = tag.attrs.get('href') @@ -144,7 +150,8 @@ class EpubConverter: for i, tag in enumerate(soup_html_content.find_all('style')): css_content = tag.string html_href2css_href[html_href].append(f'href{i}') - css_href2css_content[f'href{i}'] = build_css_content(css_content) + css_href2css_content[f'href{i}'] = build_css_content( + css_content) return html_href2css_href, css_href2css_content, @@ -153,14 +160,14 @@ class EpubConverter: This function is designed to update html_href2html_body_soup And add to html_inline_style css_style_content ''' - for href in self.html_href2html_body_soup: - if self.html_href2css_href.get(href): - css ='' - for key in self.html_href2css_href[href]: - css += self.css_href2css_content[key] - content: BeautifulSoup = self.html_href2html_body_soup[href] + for html_href in self.html_href2html_body_soup: + if self.html_href2css_href.get(html_href): + css = '' + for css_href in self.html_href2css_href[html_href]: + css += self.css_href2css_content[css_href] + content: BeautifulSoup = self.html_href2html_body_soup[html_href] content = convert_html_soup_with_css_style(content, css) - self.html_href2html_body_soup[href] = content + self.html_href2html_body_soup[html_href] = content def build_manifest_id2html_href(self): links = dict() @@ -173,18 +180,18 @@ class EpubConverter: """ self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc - key = -1 if root, value = None if leaf + key = -1 if root(top chapters), + value = None if leaf(least chapters) - :param element: [Link, tuple, list] - element that appears in TOC( usually parsed from nav.ncx) - :param lvl: level of depth + :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx) + :param lvl: level of depth """ if isinstance(element, Link): - # todo: check if link exists nav_point = NavPoint(element) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.href2subchapter_ids[nav_point.href].append(nav_point.id) + self.html_href2subchapter_ids[nav_point.href].append(nav_point.id) self.adjacency_list[nav_point] = None self.hrefs_added_to_toc.add(nav_point.href) return nav_point @@ -195,11 +202,12 @@ class EpubConverter: nav_point = NavPoint(first) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.href2subchapter_ids[nav_point.href].append(nav_point.id) + self.html_href2subchapter_ids[nav_point.href].append(nav_point.id) sub_nodes = [] for i in second: - sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) + sub_nodes.append( + self.build_adjacency_list_from_toc(i, lvl + 1)) self.adjacency_list[nav_point] = sub_nodes self.hrefs_added_to_toc.add(nav_point.href) @@ -208,39 +216,43 @@ class EpubConverter: elif isinstance(element, list) and (lvl == 0): sub_nodes = [] for i in element: - sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) + sub_nodes.append( + self.build_adjacency_list_from_toc(i, lvl + 1)) self.adjacency_list[-1] = sub_nodes else: - assert 0, f'Error. Element is not tuple/Link instance: {type(element)}' + assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' def is_toc_empty(self): + # there is no toc in ebook or no top chapters if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): return True return False def build_adjacency_list_from_spine(self): - manifest_id2href = self.build_manifest_id2html_href() + manifest_id2html_href = self.build_manifest_id2html_href() self.adjacency_list = { -1: [] } for id_, _ in self.ebooklib_book.spine: - nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) + nav_point = NavPoint( + Section(manifest_id2html_href[id_], manifest_id2html_href[id_])) self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(nav_point.href) def add_not_added_files_to_adjacency_list(self, not_added): for i, file in enumerate(not_added): - nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file)) + nav_point = NavPoint( + Section(f'To check #{i}, filename: {file}', file)) self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(file) def label_chapters_ids_with_tmp_id(self): - for href in self.html_href2html_body_soup: - ids = self.href2subchapter_ids[href] + for html_href in self.html_href2html_body_soup: + ids = self.html_href2subchapter_ids[html_href] for i in ids: - soup = self.html_href2html_body_soup[href] + soup = self.html_href2html_body_soup[html_href] tag = soup.find(id=i) new_h = soup.new_tag('tmp') new_h.attrs['class'] = 'converter-chapter-mark' @@ -249,9 +261,9 @@ class EpubConverter: def process_html_soup_structure_to_line(self): # go to line structure - for href in self.html_href2html_body_soup: - soup = self.html_href2html_body_soup[href] - self.html_href2html_body_soup[href] = unwrap_structural_tags(soup) + for html_href in self.html_href2html_body_soup: + soup = self.html_href2html_body_soup[html_href] + self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) @staticmethod def create_unique_id(href, id_): @@ -280,8 +292,10 @@ class EpubConverter: :return: """ dir_name = os.path.dirname(cur_file_path) - normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/') - full_path = [path for path in self.hrefs_added_to_toc if normed_path in path] + normed_path = os.path.normpath(os.path.join( + dir_name, href_in_link)).replace('\\', '/') + full_path = [ + path for path in self.hrefs_added_to_toc if normed_path in path] if not full_path: self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' f'While processing href in {internal_link_tag}.') @@ -291,7 +305,7 @@ class EpubConverter: if len(full_path) > 1: self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' f' while {internal_link_tag} processing. The first one will be chosen.') - + return full_path[0] def process_internal_links(self): @@ -308,13 +322,15 @@ class EpubConverter: tag.attrs['id'] = new_id # 2.a) process anchor which is a whole xhtml file - internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)') + internal_link_reg1 = re.compile( + r'(^(?!https?://).+\.(htm|html|xhtml)$)') for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): a_tag_href = internal_link_tag.attrs['href'] # find full path - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) if not a_tag_href_matched_to_toc: continue new_id = self.create_unique_id(a_tag_href_matched_to_toc, '') @@ -322,7 +338,8 @@ class EpubConverter: if new_id not in self.internal_anchors: anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] new_anchor_span = self.create_new_anchor_span(soup, new_id) - anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file + # insert a new span to the begin of the file + anchor_soup.insert(0, new_anchor_span) self.internal_anchors.add(new_id) del internal_link_tag.attrs['href'] @@ -332,20 +349,26 @@ class EpubConverter: for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): - a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#') + a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split( + '#') # find full path if a_tag_href: a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) else: - a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/') + a_tag_href_matched_to_toc = os.path.normpath( + toc_href).replace('\\', '/') + if not a_tag_href_matched_to_toc: continue - new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id) + + new_id = self.create_unique_id( + a_tag_href_matched_to_toc, a_tag_id) anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) - anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote + anchor_tags = anchor_soup.find_all(attrs={'id': new_id, }) + anchor_tags = anchor_tags or anchor_soup.find_all( + attrs={'id': a_tag_id}) # if link is a footnote if anchor_tags: if len(anchor_tags) > 1: @@ -359,7 +382,8 @@ class EpubConverter: internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' # create span to have cyclic links, link has 1 type of class, anchor another if anchor_tag.attrs['id'] not in self.internal_anchors: - new_anchor_span = self.create_new_anchor_span(soup, new_id) + new_anchor_span = self.create_new_anchor_span( + soup, new_id) anchor_tag.insert_before(new_anchor_span) self.internal_anchors.add(new_id) del anchor_tag.attrs['id'] @@ -386,11 +410,13 @@ class EpubConverter: """ if nav_point.id: soup = self.html_href2html_body_soup[nav_point.href] - chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup) + chapter_tags = get_tags_between_chapter_marks( + first_id=nav_point.id, href=nav_point.href, html_soup=soup) new_tree = BeautifulSoup('', 'html.parser') for tag in chapter_tags: new_tree.append(tag) - self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree + self.href_chapter_id2soup_html[( + nav_point.href, nav_point.id)] = new_tree if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: @@ -405,25 +431,27 @@ class EpubConverter: def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: title = nav_point.title if nav_point.id: - content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] + content: BeautifulSoup = self.href_chapter_id2soup_html[( + nav_point.href, nav_point.id)] else: content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] self.old_image_path2aws_path = update_src_links_in_images(content, - self.href2img_bytes, + self.img_href2img_bytes, path_to_html=nav_point.href, access=self.access, path2aws_path=self.old_image_path2aws_path) is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS - title_preprocessed, content_preprocessed = prepare_title_and_content(title, content, - remove_title_from_chapter=is_chapter) - + title_preprocessed = prepare_title(title) + content_preprocessed = prepare_content(title_preprocessed, content, + remove_title_from_chapter=is_chapter) sub_nodes = [] - # warning! not EpubHtmlItems won;t be added to chapter + # warning! not EpubHtmlItems won't be added to chapter if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1) + sub_chapter_item = self.node_to_livecarta_chapter_item( + sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) if self.logger: @@ -451,16 +479,16 @@ class EpubConverter: if __name__ == "__main__": logger = logging.getLogger('epub') - file_handler = logging.StreamHandler() - logger.addHandler(file_handler) - file_handler = logging.FileHandler('../epub.log', mode='w+') + stream_handler = logging.StreamHandler() + logger.addHandler(stream_handler) + file_handler = logging.FileHandler('../../epub.log', mode='w+') logger.addHandler(file_handler) logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubConverter('../../epub/Cook.epub', + json_converter = EpubConverter('../../epub/9781634259804.epub', logger=logger_object) tmp = json_converter.convert_to_dict() - with codecs.open('tmp.json', 'w', encoding='utf-8') as f: - json.dump(tmp, f, ensure_ascii=False) + with codecs.open('../../json/tmp.json', 'w', encoding='utf-8') as f: + json.dump(tmp, f, ensure_ascii=False) \ No newline at end of file diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index f842f63..2359af1 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -11,7 +11,8 @@ from src.livecarta_config import LiveCartaConfig def save_image_locally(img_file_path, img_content, book_id): folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join(folder_path, f'../json/img_{book_id}/')) + new_path = pathlib.Path(os.path.join( + folder_path, f'../json/img_{book_id}/')) new_path.mkdir(exist_ok=True) new_img_path = new_path / os.path.basename(img_file_path) @@ -23,7 +24,8 @@ def save_image_locally(img_file_path, img_content, book_id): def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): - link = access.send_image(img_file_path, doc_id=book_id, img_content=img_content) + link = access.send_image( + img_file_path, doc_id=book_id, img_content=img_content) return link @@ -37,7 +39,8 @@ def update_src_links_in_images(body_tag: Tag, for img in img_tags: path_to_img_from_html = img.attrs.get('src') html_folder = os.path.dirname(path_to_html) - path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html)).replace('\\', '/') + path_to_img_from_root = os.path.normpath(os.path.join( + html_folder, path_to_img_from_html)).replace('\\', '/') assert path_to_img_from_root in href2img_content, \ f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' @@ -47,10 +50,12 @@ def update_src_links_in_images(body_tag: Tag, if path_to_img_from_root in path2aws_path: new_folder = path2aws_path[path_to_img_from_root] else: - new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id') + new_folder = save_image_to_aws( + access, path_to_img_from_root, img_content, 'book_id') path2aws_path[path_to_img_from_root] = new_folder else: - new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id') + new_folder = save_image_locally( + path_to_img_from_root, img_content, 'book_id') img.attrs['src'] = str(new_folder) if img.attrs.get('width'): @@ -71,7 +76,8 @@ def preprocess_table(body_tag: BeautifulSoup): style = td.get('style') width = '' if style: - width_match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) + width_match = re.search( + r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) if width_match: size = width_match.group(1) units = width_match.group(2) @@ -96,10 +102,10 @@ def process_lists(body_tag): """ li_tags = body_tag.find_all("li") - for il_tag in li_tags: - if il_tag.p: - il_tag.attrs.update(il_tag.p.attrs) - il_tag.p.unwrap() + for li_tag in li_tags: + if li_tag.p: + li_tag.attrs.update(li_tag.p.attrs) + li_tag.p.unwrap() def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): @@ -111,11 +117,12 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): def clean_headings_content(content: Tag, title: str): - def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag): + def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag): if tag_to_be_removed.attrs.get('id'): insert_span_with_attrs_before_tag(body_tag, tag_to_be_removed, - id_=tag_to_be_removed.attrs.get('id'), + id_=tag_to_be_removed.attrs.get( + 'id'), class_=tag_to_be_removed.attrs.get('class')) for sub_tag in tag_to_be_removed.find_all(): @@ -136,10 +143,10 @@ def clean_headings_content(content: Tag, title: str): text = re.sub(r' +', ' ', text).strip() text = text.lower() if title == text: - _add_span_to_save_ids_for_links(child, content) + add_span_to_save_ids_for_links(child, content) child.extract() elif (title in text) and (child.name in ['h1', 'h2', 'h3']): - _add_span_to_save_ids_for_links(child, content) + add_span_to_save_ids_for_links(child, content) child.extract() break @@ -187,9 +194,12 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note """ footnotes = [] - noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'}) - bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')]) - noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags] + noterefs_tags = source_html_tag.find_all( + attrs={noteref_attr_name: 'noteref'}) + bad_noterefs_tags = set( + [tag for tag in noterefs_tags if not tag.attrs.get('href')]) + noterefs_tags = [ + tag for tag in noterefs_tags if tag not in bad_noterefs_tags] new_noterefs_tags = [] new_footnotes_tags = [] [tag.decompose() for tag in bad_noterefs_tags] @@ -204,7 +214,8 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note if len(tags) == 0: anchored_tags = list(target_html_tag.find_all(id=element_id)) if len(anchored_tags): - print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') + print( + f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') return anchored_tags else: assert 0, f'Error, No element with id: {href} found.' @@ -219,7 +230,8 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note else: target_html_tag = href2soup_html.get(file) if not target_html_tag: - print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.') + print( + f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.') continue possible_footnote = 'note|footnote|endnote|rearenote' @@ -230,11 +242,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note footnote_tag = expected_footnote_tags[0] if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': footnote_tag = footnote_tag.parent - new_noterefs_tags.append(replace_with_livecarta_anchor_tag(noteref_tag, i)) + new_noterefs_tags.append( + replace_with_livecarta_anchor_tag(noteref_tag, i)) content = footnote_tag.text # footnote_tag.decompose() footnotes.append(content) - footnote_tag = footnote_tag.find(attrs={'role': 'doc-backlink'}) or footnote_tag + footnote_tag = footnote_tag.find( + attrs={'role': 'doc-backlink'}) or footnote_tag new_footnotes_tags.append(footnote_tag) return footnotes, new_noterefs_tags, new_footnotes_tags @@ -262,7 +276,8 @@ def unwrap_structural_tags(body_tag): def _preserve_class_in_aside_tag(tag_): # to save css style inherited from class, copy class to aside tag (which is parent to tag_) # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0] + tag_class = tag_.attrs['class'] if not isinstance( + tag_.attrs['class'], list) else tag_.attrs['class'][0] if tag_.parent.name == 'aside': if not tag_.parent.attrs.get('class'): tag_.parent.attrs['class'] = tag_class @@ -272,7 +287,8 @@ def unwrap_structural_tags(body_tag): # this is for Wiley books with boxes # returns True, if
could be unwrapped - tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0] + tag_class = tag_.attrs['class'] if not isinstance( + tag_.attrs['class'], list) else tag_.attrs['class'][0] if 'feature' not in tag_class: return True child_p_tags = tag_.find_all("p") @@ -288,51 +304,56 @@ def unwrap_structural_tags(body_tag): else: return True - def add_table_to_abc_books(tag_, border, bg_color): - wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color) - def add_span_to_save_ids_for_links(tag_to_be_removed): if tag_to_be_removed.attrs.get('id'): insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, id_=tag_to_be_removed.attrs['id'], class_=tag_to_be_removed.attrs.get('class')) - structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', - 'figure', 'footer', 'iframe', 'span', 'p' - ] + def replace_div_tag_with_table(): + for div in body_tag.find_all("div"): + if div.attrs.get('class'): + div_class = div.attrs['class'] if not isinstance( + div.attrs['class'], list) else div.attrs['class'][0] + if div_class in ['C409', 'C409a']: + wrap_block_tag_with_table( + body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9') + elif div_class in ['C441', 'C816']: + wrap_block_tag_with_table( + body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8') + + if div.attrs.get('style'): + if 'background-color' in div.attrs['style']: + end_index = div.attrs['style'].find( + 'background-color') + len('background-color') + start_index_of_color = end_index + 2 + bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7] + wrap_block_tag_with_table( + body_tag, old_tag=div, width='100', border='', bg_color=bg_color) + elif div.attrs.get('style') == '': + del div.attrs['style'] + + structural_tags_names = [ + 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', + 'figure', 'footer', 'iframe', 'span', 'p' + ] + + if div.contents: + is_not_struct_tag = [ + child.name not in structural_tags_names for child in div.contents] + if all(is_not_struct_tag): + div.name = 'p' + continue + add_span_to_save_ids_for_links(div) + div.unwrap() # comments removal for tag in body_tag.find_all(): for element in tag(text=lambda text: isinstance(text, Comment)): element.extract() - for div in body_tag.find_all("div"): - if div.attrs.get('class'): - div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0] - if div_class in ['C409', 'C409a']: - add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9') - elif div_class in ['C441', 'C816']: - add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8') - - if div.attrs.get('style'): - if 'background-color' in div.attrs['style']: - end_index = div.attrs['style'].find('background-color') + len('background-color') - start_index_of_color = end_index + 2 - bg_color = div.attrs['style'][start_index_of_color:start_index_of_color+7] - add_table_to_abc_books(div, border='', bg_color=bg_color) - - if div.attrs.get('style') == '': - del div.attrs['style'] - if div.contents: - is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] - if all(is_not_struct_tag): - div.name = 'p' - continue - - add_span_to_save_ids_for_links(div) - div.unwrap() + replace_div_tag_with_table() for s in body_tag.find_all("section"): could_be_unwrapped = True @@ -348,7 +369,8 @@ def unwrap_structural_tags(body_tag): for s in body_tag.find_all("figure"): s.name = 'p' - s.attrs['style'] = "text-align: center;" # to center image inside this tag + # to center image inside this tag + s.attrs['style'] = "text-align: center;" for s in body_tag.find_all("figcaption"): add_span_to_save_ids_for_links(s) @@ -383,7 +405,8 @@ def unwrap_structural_tags(body_tag): x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases parents_marks_are_body = [x.parent == body_tag for x in marks] - assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' + assert all( + parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' heading_tag_to_p_tag(body_tag) @@ -411,7 +434,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup): :param html_soup: soup object of current file :return: list [Tag, NavigableString]; chapter's tags """ - marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'}) + marked_tags = html_soup.find( + attrs={'id': first_id, 'class': 'converter-chapter-mark'}) if marked_tags: next_tag = marked_tags.next_sibling tags = [] @@ -484,16 +508,20 @@ def preprocess_block_tags(chapter_tag): if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: clean_wiley_block(block) - color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None - color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color + color = '#DDDDDD' if block.attrs.get( + 'class') == 'feature1' else None + color = '#EEEEEE' if block.attrs.get( + 'class') == 'feature2' else color wrap_block_tag_with_table(chapter_tag, block, bg_color=color) block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) block.unwrap() for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): clean_wiley_block(future_block) - color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None - color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color + color = '#DDDDDD' if future_block.attrs.get( + 'class') == 'feature1' else None + color = '#EEEEEE' if future_block.attrs.get( + 'class') == 'feature2' else color wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) @@ -512,7 +540,8 @@ def preprocess_pre_tags(chapter_tag): new_tag = BeautifulSoup(features='lxml').new_tag("span") new_tag.attrs = pre.attrs.copy() spans = pre.find_all("span") - to_add_br = len(spans) > 1 # if in
 there are multiple , we need to add 
after each content + # if in
 there are multiple , we need to add 
after each content + to_add_br = len(spans) > 1 for child in pre.children: if isinstance(child, NavigableString): @@ -520,7 +549,8 @@ def preprocess_pre_tags(chapter_tag): sub_strings = re.split('\r\n|\n|\r', cleaned_text) for string in sub_strings: new_tag.append(NavigableString(string)) - new_tag.append(BeautifulSoup(features='lxml').new_tag('br')) + new_tag.append(BeautifulSoup( + features='lxml').new_tag('br')) else: for sub_child in child.children: if isinstance(sub_child, NavigableString): @@ -531,7 +561,8 @@ def preprocess_pre_tags(chapter_tag): cleaned_tag = child.extract() new_tag.append(cleaned_tag) if to_add_br: - new_tag.append(BeautifulSoup(features='lxml').new_tag('br')) + new_tag.append(BeautifulSoup( + features='lxml').new_tag('br')) new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \ "font-size: 14px; white-space: nowrap;" @@ -551,40 +582,41 @@ def preprocess_code_tags(chapter_tag): code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;' -def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]: +def prepare_title(title_of_chapter: str) -> str: """ Final processing/cleaning function. - - :param title: title of the chapter - :param chapter_tag: soup object - :param remove_title_from_chapter: bool - :return: tuple[str, str] """ - title_str = BeautifulSoup(title, features='lxml').string + title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r' +', ' ', title_str).rstrip() + title_str = clean_title_from_numbering(title_str) + return title_str + + +def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: + """ + Final processing/cleaning function. + """ # 0. cleaning \n to_remove = [] - for child in chapter_tag.contents: + for child in content_tag.contents: if isinstance(child, NavigableString): s = re.sub(r'([\n\t])', '', child.string) if s == '': to_remove.append(child) - [x.extract() for x in to_remove] # 1. heading removal if remove_title_from_chapter: - clean_headings_content(chapter_tag, title_str) - process_lists(chapter_tag) - preprocess_table(chapter_tag) - preprocess_code_tags(chapter_tag) - preprocess_pre_tags(chapter_tag) - preprocess_block_tags(chapter_tag) + clean_headings_content(content_tag, title_str) + process_lists(content_tag) + preprocess_table(content_tag) + preprocess_code_tags(content_tag) + preprocess_pre_tags(content_tag) + preprocess_block_tags(content_tag) + # 2. class removal - for tag in chapter_tag.find_all(recursive=True): + for tag in content_tag.find_all(recursive=True): if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', 'footnote-element']): del tag.attrs['class'] - # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) - title_str = clean_title_from_numbering(title_str) - return title_str, str(chapter_tag) + return str(content_tag) \ No newline at end of file