From 552741fbb5afea7094db1967a429b1b2db76c95b Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 29 Sep 2021 18:13:09 +0300 Subject: [PATCH] upgrade inline processor & make repetition check --- src/css_reader.py | 239 ++++++++++++++++++---------------- src/epub_converter.py | 144 ++++++++++---------- src/html_epub_preprocessor.py | 2 +- 3 files changed, 202 insertions(+), 183 deletions(-) diff --git a/src/css_reader.py b/src/css_reader.py index 92f8814..6cad7eb 100644 --- a/src/css_reader.py +++ b/src/css_reader.py @@ -58,8 +58,8 @@ def convert_font_size(value): def convert_indents(value): # 30px = 3.2% = 1.25em = 23pt - positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)') - has_style_attrs = re.search(positive_text_indent_regexp, value) + text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)') + has_style_attrs = re.search(text_indent_regexp, value) if has_style_attrs: if has_style_attrs.group(1): value = value.replace(has_style_attrs.group(1), @@ -89,8 +89,8 @@ LIVECARTA_STYLE_ATTRS = { 'text-indent': [], 'font-variant': ['small-caps'], 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - 'align': [], # ??? - 'font': [], # ??? + 'align': [], + 'font': [], 'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys() if x != LiveCartaConfig.DEFAULT_FONT_NAME], 'font-size': [], @@ -182,41 +182,40 @@ def check_style_to_be_tag(style) -> List[tuple]: to_remove.append(k) return to_remove - -def update_property_to_livecarta_convention(rule, property_): - if property_.name not in LIVECARTA_STYLE_ATTRS: +def update_css_style_types_to_livecarta_convention(css_rule, style_type): + if style_type.name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file - rule.style[property_.name] = '' + css_rule.style[style_type.name] = '' return - cleaned_value = property_.value.replace('\"', '') - there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(property_.name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[property_.name] + cleaned_value = style_type.value.replace('\"', '') # value of style + there_are_constraints_on_value = LIVECARTA_STYLE_ATTRS.get(style_type.name) + value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[style_type.name] if there_are_constraints_on_value and value_not_in_possible_values_list: - # property + value not in LIVECARTA_STYLE_ATTRS, remove from css file - rule.style[property_.name] = '' + # style_type + value not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = '' else: - if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING: - func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name] - rule.style[property_.name] = func(cleaned_value) + if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: + func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] # function that converts our data + css_rule.style[style_type.name] = func(cleaned_value) +def build_css_content(css_content): + sheet = cssutils.parseString(css_content, validate=False) -def clean_css(css): - sheet = cssutils.parseString(css, validate=False) - for rule in sheet: - if rule.type == rule.STYLE_RULE: - for property_ in rule.style: - update_property_to_livecarta_convention(rule, property_) + for css_rule in sheet: + if css_rule.type == css_rule.STYLE_RULE: + for style_type in css_rule.style: + update_css_style_types_to_livecarta_convention(css_rule, style_type) css_text = sheet._getCssText().decode() return css_text class TagStyleConverter: - def __init__(self, tag, tag_with_style): - self.tag = tag # tag to be updated with style attribute - self.tag_initial_name = tag.name - self.tag_with_style = tag_with_style # tag with inline style parsed from css file + def __init__(self, tag_with_initial_style, tag_with_ultimate_style): + self.tag_with_initial_style = tag_with_initial_style # tag with inline style to be updated with style attribute + self.tag_initial_name = tag_with_initial_style.name + self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file self.style = self.preprocess_style() @staticmethod @@ -248,76 +247,83 @@ class TagStyleConverter: return style_ @staticmethod - def convert_indentions_to_px(style): + def process_indents_in_px(split_style: list) -> str: + # clean with convert_indents() style string and make new clean_style + clean_style = '' + for item in split_style: + item = item.split(':') + item[1] = convert_indents(item[1]) + clean_style += item[0] + ': ' + item[1] + '; ' + margin_left_regexp = re.compile( - r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)') + r'(margin-left:( *-*\w+);*)') text_indent_regexp = re.compile( - r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)') + r'(text-indent:( *-*\w+);*)') - has_margin_left = re.search(margin_left_regexp, style) - has_text_indent = re.search(text_indent_regexp, style) - # consider that 5% = 30px + has_margin_left = re.search(margin_left_regexp, clean_style) + has_text_indent = re.search(text_indent_regexp, clean_style) + #formula_of_indent: indent = abs(margin_left - text_indent) if has_margin_left: - hml_group = 0 num_ml = 0 if has_margin_left.group(1): - hml_group = has_margin_left.group(1) num_ml = abs(int("".join( - filter(str.isdigit, str(has_margin_left.group(2))))) * 6) - - elif has_margin_left.group(3): - hml_group = has_margin_left.group(3) - num_ml = abs(int("".join( - filter(str.isdigit, str(has_margin_left.group(4)))))) + filter(str.isdigit, str(has_margin_left.group(2)))))) if has_text_indent: if has_text_indent.group(1): num_ti = abs(int("".join( - filter(str.isdigit, str(has_text_indent.group(2))))) * 6) - style = style.replace(has_text_indent.group(1), 'text-indent: ' + + filter(str.isdigit, str(has_text_indent.group(2)))))) + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + str(abs(num_ml - num_ti)) + 'px; ') - style = style.replace(hml_group, '') - return style + clean_style = clean_style.replace(has_margin_left.group(1), '') + return clean_style - elif has_text_indent.group(3): - num_ti = abs(int("".join( - filter(str.isdigit, str(has_text_indent.group(4)))))) - style = style.replace(has_text_indent.group(3), 'text-indent: ' + - str(abs(num_ml - num_ti)) + 'px; ') - style = style.replace(hml_group, '') - return style - - style = style.replace(hml_group, 'text-indent: ' + + clean_style = clean_style.replace(has_margin_left.group(1), 'text-indent: ' + str(abs(num_ml)) + 'px; ') - return style + return clean_style elif has_text_indent: if has_text_indent.group(1): - style = style.replace(has_text_indent.group(1), 'text-indent: ' + + clean_style = clean_style.replace(has_text_indent.group(1), 'text-indent: ' + str(abs(int("".join( - filter(str.isdigit, str(has_text_indent.group(2))))) * 6)) + 'px; ') - return style - elif has_text_indent.group(3): - style = style.replace(has_text_indent.group(3), 'text-indent: ' + - str("".join( - filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ') - return style - return style + filter(str.isdigit, str(has_text_indent.group(2))))))) + 'px; ') + return clean_style + return clean_style def preprocess_style(self): - style = self.tag_with_style.attrs.get('style') + ';' - style = self.remove_white_if_no_bgcolor(style, self.tag_with_style) - style = style.replace('background:', 'background-color:') - style = style.replace('list-style-image', 'list-style-type') + ultimate_style = self.tag_with_ultimate_style.attrs.get('style') + ';' + ultimate_style = self.remove_white_if_no_bgcolor(ultimate_style, self.tag_with_ultimate_style) + ultimate_style = ultimate_style.replace('background:', 'background-color:') + ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type') - # todo: make hmtl_reader + do a repetition check with inline_style - style = self.convert_indentions_to_px(style) - # if tag had already had inline style, add this to style parsed from css - if self.tag.attrs.get('style'): - inline_style = self.convert_indentions_to_px(self.tag.attrs['style']) - style += inline_style + split_ultimate_style = ultimate_style.replace(' ', '').split(';') # make for repetition check and convert to px - return style + # check for another ; in style string in preprocess_style() + while '' in split_ultimate_style: + split_ultimate_style.remove('') + ultimate_style: str = self.process_indents_in_px(split_ultimate_style) + + if self.tag_with_initial_style.attrs.get('style'): + + initial_style = self.tag_with_initial_style.attrs['style'] + split_initial_style = initial_style.replace(' ', '').split(';') + + # check for another ; in style string in preprocess_style() + while '' in split_initial_style: + split_initial_style.remove('') + + # repetition check - if tag had already had inline style, add this to style parsed from css + repeat_styles = list(set(split_ultimate_style) & set(split_initial_style)) + for item in repeat_styles: + split_initial_style.remove(item) + + if split_initial_style: + # if initial style is not empty - start convert and add to ultimate style + print('we enter repetition check', '\n') + initial_style: str = self.process_indents_in_px(split_initial_style) + ultimate_style += initial_style + + return ultimate_style def change_attrs_with_corresponding_tags(self): # adds , , , etc @@ -328,15 +334,15 @@ class TagStyleConverter: self.style = self.style.replace(s, '') self.style = self.style.strip() if i == 0: - self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] - new_tags.append(self.tag) + self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] + new_tags.append(self.tag_with_initial_style) else: name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)] new_tag = BeautifulSoup(features='lxml').new_tag(name) new_tags[-1].wrap(new_tag) new_tags.append(new_tag) - top_tag = self.tag + top_tag = self.tag_with_initial_style if new_tags: tmp_attrs = top_tag.attrs.copy() @@ -363,21 +369,22 @@ class TagStyleConverter: p_tag = BeautifulSoup(features='lxml').new_tag('p') span_style = tag.attrs['style'] p_style = '' - possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)') - has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style) - if has_p_style_attrs: - if has_p_style_attrs.group(1): - p_style += has_p_style_attrs.group(1) - span_style = span_style.replace(has_p_style_attrs.group(1), '') - if has_p_style_attrs.group(3): - p_style += has_p_style_attrs.group(3) - span_style = span_style.replace(has_p_style_attrs.group(3), '') + for i in range(span_style.count(';')): + possible_p_attrs_regexp = re.compile(r'(text-align:( *\w+);*)|(text-indent:( *\w+);*)') + has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style) + if has_p_style_attrs: + if has_p_style_attrs.group(1): + p_style += has_p_style_attrs.group(1) + span_style = span_style.replace(has_p_style_attrs.group(1), '') + if has_p_style_attrs.group(3): + p_style += has_p_style_attrs.group(3) + span_style = span_style.replace(has_p_style_attrs.group(3), '') - p_tag.attrs['style'] = p_style + p_tag.attrs['style'] = p_style - li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') - has_li_style_attr = re.search(li_attrs_regexp, span_style) - span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '') + li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') + has_li_style_attr = re.search(li_attrs_regexp, span_style) + span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '') tag.attrs['style'] = span_style tag.wrap(p_tag) @@ -439,49 +446,53 @@ class TagStyleConverter: t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') def convert_initial_tag(self): - self.tag = self.change_attrs_with_corresponding_tags() - self.wrap_span_in_p_to_save_style_attrs(self.tag) - self.add_span_to_save_style_attrs_in_li(self.tag) - self.add_span_to_save_style_attrs_in_ul_ol(self.tag) - self.add_span_to_save_style_attrs(self.tag) - return self.tag + self.tag_with_initial_style = self.change_attrs_with_corresponding_tags() + self.wrap_span_in_p_to_save_style_attrs(self.tag_with_initial_style) + self.add_span_to_save_style_attrs_in_li(self.tag_with_initial_style) + self.add_span_to_save_style_attrs_in_ul_ol(self.tag_with_initial_style) + self.add_span_to_save_style_attrs(self.tag_with_initial_style) + return self.tag_with_initial_style -def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str): +def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') livecarta_tmp_ids = [] h_regex = f'(^h[1-9]$)' could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) - tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp) + tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp) for i, x in enumerate(tags_with_possible_style_attr): x.attrs['livecarta_id'] = i livecarta_tmp_ids.append(i) - html_with_inline_style: str = transform(str(soup1), css_text=css_text, - remove_classes=False, - external_styles=False, - allow_network=False, - disable_validation=True, - ) - soup2 = BeautifulSoup(html_with_inline_style, features='lxml') + + # here we add css styles to inline style + # sometimes in html_with_css_styles + html_with_css_styles: str = transform(str(html_soup), css_text=css_text, + remove_classes=False, + external_styles=False, + allow_network=False, + disable_validation=True, + ) + + inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') for i in livecarta_tmp_ids: - tag = soup1.find(attrs={'livecarta_id': i}) - tag_with_style = soup2.find(attrs={'livecarta_id': i}) - del tag.attrs['livecarta_id'] - if tag_with_style.attrs.get('style'): - style_converter = TagStyleConverter(tag, tag_with_style) + tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i}) + tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i}) + del tag_with_initial_style.attrs['livecarta_id'] + if tag_with_ultimate_style.attrs.get('style'): + style_converter = TagStyleConverter(tag_with_initial_style, tag_with_ultimate_style) style_converter.convert_initial_tag() - return soup1 + return html_soup if __name__ == '__main__': - file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub' + file = '../epub/9781627222174.epub' ebooklib_book = epub.read_epub(file) css_ = ebooklib_book.get_item_with_href('css/epub.css') css_ = css_.get_content().decode() - css_cleaned = clean_css(css_) + css_cleaned = build_css_content(css_) html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode() html_soup = BeautifulSoup(html_, features='lxml') - print(add_inline_style_to_html_soup(html_soup, css_cleaned)) + print(convert_html_soup_with_css_style(html_soup, css_cleaned)) diff --git a/src/epub_converter.py b/src/epub_converter.py index b86b13a..4ac4ae1 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -17,7 +17,7 @@ from data_objects import ChapterItem, NavPoint from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \ update_src_links_in_images, preprocess_footnotes -from css_reader import clean_css, add_inline_style_to_html_soup +from css_reader import build_css_content, convert_html_soup_with_css_style from livecarta_config import LiveCartaConfig from util.helpers import BookLogger @@ -29,11 +29,11 @@ class EpubConverter: self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file) - self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files + self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file - self.added_to_toc_hrefs = set() # enumerate all file paths that where added to TOC + self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC - # toc tree structure stored as adj.list (NavPoint to list of NavPoints) + # toc tree structure stored as adj.list (NavPoint to list of NavPoints) # key = -1 for top level NavPoints self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} @@ -44,7 +44,7 @@ class EpubConverter: self.internal_anchors = set() self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed self.href2img_bytes = {} # file path to bytes - self.old_image_path2_aws_path = {} # file path from to generated aws path + self.old_image_path2aws_path = {} # file path from to generated aws path self.footnotes_contents: List[str] = [] # to be sent on server as is self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote @@ -57,17 +57,18 @@ class EpubConverter: self.href2img_bytes[file_name] = content self.logger.log('HTML files reading.') - self.href2soup_html = self.build_href2soup_content() + self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() + self.logger.log('CSS files processing.') - self.css_href2content, self.html_href2css_href = self.build_css_content() + self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() self.logger.log('CSS styles adding.') - self.add_css_styles2soup() + self.add_css_styles_to_html_soup() self.logger.log('Footnotes processing.') - for href in self.href2soup_html: - content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href], - self.href2soup_html) + for href in self.html_href2html_body_soup: + content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], + self.html_href2html_body_soup) self.footnotes_contents.extend(content) self.noterefs.extend(noterefs) self.footnotes.extend(footnotes_tags) @@ -83,7 +84,7 @@ class EpubConverter: # build simple toc from spine if needed if self.is_toc_empty(): self.build_adjacency_list_from_spine() - not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] + not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] self.logger.log(f'Html documents not added to TOC: {not_added}.') self.add_not_added_files_to_adjacency_list(not_added) self.logger.log(f'Html internal links and structure processing.') @@ -96,62 +97,69 @@ class EpubConverter: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: # using EpubElements # for now just for HTML objects, as it is simplest chapter - # todo: check if other chapters exist + nodes = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_body_text = item.get_body_content() # html.parser closes tags if needed soup = BeautifulSoup(html_body_text, features='html.parser') nodes[item.file_name] = soup - return nodes - def _read_css(self, css_href, html_path): - ''' + def get_css_content(self, css_href, html_href): - ''' path_to_css_from_html = css_href - html_folder = dirname(html_path) + html_folder = dirname(html_href) path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/') css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) assert css_obj, f'Css style {css_href} was not in manifest.' css_content: str = css_obj.get_content().decode() return css_content - def build_css_content(self): - css_href2content, html_href2css_href = {}, {} - html_href2css_href = defaultdict(list) - # html_href2css_href 1-to-many + def build_html_and_css_relations(self): + ''' + This function is designed to get 2 dictionaries: + The first is css_href2css_content. It is created to connect href of css to content of css + The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html + ...2... = key2value + ''' + + html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files + css_href2css_content: dict = {} + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - html_text = item.content - html_path = item.file_name - soup = BeautifulSoup(html_text, features='lxml') - for tag in soup.find_all('link', attrs={"type": "text/css"}): + html_content = item.content + html_href = item.file_name + soup_html_content = BeautifulSoup(html_content, features='lxml') + for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): continue css_href = tag.attrs.get('href') - html_href2css_href[html_path].append(css_href) - if css_href not in css_href2content: - css_href2content[css_href] = clean_css(self._read_css(css_href, html_path)) + html_href2css_href[html_href].append(css_href) + if css_href not in css_href2css_content: + # css_href not in css_href2css_content, add to this dict + css_href2css_content[css_href] = build_css_content( + self.get_css_content(css_href, html_href)) - for i, tag in enumerate(soup.find_all('style')): + for i, tag in enumerate(soup_html_content.find_all('style')): css_content = tag.string - html_href2css_href[html_path].append(f'href{i}') - css_href2content[f'href{i}'] = clean_css(css_content) + html_href2css_href[html_href].append(f'href{i}') + css_href2css_content[f'href{i}'] = build_css_content(css_content) - return css_href2content, html_href2css_href + return html_href2css_href, css_href2css_content, - def add_css_styles2soup(self): - for href in self.href2soup_html: + def add_css_styles_to_html_soup(self): + for href in self.html_href2html_body_soup: if self.html_href2css_href.get(href): css ='' for key in self.html_href2css_href[href]: - css += self.css_href2content[key] - content: BeautifulSoup = self.href2soup_html[href] - content = add_inline_style_to_html_soup(content, css) - self.href2soup_html[href] = content + css += self.css_href2css_content[key] + content: BeautifulSoup = self.html_href2html_body_soup[href] + # todo func here to make content + content = convert_html_soup_with_css_style(content, css) + self.html_href2html_body_soup[href] = content - def build_manifest_id2href(self): + def build_manifest_id2html_href(self): links = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): links[item.id] = item.file_name @@ -160,7 +168,7 @@ class EpubConverter: def build_adjacency_list_from_toc(self, element, lvl=0): """ - self.adjacency_list builds based on TOC nested structure, got from self.ebooklib_book.toc + self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc key = -1 if root, value = None if leaf @@ -175,7 +183,7 @@ class EpubConverter: self.id_anchor_exist_in_nav_points = True self.href2subchapter_ids[nav_point.href].append(nav_point.id) self.adjacency_list[nav_point] = None - self.added_to_toc_hrefs.add(nav_point.href) + self.hrefs_added_to_toc.add(nav_point.href) return nav_point elif isinstance(element, tuple): @@ -191,7 +199,7 @@ class EpubConverter: sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) self.adjacency_list[nav_point] = sub_nodes - self.added_to_toc_hrefs.add(nav_point.href) + self.hrefs_added_to_toc.add(nav_point.href) return nav_point elif isinstance(element, list) and (lvl == 0): @@ -210,26 +218,26 @@ class EpubConverter: return False def build_adjacency_list_from_spine(self): - manifest_id2href = self.build_manifest_id2href() + manifest_id2href = self.build_manifest_id2html_href() self.adjacency_list = { -1: [] } for id_, _ in self.ebooklib_book.spine: nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) self.adjacency_list[-1].append(nav_point) - self.added_to_toc_hrefs.add(nav_point.href) + self.hrefs_added_to_toc.add(nav_point.href) def add_not_added_files_to_adjacency_list(self, not_added): for i, file in enumerate(not_added): nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file)) self.adjacency_list[-1].append(nav_point) - self.added_to_toc_hrefs.add(file) + self.hrefs_added_to_toc.add(file) def label_chapters_ids_with_tmp_id(self): - for href in self.href2soup_html: + for href in self.html_href2html_body_soup: ids = self.href2subchapter_ids[href] for i in ids: - soup = self.href2soup_html[href] + soup = self.html_href2html_body_soup[href] tag = soup.find(id=i) new_h = soup.new_tag('tmp') new_h.attrs['class'] = 'converter-chapter-mark' @@ -238,9 +246,9 @@ class EpubConverter: def process_html_soup_structure_to_line(self): # go to line structure - for href in self.href2soup_html: - soup = self.href2soup_html[href] - self.href2soup_html[href] = unwrap_structural_tags(soup) + for href in self.html_href2html_body_soup: + soup = self.html_href2html_body_soup[href] + self.html_href2html_body_soup[href] = unwrap_structural_tags(soup) @staticmethod def _create_unique_id(href, id_): @@ -270,7 +278,7 @@ class EpubConverter: """ dir_name = os.path.dirname(cur_file_path) normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/') - full_path = [path for path in self.added_to_toc_hrefs if normed_path in path] + full_path = [path for path in self.hrefs_added_to_toc if normed_path in path] if not full_path: self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' f'While processing href in {internal_link_tag}.') @@ -285,8 +293,8 @@ class EpubConverter: def process_internal_links(self): # 1. rebuild ids to be unique in all documents - for toc_href in self.added_to_toc_hrefs: - for tag in self.href2soup_html[toc_href].find_all(attrs={'id': re.compile(r'.+')}): + for toc_href in self.hrefs_added_to_toc: + for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): if tag.attrs.get('class') == 'converter-chapter-mark': continue @@ -298,8 +306,8 @@ class EpubConverter: # 2.a) process anchor which is a whole xhtml file internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)') - for toc_href in self.added_to_toc_hrefs: - soup = self.href2soup_html[toc_href] + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): a_tag_href = internal_link_tag.attrs['href'] # find full path @@ -309,7 +317,7 @@ class EpubConverter: new_id = self._create_unique_id(a_tag_href_matched_to_toc, '') internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' if new_id not in self.internal_anchors: - anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] new_anchor_span = self._create_new_anchor_span(soup, new_id) anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file self.internal_anchors.add(new_id) @@ -318,8 +326,8 @@ class EpubConverter: # 2.b) process anchor which is a an element in xhtml file internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') - for toc_href in self.added_to_toc_hrefs: - soup = self.href2soup_html[toc_href] + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#') # find full path @@ -332,7 +340,7 @@ class EpubConverter: continue new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id) - anchor_soup = self.href2soup_html[a_tag_href_matched_to_toc] + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote @@ -374,7 +382,7 @@ class EpubConverter: """ if nav_point.id: - soup = self.href2soup_html[nav_point.href] + soup = self.html_href2html_body_soup[nav_point.href] chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup) new_tree = BeautifulSoup('', 'html.parser') for tag in chapter_tags: @@ -396,13 +404,13 @@ class EpubConverter: if nav_point.id: content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] else: - content: BeautifulSoup = self.href2soup_html[nav_point.href] + content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] - self.old_image_path2_aws_path = update_src_links_in_images(content, - self.href2img_bytes, - path_to_html=nav_point.href, - access=self.access, - path2aws_path=self.old_image_path2_aws_path) + self.old_image_path2aws_path = update_src_links_in_images(content, + self.href2img_bytes, + path_to_html=nav_point.href, + access=self.access, + path2aws_path=self.old_image_path2aws_path) is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed, content_preprocessed = prepare_title_and_content(title, content, @@ -447,7 +455,7 @@ if __name__ == "__main__": logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubConverter('../epub/index_with_html.epub', + json_converter = EpubConverter('../epub/9781641050692.epub', logger=logger_object) tmp = json_converter.convert_to_dict() diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 3065171..d10e300 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -1,7 +1,7 @@ import os import pathlib import re -from typing import List, Tuple +from typing import Tuple from bs4 import BeautifulSoup, NavigableString, Tag, Comment