diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 31127d6..ccfed07 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -41,9 +41,7 @@ class EpubPostprocessor: self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.logger.log('CSS files processing.') - self.html_href2css_href = {} - self.css_href2content = {} - self.build_css_content() + self.css_href2content, self.html_href2css_href = self.build_css_content() # add css self.logger.log('CSS styles adding.') self.add_css_styles2soup() @@ -84,26 +82,37 @@ class EpubPostprocessor: return nodes + def _read_css(self, css_href, html_path): + path_to_css_from_html = css_href + html_folder = dirname(html_path) + path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)) + css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) + assert css_obj, f'Css style {css_href} was not in manifest.' + css_content: str = css_obj.get_content().decode() + return css_content + def build_css_content(self): + css_href2content, html_href2css_href = {}, {} + # html_href2css_href 1-to-1, todo: 1-to-many + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_text = item.content + html_path = item.file_name soup = BeautifulSoup(html_text, features='lxml') for tag in soup.find_all('link', attrs={"type": "text/css"}): + if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): + continue css_href = tag.attrs.get('href') - self.html_href2css_href[item.file_name] = css_href - if css_href not in self.css_href2content: - path_to_css_from_html = css_href - html_folder = dirname(item.file_name) - path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)) - css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) - assert css_obj, f'Css style {css_href} was not in manifest.' - css_content: str = css_obj.get_content().decode() - self.css_href2content[css_href] = clean_css(css_content) + html_href2css_href[html_path] = css_href + if css_href not in css_href2content: + css_href2content[css_href] = clean_css(self._read_css(css_href, html_path)) for i, tag in enumerate(soup.find_all('style')): css_content = tag.string - self.html_href2css_href[item.file_name] = f'href{i}' - self.css_href2content[f'href{i}'] = clean_css(css_content) + html_href2css_href[html_path] = f'href{i}' + css_href2content[f'href{i}'] = clean_css(css_content) + + return css_href2content, html_href2css_href def add_css_styles2soup(self): for href in self.href2soup_html: