From 24210c99994f5242c3bc08a89f88e3acbe965318 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 26 Oct 2021 14:35:40 +0300 Subject: [PATCH] Add htm support in processing anchors --- src/epub_converter/css_reader.py | 8 +++-- src/epub_converter/epub_converter.py | 36 ++++++++++---------- src/epub_converter/html_epub_preprocessor.py | 15 ++++---- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index c767f0b..ff35717 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -100,11 +100,12 @@ LIVECARTA_STYLE_ATTRS = { 'background-color': [], 'background': [], 'width': [], + 'border': [], 'border-top-width': [], 'border-right-width': [], 'border-left-width': [], 'border-bottom-width': [], - 'border': [], + 'border-bottom': [], 'list-style-type': [], 'list-style-image': [], 'margin-left': [] @@ -145,6 +146,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { 'border-right-width': lambda x: x if x != '0' else '', 'border-left-width': lambda x: x if x != '0' else '', 'border-bottom-width': lambda x: x if x != '0' else '', + 'border-bottom': lambda x: x if x != '0' else '', 'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-image': lambda x: 'disc', 'margin-left': convert_indents @@ -409,9 +411,9 @@ class TagStyleConverter: @staticmethod def add_span_to_save_style_attrs_in_ul_ol(t): if t.name in ['ul', 'ol'] and t.attrs.get('style'): - styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] + styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] - check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li] + check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_ul_ol] if any(check): t.name = 'span' li_tag = BeautifulSoup(features='lxml').new_tag('ul') diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index c1e8827..646ee5b 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -254,18 +254,18 @@ class EpubConverter: self.html_href2html_body_soup[href] = unwrap_structural_tags(soup) @staticmethod - def _create_unique_id(href, id_): + def create_unique_id(href, id_): return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) @staticmethod - def _create_new_anchor_span(soup, id_): + def create_new_anchor_span(soup, id_): new_anchor_span = soup.new_tag("span") new_anchor_span.attrs['id'] = id_ new_anchor_span.attrs['class'] = 'link-anchor' new_anchor_span.string = "\xa0" return new_anchor_span - def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): + def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag): """ TOC: a/b/c.xhtml @@ -304,44 +304,44 @@ class EpubConverter: if tag.attrs.get('class') == 'footnote-element': continue - new_id = self._create_unique_id(toc_href, tag.attrs['id']) + new_id = self.create_unique_id(toc_href, tag.attrs['id']) tag.attrs['id'] = new_id # 2.a) process anchor which is a whole xhtml file - internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(html|xhtml)$)') + internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)') for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): a_tag_href = internal_link_tag.attrs['href'] # find full path - a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) if not a_tag_href_matched_to_toc: continue - new_id = self._create_unique_id(a_tag_href_matched_to_toc, '') + new_id = self.create_unique_id(a_tag_href_matched_to_toc, '') internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' if new_id not in self.internal_anchors: anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - new_anchor_span = self._create_new_anchor_span(soup, new_id) + new_anchor_span = self.create_new_anchor_span(soup, new_id) anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file self.internal_anchors.add(new_id) del internal_link_tag.attrs['href'] - # 2.b) process anchor which is a an element in xhtml file - internal_link_reg2 = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') + # 2.b) process anchor which is an element in xhtml file + internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)') for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#') # find full path if a_tag_href: - a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, - internal_link_tag) + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, + internal_link_tag) else: a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/') if not a_tag_href_matched_to_toc: continue - new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id) + new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id) anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) @@ -359,7 +359,7 @@ class EpubConverter: internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' # create span to have cyclic links, link has 1 type of class, anchor another if anchor_tag.attrs['id'] not in self.internal_anchors: - new_anchor_span = self._create_new_anchor_span(soup, new_id) + new_anchor_span = self.create_new_anchor_span(soup, new_id) anchor_tag.insert_before(new_anchor_span) self.internal_anchors.add(new_id) del anchor_tag.attrs['id'] @@ -402,7 +402,7 @@ class EpubConverter: for point in top_level_nav_points: self.build_one_chapter(point) - def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: title = nav_point.title if nav_point.id: content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] @@ -423,7 +423,7 @@ class EpubConverter: # warning! not EpubHtmlItems won;t be added to chapter if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1) + sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) if self.logger: @@ -436,7 +436,7 @@ class EpubConverter: top_level_chapters = [] for nav_point in top_level_nav_points: - chapter = self.node2livecarta_chapter_item(nav_point) + chapter = self.node_to_livecarta_chapter_item(nav_point) top_level_chapters.append(chapter) top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] @@ -458,7 +458,7 @@ if __name__ == "__main__": logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubConverter('../../epub/9781634252221.epub', + json_converter = EpubConverter('../../epub/Cook.epub', logger=logger_object) tmp = json_converter.convert_to_dict() diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 51a39e9..91bdf79 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -468,10 +468,11 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co return table -def _clean_wiley_block(block): +def clean_wiley_block(block): hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) for hr in hrs: hr.extract() + print(hr) h = block.find(re.compile("h[1-9]")) if h: h.name = "p" @@ -481,7 +482,7 @@ def _clean_wiley_block(block): def preprocess_block_tags(chapter_tag): for block in chapter_tag.find_all("blockquote"): if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: - _clean_wiley_block(block) + clean_wiley_block(block) color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color @@ -490,13 +491,13 @@ def preprocess_block_tags(chapter_tag): block.unwrap() for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(future_block) + clean_wiley_block(future_block) color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) -def _prepare_formatted(text): +def prepare_formatted(text): # replace <,> to save them as is in html code text = text.replace("<", "\x3C") text = text.replace(">", "\x3E") @@ -515,7 +516,7 @@ def preprocess_pre_tags(chapter_tag): for child in pre.children: if isinstance(child, NavigableString): - cleaned_text = _prepare_formatted(str(child)) + cleaned_text = prepare_formatted(str(child)) sub_strings = re.split('\r\n|\n|\r', cleaned_text) for string in sub_strings: new_tag.append(NavigableString(string)) @@ -523,10 +524,10 @@ def preprocess_pre_tags(chapter_tag): else: for sub_child in child.children: if isinstance(sub_child, NavigableString): - cleaned_text2 = _prepare_formatted(str(sub_child)) + cleaned_text2 = prepare_formatted(str(sub_child)) sub_child.replace_with(NavigableString(cleaned_text2)) else: - sub_child.string = _prepare_formatted(sub_child.text) + sub_child.string = prepare_formatted(sub_child.text) cleaned_tag = child.extract() new_tag.append(cleaned_tag) if to_add_br: