diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index c433c0f..c0d720c 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -1,6 +1,7 @@ import codecs import json import logging +import re from os.path import dirname, normpath, join from collections import defaultdict from typing import Dict, Union @@ -22,9 +23,9 @@ class EpubPostprocessor: def __init__(self, file, access=None, logger=None): self.file = file self.access = access - self.logger = logger + self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib - + self.internal_links_found = 0 self.logger.log('Image processing.') self.href2img_bytes = {} self.old_image_path2_aws_path = {} @@ -42,13 +43,13 @@ class EpubPostprocessor: self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() - self.logger.log('CSS processing.') + self.logger.log('CSS files processing.') self.html_href2css_href = {} self.css_href2content = {} self.build_css_content() # add css - # self.logger.log('CSS styles adding processing.') - # self.add_css_styles2soup() + self.logger.log('CSS styles adding.') + self.add_css_styles2soup() self.logger.log('Footnotes processing.') self.footnotes = [] @@ -57,16 +58,17 @@ class EpubPostprocessor: self.logger.log(f'Added {len(self.footnotes)} footnotes.') self.logger.log('TOC processing.') self.href2ids = defaultdict(list) - self.added_to_toc_hrefs = [] + self.added_to_toc_hrefs = set() self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed if not self.is_toc_valid(): self.build_adjacency_list_from_spine() not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] - self.logger.log(f'html documents not added to TOC: {not_added}') + self.logger.log(f'Html documents not added to TOC: 
{not_added}.') # read anchored blocks, split html into separate block self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed + self.process_internal_links() self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} self.build_anchor2soup() @@ -131,7 +133,7 @@ class EpubPostprocessor: self.id_anchor_exist_in_nav_points = True self.href2ids[node.href].append(node.id) self.adjacency_list[node] = None - self.added_to_toc_hrefs.append(node.href) + self.added_to_toc_hrefs.add(node.href) return node elif isinstance(element, tuple): @@ -147,7 +149,7 @@ class EpubPostprocessor: sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) self.adjacency_list[node] = sub_nodes - self.added_to_toc_hrefs.append(node.href) + self.added_to_toc_hrefs.add(node.href) return node elif isinstance(element, list) and (lvl == 0): @@ -173,7 +175,7 @@ class EpubPostprocessor: for id_, _ in self.ebooklib_book.spine: node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) self.adjacency_list[-1].append(node) - self.added_to_toc_hrefs.append(node.href) + self.added_to_toc_hrefs.add(node.href) def mark_and_line_href2soup_html(self): # mark @@ -182,8 +184,8 @@ class EpubPostprocessor: for i in ids: soup = self.href2soup_html[href] tag = soup.find(id=i) - new_h = soup.new_tag('h1') - new_h.attrs['class'] = 'internal-mark' + new_h = soup.new_tag('tmp') + new_h.attrs['class'] = 'converter-chapter-mark' new_h.attrs['id'] = i tag.insert_before(new_h) @@ -192,6 +194,64 @@ class EpubPostprocessor: soup = self.href2soup_html[href] self.href2soup_html[href] = unwrap_structural_tags(soup) + @staticmethod + def _create_unique_id(href, id_): + return re.sub(r'([^\w\s])|_|-', '', href) + id_ + + def process_internal_links(self): + # rebuild ids to be unique in all documents + for href in self.added_to_toc_hrefs: + for tag in self.href2soup_html[href].find_all(attrs={'id': re.compile(r'.+')}): + if tag.attrs.get('class') == 'converter-chapter-mark': + continue + + 
new_id = self._create_unique_id(href, tag.attrs['id']) + tag.attrs['id'] = new_id + + # write placeholder to all internal links + internal_link_reg = re.compile(r'(^.+\.(html|xhtml)\#.+)|(^\#.+)') + for href in self.added_to_toc_hrefs: + soup = self.href2soup_html[href] + for internal_link_tag in soup.find_all('a', {'href': internal_link_reg}): + href_in_link, id_in_link = internal_link_tag.attrs['href'].split('#') + if not href_in_link: + href_in_link = href + # find full path + full_path = [path for path in self.added_to_toc_hrefs if href_in_link in path] + if not full_path: + self.logger.log(f'Error in {href} file. No {href_in_link} file found in added to TOC documents. ' + f'While processing href in {internal_link_tag}.') + internal_link_tag.attrs['converter-mark'] = 'bad-link' + continue + + if len(full_path) > 1: + self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}' + f' while {internal_link_tag} processing. The first one will be chosen.') + + href_in_link = full_path[0] + new_id = self._create_unique_id(href_in_link, id_in_link) + + anchor_soup = self.href2soup_html[href_in_link] + anchor_tags = anchor_soup.find_all(attrs={'id': new_id}) + if anchor_tags: + if len(anchor_tags) > 1: + self.logger.log(f'Warning in {href}: multiple anchors: {anchor_tags} found.' + f' While processing {internal_link_tag}') + + anchor_tag = anchor_tags[0] + # if anchor is found we could add placeholder for link creation on server side. + internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' + anchor_tag.attrs['class'] = 'link-anchor' + del internal_link_tag.attrs['href'] + self.internal_links_found += 1 + + else: + internal_link_tag.attrs['converter-mark'] = 'bad-link' + if 'page' not in id_in_link: + self.logger.log(f'Error in {href}. While processing {internal_link_tag} no anchor found.' + f' Should be anchor with new id={new_id} in {href_in_link} file.' 
+ f' Old id={id_in_link}') + def build_one_anchored_section(self, node): """ к этому моементу html soup уже существует в линейном виде @@ -248,11 +308,12 @@ class EpubPostprocessor: # warning! not EpubHtmlItems won;t be added to chapter if self.adjacency_list.get(node): for sub_node in self.adjacency_list[node]: - sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1) + sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) if self.logger: - self.logger.log(f'Chapter: {title} is prepared.') + indent = ' ' * lvl + self.logger.log(f'{indent}Chapter: {title} is prepared.') return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self): @@ -264,6 +325,8 @@ class EpubPostprocessor: top_level_chapters.append(chapter) top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] + self.logger.log(f'Internal links found: {self.internal_links_found}.') + self.logger.log('End conversion.') return { "content": top_level_dict_chapters, @@ -275,6 +338,8 @@ if __name__ == "__main__": logger = logging.getLogger('epub') file_handler = logging.StreamHandler() logger.addHandler(file_handler) + file_handler = logging.FileHandler('epub.log', mode='w+') + logger.addHandler(file_handler) logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 4486f51..9b813b4 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -115,8 +115,12 @@ def _process_lists(body_tag): def clean_headings_content(content: Tag, title: str): for child in content.contents: - if child.text and re.sub(r'([\n\t\xa0])', '', child.text): - text = re.sub(r'([\n\t\xa0])', ' ', child.text) + if isinstance(child, NavigableString): + text = child + else: + text = child.text + if text and re.sub(r'([\n\t\xa0])', '', text): + text = re.sub(r'([\n\t\xa0])', ' ', text) text = re.sub(r' +', ' ', 
text).strip() if title == text: child.extract() @@ -196,10 +200,6 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note return unicode_string.strip() - def remove_internal_links_with_text(t): - for tag_a in t.find_all('a', {'href': re.compile('(^.+\.(html|xhtml)#.+)|(^#.+)')}): - tag_a.decompose() - for i, noteref_tag in enumerate(noterefs_tags): href = noteref_tag.attrs['href'] file, element_id = parse_a_tag_href(href) @@ -208,7 +208,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note else: target_html_tag = href2soup_html.get(file) if not target_html_tag: - print(f'Error. for\n{noteref_tag}\ninvalid path: {file} found.') + print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.') continue possible_footnote = 'note|footnote|endnote|rearenote' @@ -218,7 +218,6 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) footnote_tag = expected_footnote_tags[0] replace_with_livecarta_anchor_tag(noteref_tag, i) - remove_internal_links_with_text(footnote_tag) content = get_footnote_tags2str(footnote_tag) footnote_tag.decompose() @@ -227,45 +226,19 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note return footnotes -def add_fonts(): - pass - - def unwrap_structural_tags(body_tag): + + def _add_span_to_save_ids_for_links(tag_to_be_removed): + if tag_to_be_removed.attrs.get('id'): + new_tag = body_tag.new_tag("span") + new_tag.attrs['id'] = tag_to_be_removed.attrs['id'] + tag_to_be_removed.insert_before(new_tag) + structural_tags_names = [ 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' ] - - for div in body_tag.find_all("div"): - if div.contents: - is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] - if all(is_not_struct_tag): - div.name = 'p' - 
continue - div.unwrap() - - for s in body_tag.find_all("section"): - s.unwrap() - - for s in body_tag.find_all("article"): - s.unwrap() - - for s in body_tag.find_all("aside"): - s.name = 'blockquote' - - for s in body_tag.find_all("main"): - s.unwrap() - - for s in body_tag.find_all("body"): - s.unwrap() - - for s in body_tag.find_all("html"): - s.unwrap() - - for s in body_tag.find_all("header"): - s.name = 'span' - + # spans must be processed before the other tags, so that the empty id-carrying spans the converter inserts for them are not unwrapped # not all cases, if span has
s and NavigableString, it won't unwrap for s in body_tag.find_all("span"): if s.contents: @@ -274,6 +247,55 @@ def unwrap_structural_tags(body_tag): continue s.unwrap() + for div in body_tag.find_all("div"): + if div.contents: + is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] + if all(is_not_struct_tag): + div.name = 'p' + continue + + _add_span_to_save_ids_for_links(div) + div.unwrap() + + for s in body_tag.find_all("section"): + _add_span_to_save_ids_for_links(s) + s.unwrap() + + for s in body_tag.find_all("article"): + _add_span_to_save_ids_for_links(s) + s.unwrap() + + for s in body_tag.find_all("aside"): + s.name = 'blockquote' + + for s in body_tag.find_all("main"): + _add_span_to_save_ids_for_links(s) + s.unwrap() + + for s in body_tag.find_all("body"): + _add_span_to_save_ids_for_links(s) + s.unwrap() + + for s in body_tag.find_all("html"): + _add_span_to_save_ids_for_links(s) + s.unwrap() + + for s in body_tag.find_all("header"): + s.name = 'span' + + # check marks for chapter starting are on the same 1 level + marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'}) + parents_marks_are_body = [x.parent == body_tag for x in marks] + + # fix marks to be on 1 level + if not all(parents_marks_are_body): + for x in marks: + while x.parent != body_tag: + x.parent.unwrap() # warning! could reflect on formatting/internal links in some cases + + parents_marks_are_body = [x.parent == body_tag for x in marks] + assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level.' 
+ _preprocessing_headings(body_tag) for node in body_tag: @@ -290,12 +312,12 @@ def unwrap_structural_tags(body_tag): def get_tags_between_ids(first_id, href, html_soup): - h_marked = html_soup.find(attrs={'id': first_id, 'class': 'internal-mark'}) + h_marked = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'}) if h_marked: p = h_marked.next_sibling tags = [] while p: - if p.name == 'h1' and p.attrs.get('class') == 'internal-mark': + if p.name == 'tmp' and p.attrs.get('class') == 'converter-chapter-mark': break tags.append(p) p = p.next_sibling @@ -330,7 +352,7 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr preprocess_table(chapter_tag) # 2. class removal for tag in chapter_tag.find_all(recursive=True): - if hasattr(tag, 'attrs') and tag.attrs.get('class'): + if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']): del tag.attrs['class'] # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))