diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 6d32940..dc83b62 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -1,7 +1,5 @@ import codecs import json -import re -import os from collections import defaultdict from typing import Dict, Union @@ -9,7 +7,6 @@ import ebooklib from bs4 import BeautifulSoup from ebooklib import epub from ebooklib.epub import Link, Section -from ebooklib.utils import debug from src.data_objects import ChapterItem, NavPoint from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ @@ -27,19 +24,23 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ # todo: https://docs.python.org/3/howto/unicode.html +# поиск toc в epublib: +# если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap +# если его там нет, пробуют искать nav tag в manifest -> EpubNav. + class EpubPostprocessor: - def __init__(self, file): + def __init__(self, file, access=None): self.file = file + self.access = access self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.href2img_bytes = {} for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): - debug(x) file_name = x.file_name content = x.content # todo: check how file path is count in lib self.href2img_bytes[file_name] = content - + # read html self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.footnotes = [] @@ -193,7 +193,7 @@ class EpubPostprocessor: else: content: BeautifulSoup = self.href2soup_html[node.href] - preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None) + preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access) title_preprocessed, content_preprocessed = prepare_title_and_content(title, content) sub_nodes = [] diff --git a/src/html_epub_preprocessor.py 
b/src/html_epub_preprocessor.py index c615f35..ef372c3 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -6,6 +6,7 @@ from typing import List from bs4 import BeautifulSoup, NavigableString, Tag from src.access import Access +from src.config import LawCartaConfig def save_image_locally(img_file_path, img_content, book_id): @@ -54,10 +55,6 @@ def preprocess_table(): pass -def preprocess_quote(): - pass - - def _process_lists(body_tag): """ Function to process tags
  • . @@ -71,14 +68,39 @@ def _process_lists(body_tag): il_tag.p.unwrap() -def clean_heading_in_content(content: Tag, title: str): +def clean_headings_content(content: Tag, title: str): for child in content.contents: if child.text and re.sub(r'([\n\t\xa0])', '', child.text): - if title == child.text: + text = re.sub(r'([\n\t\xa0])', ' ', child.text) + text = re.sub(r' +', ' ', text).rstrip() + if title == text: + child.extract() + elif (title in text) and (child.name in ['h1', 'h2', 'h3']): child.extract() break +def _preprocessing_headings(body_tag): + """ + Function to convert all lower level headings to p tags + """ + pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$' + header_tags = body_tag.find_all(re.compile(pattern)) + for tag in header_tags: + tag.name = 'p' + + +def clean_title_from_numbering(title: str): + """ + Function to remove digits from headers. + """ + title = re.sub(r'^(\s+)+', '', title) + title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) + # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title + title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) + return title + + def replace_with_livecarta_anchor_tag(anchor, i): new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' @@ -164,7 +186,7 @@ def add_fonts(): def unwrap_structural_tags(body_tag): structural_tags_names = [ 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', - 'figure', 'footer', 'iframe', 'span' + 'figure', 'footer', 'iframe', 'span', 'p' ] divs = body_tag.find_all("div") @@ -240,6 +262,8 @@ def get_tags_between_ids(first_id, href, html_soup): def prepare_title_and_content(title, content_tag: BeautifulSoup): title_str = BeautifulSoup(title, features='lxml').string + title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) + title_str = re.sub(r' +', ' ', title_str).rstrip() # 0. 
cleaning \n to_remove = [] for child in content_tag.contents: @@ -250,9 +274,10 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup): [x.extract() for x in to_remove] # 1. rule#1 for heading removal - clean_heading_in_content(content_tag, title_str) + clean_headings_content(content_tag, title_str) _process_lists(content_tag) + _preprocessing_headings(content_tag) content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) - title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) + title_str = clean_title_from_numbering(title_str) return title_str, content_str