epub converter: add access object for image processing

- update headings cleaning
- add h tag removal
-
This commit is contained in:
shirshasa
2021-04-21 17:27:50 +03:00
parent dce0f871a8
commit ea0814fb4c
2 changed files with 42 additions and 17 deletions

View File

@@ -1,7 +1,5 @@
import codecs import codecs
import json import json
import re
import os
from collections import defaultdict from collections import defaultdict
from typing import Dict, Union from typing import Dict, Union
@@ -9,7 +7,6 @@ import ebooklib
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ebooklib import epub from ebooklib import epub
from ebooklib.epub import Link, Section from ebooklib.epub import Link, Section
from ebooklib.utils import debug
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
@@ -27,19 +24,22 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
# todo: https://docs.python.org/3/howto/unicode.html # todo: https://docs.python.org/3/howto/unicode.html
# TOC lookup in ebooklib:
# if the spine in content.opf has a toc attribute -> the ncx file can be located -> navMap is read from it
# if the attribute is missing, the library tries to find a nav tag in the manifest -> EpubNav.
class EpubPostprocessor: class EpubPostprocessor:
def __init__(self, file): def __init__(self, file, access=None):
self.file = file self.file = file
self.access = access
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
self.href2img_bytes = {} self.href2img_bytes = {}
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
debug(x)
file_name = x.file_name file_name = x.file_name
content = x.content content = x.content
# todo: check how file path is count in lib # todo: check how file path is count in lib
self.href2img_bytes[file_name] = content self.href2img_bytes[file_name] = content
# read html
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.footnotes = [] self.footnotes = []
@@ -193,7 +193,7 @@ class EpubPostprocessor:
else: else:
content: BeautifulSoup = self.href2soup_html[node.href] content: BeautifulSoup = self.href2soup_html[node.href]
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None) preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content) title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
sub_nodes = [] sub_nodes = []

View File

@@ -6,6 +6,7 @@ from typing import List
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
from src.access import Access from src.access import Access
from src.config import LawCartaConfig
def save_image_locally(img_file_path, img_content, book_id): def save_image_locally(img_file_path, img_content, book_id):
@@ -54,10 +55,6 @@ def preprocess_table():
pass pass
def preprocess_quote():
pass
def _process_lists(body_tag): def _process_lists(body_tag):
""" """
Function to process tags <li>. Function to process tags <li>.
@@ -71,14 +68,39 @@ def _process_lists(body_tag):
il_tag.p.unwrap() il_tag.p.unwrap()
def clean_heading_in_content(content: Tag, title: str): def clean_headings_content(content: Tag, title: str):
for child in content.contents: for child in content.contents:
if child.text and re.sub(r'([\n\t\xa0])', '', child.text): if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
if title == child.text: text = re.sub(r'([\n\t\xa0])', ' ', child.text)
text = re.sub(r' +', ' ', text).rstrip()
if title == text:
child.extract()
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
child.extract() child.extract()
break break
def _preprocessing_headings(body_tag):
    """
    Rename every heading tag below the supported depth to a <p> tag.

    Tag names are matched against ``h<N>`` where N runs from
    ``LawCartaConfig.SUPPORTED_LEVELS + 1`` up to 9. Matching tags are
    renamed in place, so ``body_tag`` is mutated and nothing is returned.
    """
    # The character class only behaves as a digit range while
    # SUPPORTED_LEVELS + 1 is a single digit.
    heading_name_regex = re.compile(f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$')
    for heading in body_tag.find_all(heading_name_regex):
        heading.name = 'p'
def clean_title_from_numbering(title: str):
    """
    Strip leading numbering from a heading title.

    The prefixes are removed in this order:
      1. leading whitespace;
      2. dotted numeric numbering such as ``1.``, ``.2`` or ``1.2.3 ``;
      3. single-letter enumerators such as ``A. `` or ``b.``.

    A title that consists only of such prefixes (for example ``"1984"``)
    comes back as an empty string.
    """
    # Disabled rule for Roman-numeral chapter prefixes:
    # r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ '
    leading_prefix_patterns = (
        r'^(\s+)+',
        r'^(?:\.?\d+\.? ?)+',
        r'^(?:[A-Za-z]\. ?)+',
    )
    # Order matters: each rule is anchored at the start of what the
    # previous rule left behind.
    for prefix_pattern in leading_prefix_patterns:
        title = re.sub(prefix_pattern, '', title)
    return title
def replace_with_livecarta_anchor_tag(anchor, i): def replace_with_livecarta_anchor_tag(anchor, i):
new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
@@ -164,7 +186,7 @@ def add_fonts():
def unwrap_structural_tags(body_tag): def unwrap_structural_tags(body_tag):
structural_tags_names = [ structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span' 'figure', 'footer', 'iframe', 'span', 'p'
] ]
divs = body_tag.find_all("div") divs = body_tag.find_all("div")
@@ -240,6 +262,8 @@ def get_tags_between_ids(first_id, href, html_soup):
def prepare_title_and_content(title, content_tag: BeautifulSoup): def prepare_title_and_content(title, content_tag: BeautifulSoup):
title_str = BeautifulSoup(title, features='lxml').string title_str = BeautifulSoup(title, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip()
# 0. cleaning \n # 0. cleaning \n
to_remove = [] to_remove = []
for child in content_tag.contents: for child in content_tag.contents:
@@ -250,9 +274,10 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
[x.extract() for x in to_remove] [x.extract() for x in to_remove]
# 1. rule#1 for heading removal # 1. rule#1 for heading removal
clean_heading_in_content(content_tag, title_str) clean_headings_content(content_tag, title_str)
_process_lists(content_tag) _process_lists(content_tag)
_preprocessing_headings(content_tag)
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = clean_title_from_numbering(title_str)
return title_str, content_str return title_str, content_str