forked from LiveCarta/BookConverter
epub converter: add access object for image processing
- update headings cleaning - add h tag removal -
This commit is contained in:
@@ -1,7 +1,5 @@
|
|||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
import re
|
|
||||||
import os
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
|
|
||||||
@@ -9,7 +7,6 @@ import ebooklib
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from ebooklib.epub import Link, Section
|
from ebooklib.epub import Link, Section
|
||||||
from ebooklib.utils import debug
|
|
||||||
|
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
|
||||||
@@ -27,19 +24,22 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
|
|||||||
# todo: https://docs.python.org/3/howto/unicode.html
|
# todo: https://docs.python.org/3/howto/unicode.html
|
||||||
|
|
||||||
|
|
||||||
|
# how ebooklib looks up the TOC:
|
||||||
|
# if the spine in content.opf has a toc attribute -> the ncx file can be located -> navMap is read from it
|
||||||
|
# if it is not there, the library tries to find the nav tag in the manifest -> EpubNav.
|
||||||
|
|
||||||
class EpubPostprocessor:
|
class EpubPostprocessor:
|
||||||
def __init__(self, file):
|
def __init__(self, file, access=None):
|
||||||
self.file = file
|
self.file = file
|
||||||
|
self.access = access
|
||||||
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib
|
||||||
self.href2img_bytes = {}
|
self.href2img_bytes = {}
|
||||||
|
|
||||||
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
|
for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
|
||||||
debug(x)
|
|
||||||
file_name = x.file_name
|
file_name = x.file_name
|
||||||
content = x.content
|
content = x.content
|
||||||
# todo: check how the file path is computed in the lib
|
# todo: check how the file path is computed in the lib
|
||||||
self.href2img_bytes[file_name] = content
|
self.href2img_bytes[file_name] = content
|
||||||
|
# read html
|
||||||
self.id_anchor_exist_in_nav_points = False
|
self.id_anchor_exist_in_nav_points = False
|
||||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
||||||
self.footnotes = []
|
self.footnotes = []
|
||||||
@@ -193,7 +193,7 @@ class EpubPostprocessor:
|
|||||||
else:
|
else:
|
||||||
content: BeautifulSoup = self.href2soup_html[node.href]
|
content: BeautifulSoup = self.href2soup_html[node.href]
|
||||||
|
|
||||||
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=None)
|
preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
|
||||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
|
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
|
||||||
|
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from typing import List
|
|||||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
from src.access import Access
|
from src.access import Access
|
||||||
|
from src.config import LawCartaConfig
|
||||||
|
|
||||||
|
|
||||||
def save_image_locally(img_file_path, img_content, book_id):
|
def save_image_locally(img_file_path, img_content, book_id):
|
||||||
@@ -54,10 +55,6 @@ def preprocess_table():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def preprocess_quote():
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _process_lists(body_tag):
|
def _process_lists(body_tag):
|
||||||
"""
|
"""
|
||||||
Function to process tags <li>.
|
Function to process tags <li>.
|
||||||
@@ -71,14 +68,39 @@ def _process_lists(body_tag):
|
|||||||
il_tag.p.unwrap()
|
il_tag.p.unwrap()
|
||||||
|
|
||||||
|
|
||||||
def clean_heading_in_content(content: Tag, title: str):
|
def clean_headings_content(content: Tag, title: str):
|
||||||
for child in content.contents:
|
for child in content.contents:
|
||||||
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
||||||
if title == child.text:
|
text = re.sub(r'([\n\t\xa0])', ' ', child.text)
|
||||||
|
text = re.sub(r' +', ' ', text).rstrip()
|
||||||
|
if title == text:
|
||||||
|
child.extract()
|
||||||
|
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
|
||||||
child.extract()
|
child.extract()
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocessing_headings(body_tag):
    """
    Function to demote headings deeper than the supported level to p tags.

    Every descendant tag named h<N>, with N running from
    LawCartaConfig.SUPPORTED_LEVELS + 1 up to 9, is renamed to 'p' in place.
    Nothing is returned; body_tag itself is modified.
    """
    first_unsupported_level = LawCartaConfig.SUPPORTED_LEVELS + 1
    # NOTE(review): the level is interpolated into a regex character class, so
    # this presumably expects SUPPORTED_LEVELS + 1 to be a single digit - confirm.
    too_deep_heading = re.compile(f'^h[{first_unsupported_level}-9]$')
    for heading in body_tag.find_all(too_deep_heading):
        heading.name = 'p'
|
||||||
|
|
||||||
|
|
||||||
|
def clean_title_from_numbering(title: str):
    """
    Function to strip leading numbering from a heading title.

    Removed, in this order:
      1. leading whitespace;
      2. a run of numeric markers such as '1.', '2.3 ' or '.4';
      3. a run of single-letter markers such as 'A. ' or 'b.'.

    If the title consists of numbering only (e.g. '1984' or 'A.'), stripping
    would leave an empty string; in that case the whitespace-stripped title
    is returned unchanged so the chapter does not end up without a title.

    title: raw title text.
    Returns the title without its leading numbering.
    """
    stripped = re.sub(r'^(\s+)+', '', title)
    cleaned = re.sub(r'^(?:\.?\d+\.? ?)+', '', stripped)
    # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title)  # delete chapter numbering from the title
    cleaned = re.sub(r'^(?:[A-Za-z]\. ?)+', '', cleaned)
    # Never erase the whole title: fall back to the un-numbered-less original.
    return cleaned or stripped
|
||||||
|
|
||||||
|
|
||||||
def replace_with_livecarta_anchor_tag(anchor, i):
|
def replace_with_livecarta_anchor_tag(anchor, i):
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||||
new_tag['class'] = 'footnote-element'
|
new_tag['class'] = 'footnote-element'
|
||||||
@@ -164,7 +186,7 @@ def add_fonts():
|
|||||||
def unwrap_structural_tags(body_tag):
|
def unwrap_structural_tags(body_tag):
|
||||||
structural_tags_names = [
|
structural_tags_names = [
|
||||||
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
|
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
|
||||||
'figure', 'footer', 'iframe', 'span'
|
'figure', 'footer', 'iframe', 'span', 'p'
|
||||||
]
|
]
|
||||||
|
|
||||||
divs = body_tag.find_all("div")
|
divs = body_tag.find_all("div")
|
||||||
@@ -240,6 +262,8 @@ def get_tags_between_ids(first_id, href, html_soup):
|
|||||||
|
|
||||||
def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
||||||
title_str = BeautifulSoup(title, features='lxml').string
|
title_str = BeautifulSoup(title, features='lxml').string
|
||||||
|
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||||
|
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||||
# 0. cleaning \n
|
# 0. cleaning \n
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for child in content_tag.contents:
|
for child in content_tag.contents:
|
||||||
@@ -250,9 +274,10 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
|||||||
|
|
||||||
[x.extract() for x in to_remove]
|
[x.extract() for x in to_remove]
|
||||||
# 1. rule#1 for heading removal
|
# 1. rule#1 for heading removal
|
||||||
clean_heading_in_content(content_tag, title_str)
|
clean_headings_content(content_tag, title_str)
|
||||||
_process_lists(content_tag)
|
_process_lists(content_tag)
|
||||||
|
_preprocessing_headings(content_tag)
|
||||||
|
|
||||||
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
title_str = clean_title_from_numbering(title_str)
|
||||||
return title_str, content_str
|
return title_str, content_str
|
||||||
|
|||||||
Reference in New Issue
Block a user