forked from LiveCarta/BookConverter
epub converter: fix headings levels
This commit is contained in:
@@ -2,7 +2,7 @@ import re
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ebooklib.epub import Section, Link
|
from ebooklib.epub import Section, Link
|
||||||
|
from livecarta_config import LawCartaConfig
|
||||||
|
|
||||||
"""
|
"""
|
||||||
These are data structures which form mapping from NCX to python data structures.
|
These are data structures which form mapping from NCX to python data structures.
|
||||||
@@ -16,7 +16,7 @@ class NavPoint:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_href_id(item: Union[Link, Section]):
|
def parse_href_id(item: Union[Link, Section]):
|
||||||
reg = '(.+\..+\#)(.+)'
|
reg = r'(.+\..+\#)(.+)'
|
||||||
match = re.search(reg, item.href)
|
match = re.search(reg, item.href)
|
||||||
href, div_id = None, None
|
href, div_id = None, None
|
||||||
if match:
|
if match:
|
||||||
@@ -24,7 +24,7 @@ class NavPoint:
|
|||||||
if match.group(1):
|
if match.group(1):
|
||||||
href = match.group(1)[:-1]
|
href = match.group(1)[:-1]
|
||||||
else:
|
else:
|
||||||
reg2 = '(.+\..+)'
|
reg2 = r'(.+\..+)'
|
||||||
match2 = re.search(reg2, item.href)
|
match2 = re.search(reg2, item.href)
|
||||||
if match2 and match2.group(1):
|
if match2 and match2.group(1):
|
||||||
href = match2.group(1)
|
href = match2.group(1)
|
||||||
@@ -39,6 +39,14 @@ class NavPoint:
|
|||||||
These are data structures which form mapping to livecarta json structure.
|
These are data structures which form mapping to livecarta json structure.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
atom = lambda x: not isinstance(x, list)
|
||||||
|
nil = lambda x: not x
|
||||||
|
car = lambda x: x[0]
|
||||||
|
cdr = lambda x: x[1:]
|
||||||
|
cons = lambda x, y: x + y
|
||||||
|
|
||||||
|
flatten = lambda x: [x] if atom(x) else x if nil(x) else cons(*map(flatten, [car(x), cdr(x)]))
|
||||||
|
|
||||||
|
|
||||||
class ChapterItem:
|
class ChapterItem:
|
||||||
def __init__(self, title, content, sub_items):
|
def __init__(self, title, content, sub_items):
|
||||||
@@ -46,16 +54,30 @@ class ChapterItem:
|
|||||||
self.content = content
|
self.content = content
|
||||||
self.sub_items = sub_items
|
self.sub_items = sub_items
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self, lvl=1):
|
||||||
tmp = []
|
sub_dicts = []
|
||||||
if self.sub_items:
|
if self.sub_items:
|
||||||
for i in self.sub_items:
|
for i in self.sub_items:
|
||||||
tmp.append(i.to_dict())
|
sub_dicts.append(i.to_dict(lvl + 1))
|
||||||
|
|
||||||
|
if lvl > LawCartaConfig.SUPPORTED_LEVELS:
|
||||||
|
return {
|
||||||
|
"title": self.title,
|
||||||
|
"contents": [self.content] + [x['contents'] for x in sub_dicts],
|
||||||
|
"sub_items": []
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
|
||||||
|
return {
|
||||||
|
"title": self.title,
|
||||||
|
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
|
||||||
|
"sub_items": []
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title": self.title,
|
"title": self.title,
|
||||||
"contents": [self.content],
|
"contents": [self.content],
|
||||||
"sub_items": tmp
|
"sub_items": sub_dicts
|
||||||
}
|
}
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids,
|
|||||||
update_src_links_in_images, preprocess_footnotes
|
update_src_links_in_images, preprocess_footnotes
|
||||||
|
|
||||||
from css_reader import clean_css, add_inline_style_to_html_soup
|
from css_reader import clean_css, add_inline_style_to_html_soup
|
||||||
|
from livecarta_config import LawCartaConfig
|
||||||
|
|
||||||
|
|
||||||
class EpubPostprocessor:
|
class EpubPostprocessor:
|
||||||
@@ -209,7 +210,7 @@ class EpubPostprocessor:
|
|||||||
for point in nav_points:
|
for point in nav_points:
|
||||||
self.build_one_anchored_section(point)
|
self.build_one_anchored_section(point)
|
||||||
|
|
||||||
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
|
def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
|
||||||
title = node.title
|
title = node.title
|
||||||
if node.id:
|
if node.id:
|
||||||
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
|
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
|
||||||
@@ -217,13 +218,16 @@ class EpubPostprocessor:
|
|||||||
content: BeautifulSoup = self.href2soup_html[node.href]
|
content: BeautifulSoup = self.href2soup_html[node.href]
|
||||||
|
|
||||||
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
|
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
|
||||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
|
|
||||||
|
is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
|
||||||
|
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
|
||||||
|
remove_title_from_chapter=is_chapter)
|
||||||
|
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
# warning! not EpubHtmlItems won;t be added to chapter
|
# warning! not EpubHtmlItems won;t be added to chapter
|
||||||
if self.adjacency_list.get(node):
|
if self.adjacency_list.get(node):
|
||||||
for sub_node in self.adjacency_list[node]:
|
for sub_node in self.adjacency_list[node]:
|
||||||
sub_chapter_item = self.node2livecarta_chapter_item(sub_node)
|
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1)
|
||||||
sub_nodes.append(sub_chapter_item)
|
sub_nodes.append(sub_chapter_item)
|
||||||
|
|
||||||
# print(f'Chapter: {title} is prepared.')
|
# print(f'Chapter: {title} is prepared.')
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List, Tuple
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
@@ -87,7 +87,6 @@ def preprocess_table(body_tag: BeautifulSoup):
|
|||||||
|
|
||||||
if border_sizes:
|
if border_sizes:
|
||||||
border_size = sum(border_sizes) / len(border_sizes)
|
border_size = sum(border_sizes) / len(border_sizes)
|
||||||
print(border_size)
|
|
||||||
table.attrs['border'] = f'{border_size:.2}'
|
table.attrs['border'] = f'{border_size:.2}'
|
||||||
|
|
||||||
|
|
||||||
@@ -108,7 +107,7 @@ def clean_headings_content(content: Tag, title: str):
|
|||||||
for child in content.contents:
|
for child in content.contents:
|
||||||
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
||||||
text = re.sub(r'([\n\t\xa0])', ' ', child.text)
|
text = re.sub(r'([\n\t\xa0])', ' ', child.text)
|
||||||
text = re.sub(r' +', ' ', text).rstrip()
|
text = re.sub(r' +', ' ', text).strip()
|
||||||
if title == text:
|
if title == text:
|
||||||
child.extract()
|
child.extract()
|
||||||
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
|
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
|
||||||
@@ -294,29 +293,30 @@ def get_tags_between_ids(first_id, href, html_soup):
|
|||||||
return tags
|
return tags
|
||||||
|
|
||||||
|
|
||||||
def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
|
||||||
title_str = BeautifulSoup(title, features='lxml').string
|
title_str = BeautifulSoup(title, features='lxml').string
|
||||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||||
# 0. cleaning \n
|
# 0. cleaning \n
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for child in content_tag.contents:
|
for child in chapter_tag.contents:
|
||||||
if isinstance(child, NavigableString):
|
if isinstance(child, NavigableString):
|
||||||
s = re.sub(r'([\n\t\xa0])', '', child.string)
|
s = re.sub(r'([\n\t\xa0])', '', child.string)
|
||||||
if s == '':
|
if s == '':
|
||||||
to_remove.append(child)
|
to_remove.append(child)
|
||||||
|
|
||||||
[x.extract() for x in to_remove]
|
[x.extract() for x in to_remove]
|
||||||
# 1. rule#1 for heading removal
|
# 1. heading removal
|
||||||
clean_headings_content(content_tag, title_str)
|
if remove_title_from_chapter:
|
||||||
_process_lists(content_tag)
|
clean_headings_content(chapter_tag, title_str)
|
||||||
_preprocessing_headings(content_tag)
|
_process_lists(chapter_tag)
|
||||||
preprocess_table(content_tag)
|
_preprocessing_headings(chapter_tag)
|
||||||
|
preprocess_table(chapter_tag)
|
||||||
# 2. class removal
|
# 2. class removal
|
||||||
for tag in content_tag.find_all(recursive=True):
|
for tag in chapter_tag.find_all(recursive=True):
|
||||||
if hasattr(tag, 'attrs') and tag.attrs.get('class'):
|
if hasattr(tag, 'attrs') and tag.attrs.get('class'):
|
||||||
del tag.attrs['class']
|
del tag.attrs['class']
|
||||||
|
|
||||||
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||||
title_str = clean_title_from_numbering(title_str)
|
title_str = clean_title_from_numbering(title_str)
|
||||||
return title_str, str(content_tag)
|
return title_str, str(chapter_tag)
|
||||||
|
|||||||
Reference in New Issue
Block a user