epub converter: fix heading levels

This commit is contained in:
shirshasa
2021-04-30 15:21:35 +03:00
parent d21b11f99a
commit b472c5b9f7
3 changed files with 49 additions and 23 deletions

View File

@@ -2,7 +2,7 @@ import re
from typing import Union from typing import Union
from ebooklib.epub import Section, Link from ebooklib.epub import Section, Link
from livecarta_config import LawCartaConfig
""" """
These are data structures which form mapping from NCX to python data structures. These are data structures which form mapping from NCX to python data structures.
@@ -10,13 +10,13 @@ These are data structures which form mapping from NCX to python data structures.
class NavPoint: class NavPoint:
def __init__(self, obj: Union[Link, Section]=None, ): def __init__(self, obj: Union[Link, Section] = None, ):
self.href, self.id = self.parse_href_id(obj) self.href, self.id = self.parse_href_id(obj)
self.title = obj.title self.title = obj.title
@staticmethod @staticmethod
def parse_href_id(item: Union[Link, Section]): def parse_href_id(item: Union[Link, Section]):
reg = '(.+\..+\#)(.+)' reg = r'(.+\..+\#)(.+)'
match = re.search(reg, item.href) match = re.search(reg, item.href)
href, div_id = None, None href, div_id = None, None
if match: if match:
@@ -24,7 +24,7 @@ class NavPoint:
if match.group(1): if match.group(1):
href = match.group(1)[:-1] href = match.group(1)[:-1]
else: else:
reg2 = '(.+\..+)' reg2 = r'(.+\..+)'
match2 = re.search(reg2, item.href) match2 = re.search(reg2, item.href)
if match2 and match2.group(1): if match2 and match2.group(1):
href = match2.group(1) href = match2.group(1)
@@ -39,6 +39,14 @@ class NavPoint:
These are data structures which form mapping to livecarta json structure. These are data structures which form mapping to livecarta json structure.
""" """
# Lisp-style list helpers used to flatten nested ``contents`` lists when
# collapsing chapter levels beyond the supported nesting depth.
# (Rewritten as ``def``s: assigning a lambda to a name violates PEP 8 E731
# and leaves the helpers undocumented.)

def atom(x):
    """Return True if *x* is not a list (i.e. a leaf value)."""
    return not isinstance(x, list)


def nil(x):
    """Return True if *x* is empty/falsy (the recursion base case)."""
    return not x


def car(x):
    """Return the first element of list *x*."""
    return x[0]


def cdr(x):
    """Return list *x* without its first element."""
    return x[1:]


def cons(x, y):
    """Concatenate two lists."""
    return x + y


def flatten(x):
    """Recursively flatten an arbitrarily nested list into a flat list.

    A non-list value becomes a one-element list; an empty list stays empty.
    E.g. ``flatten([1, [2, [3]], 4]) == [1, 2, 3, 4]``.
    """
    if atom(x):
        return [x]
    if nil(x):
        return x
    return cons(flatten(car(x)), flatten(cdr(x)))
class ChapterItem: class ChapterItem:
def __init__(self, title, content, sub_items): def __init__(self, title, content, sub_items):
@@ -46,16 +54,30 @@ class ChapterItem:
self.content = content self.content = content
self.sub_items = sub_items self.sub_items = sub_items
def to_dict(self): def to_dict(self, lvl=1):
tmp = [] sub_dicts = []
if self.sub_items: if self.sub_items:
for i in self.sub_items: for i in self.sub_items:
tmp.append(i.to_dict()) sub_dicts.append(i.to_dict(lvl + 1))
if lvl > LawCartaConfig.SUPPORTED_LEVELS:
return {
"title": self.title,
"contents": [self.content] + [x['contents'] for x in sub_dicts],
"sub_items": []
}
if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
return {
"title": self.title,
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
"sub_items": []
}
return { return {
"title": self.title, "title": self.title,
"contents": [self.content], "contents": [self.content],
"sub_items": tmp "sub_items": sub_dicts
} }
def __str__(self): def __str__(self):

View File

@@ -14,6 +14,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids,
update_src_links_in_images, preprocess_footnotes update_src_links_in_images, preprocess_footnotes
from css_reader import clean_css, add_inline_style_to_html_soup from css_reader import clean_css, add_inline_style_to_html_soup
from livecarta_config import LawCartaConfig
class EpubPostprocessor: class EpubPostprocessor:
@@ -209,7 +210,7 @@ class EpubPostprocessor:
for point in nav_points: for point in nav_points:
self.build_one_anchored_section(point) self.build_one_anchored_section(point)
def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem: def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
title = node.title title = node.title
if node.id: if node.id:
content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)] content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
@@ -217,13 +218,16 @@ class EpubPostprocessor:
content: BeautifulSoup = self.href2soup_html[node.href] content: BeautifulSoup = self.href2soup_html[node.href]
update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access) update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
remove_title_from_chapter=is_chapter)
sub_nodes = [] sub_nodes = []
# warning! non-EpubHtmlItems won't be added to chapter # warning! non-EpubHtmlItems won't be added to chapter
if self.adjacency_list.get(node): if self.adjacency_list.get(node):
for sub_node in self.adjacency_list[node]: for sub_node in self.adjacency_list[node]:
sub_chapter_item = self.node2livecarta_chapter_item(sub_node) sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1)
sub_nodes.append(sub_chapter_item) sub_nodes.append(sub_chapter_item)
# print(f'Chapter: {title} is prepared.') # print(f'Chapter: {title} is prepared.')

View File

@@ -1,7 +1,7 @@
import os import os
import pathlib import pathlib
import re import re
from typing import List from typing import List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
@@ -87,7 +87,6 @@ def preprocess_table(body_tag: BeautifulSoup):
if border_sizes: if border_sizes:
border_size = sum(border_sizes) / len(border_sizes) border_size = sum(border_sizes) / len(border_sizes)
print(border_size)
table.attrs['border'] = f'{border_size:.2}' table.attrs['border'] = f'{border_size:.2}'
@@ -108,7 +107,7 @@ def clean_headings_content(content: Tag, title: str):
for child in content.contents: for child in content.contents:
if child.text and re.sub(r'([\n\t\xa0])', '', child.text): if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
text = re.sub(r'([\n\t\xa0])', ' ', child.text) text = re.sub(r'([\n\t\xa0])', ' ', child.text)
text = re.sub(r' +', ' ', text).rstrip() text = re.sub(r' +', ' ', text).strip()
if title == text: if title == text:
child.extract() child.extract()
elif (title in text) and (child.name in ['h1', 'h2', 'h3']): elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
@@ -294,29 +293,30 @@ def get_tags_between_ids(first_id, href, html_soup):
return tags return tags
def prepare_title_and_content(title, content_tag: BeautifulSoup): def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
title_str = BeautifulSoup(title, features='lxml').string title_str = BeautifulSoup(title, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip() title_str = re.sub(r' +', ' ', title_str).rstrip()
# 0. cleaning \n # 0. cleaning \n
to_remove = [] to_remove = []
for child in content_tag.contents: for child in chapter_tag.contents:
if isinstance(child, NavigableString): if isinstance(child, NavigableString):
s = re.sub(r'([\n\t\xa0])', '', child.string) s = re.sub(r'([\n\t\xa0])', '', child.string)
if s == '': if s == '':
to_remove.append(child) to_remove.append(child)
[x.extract() for x in to_remove] [x.extract() for x in to_remove]
# 1. rule#1 for heading removal # 1. heading removal
clean_headings_content(content_tag, title_str) if remove_title_from_chapter:
_process_lists(content_tag) clean_headings_content(chapter_tag, title_str)
_preprocessing_headings(content_tag) _process_lists(chapter_tag)
preprocess_table(content_tag) _preprocessing_headings(chapter_tag)
preprocess_table(chapter_tag)
# 2. class removal # 2. class removal
for tag in content_tag.find_all(recursive=True): for tag in chapter_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class'): if hasattr(tag, 'attrs') and tag.attrs.get('class'):
del tag.attrs['class'] del tag.attrs['class']
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = clean_title_from_numbering(title_str) title_str = clean_title_from_numbering(title_str)
return title_str, str(content_tag) return title_str, str(chapter_tag)