diff --git a/src/data_objects.py b/src/data_objects.py
index c1a9517..abcbde0 100644
--- a/src/data_objects.py
+++ b/src/data_objects.py
@@ -2,7 +2,7 @@
 import re
 from typing import Union
 from ebooklib.epub import Section, Link
-
+from livecarta_config import LawCartaConfig
 
 """
 These are data structures which form mapping from NCX to python data structures.
@@ -10,13 +10,13 @@ These are data structures which form mapping from NCX to python data structures.
 
 
 class NavPoint:
-    def __init__(self, obj: Union[Link, Section]=None, ):
+    def __init__(self, obj: Union[Link, Section] = None, ):
         self.href, self.id = self.parse_href_id(obj)
         self.title = obj.title
 
     @staticmethod
     def parse_href_id(item: Union[Link, Section]):
-        reg = '(.+\..+\#)(.+)'
+        reg = r'(.+\..+\#)(.+)'
         match = re.search(reg, item.href)
         href, div_id = None, None
         if match:
@@ -24,7 +24,7 @@ class NavPoint:
             if match.group(1):
                 href = match.group(1)[:-1]
         else:
-            reg2 = '(.+\..+)'
+            reg2 = r'(.+\..+)'
             match2 = re.search(reg2, item.href)
             if match2 and match2.group(1):
                 href = match2.group(1)
@@ -39,6 +39,14 @@ class NavPoint:
 These are data structures which form mapping to livecarta json structure.
 """
 
+atom = lambda x: not isinstance(x, list)
+nil = lambda x: not x
+car = lambda x: x[0]
+cdr = lambda x: x[1:]
+cons = lambda x, y: x + y
+
+flatten = lambda x: [x] if atom(x) else x if nil(x) else cons(*map(flatten, [car(x), cdr(x)]))
+
 
 class ChapterItem:
     def __init__(self, title, content, sub_items):
@@ -46,16 +54,30 @@ class ChapterItem:
         self.content = content
         self.sub_items = sub_items
 
-    def to_dict(self):
-        tmp = []
+    def to_dict(self, lvl=1):
+        sub_dicts = []
         if self.sub_items:
             for i in self.sub_items:
-                tmp.append(i.to_dict())
+                sub_dicts.append(i.to_dict(lvl + 1))
+
+        if lvl > LawCartaConfig.SUPPORTED_LEVELS:
+            return {
+                "title": self.title,
+                "contents": [self.content] + [x['contents'] for x in sub_dicts],
+                "sub_items": []
+            }
+
+        if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
+            return {
+                "title": self.title,
+                "contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
+                "sub_items": []
+            }
 
         return {
             "title": self.title,
             "contents": [self.content],
-            "sub_items": tmp
+            "sub_items": sub_dicts
         }
 
     def __str__(self):
diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py
index 900f22b..ecef400 100644
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -14,6 +14,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids,
     update_src_links_in_images, preprocess_footnotes
 from css_reader import clean_css, add_inline_style_to_html_soup
 
+from livecarta_config import LawCartaConfig
 
 
 class EpubPostprocessor:
@@ -209,7 +210,7 @@ class EpubPostprocessor:
         for point in nav_points:
             self.build_one_anchored_section(point)
 
-    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
+    def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
         title = node.title
         if node.id:
             content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
@@ -217,13 +218,16 @@ class EpubPostprocessor:
             content: BeautifulSoup = self.href2soup_html[node.href]
 
         update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
-        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
+
+        is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
+        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
+                                                                             remove_title_from_chapter=is_chapter)
 
 
         sub_nodes = [] # warning! not EpubHtmlItems won;t be added to chapter
         if self.adjacency_list.get(node):
             for sub_node in self.adjacency_list[node]:
-                sub_chapter_item = self.node2livecarta_chapter_item(sub_node)
+                sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1)
                 sub_nodes.append(sub_chapter_item)
 
         # print(f'Chapter: {title} is prepared.')
diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py
index 1687835..7514693 100644
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -1,7 +1,7 @@
 import os
 import pathlib
 import re
-from typing import List
+from typing import List, Tuple
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 
@@ -87,7 +87,6 @@
 
         if border_sizes:
             border_size = sum(border_sizes) / len(border_sizes)
-            print(border_size)
 
         table.attrs['border'] = f'{border_size:.2}'
 
@@ -108,7 +107,7 @@ def clean_headings_content(content: Tag, title: str):
     for child in content.contents:
         if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
             text = re.sub(r'([\n\t\xa0])', ' ', child.text)
-            text = re.sub(r' +', ' ', text).rstrip()
+            text = re.sub(r' +', ' ', text).strip()
             if title == text:
                 child.extract()
             elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
@@ -294,29 +293,30 @@ def get_tags_between_ids(first_id, href, html_soup):
     return tags
 
 
-def prepare_title_and_content(title, content_tag: BeautifulSoup):
+def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
     title_str = BeautifulSoup(title, features='lxml').string
     title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
     title_str = re.sub(r' +', ' ', title_str).rstrip()
     # 0. cleaning \n
     to_remove = []
-    for child in content_tag.contents:
+    for child in chapter_tag.contents:
         if isinstance(child, NavigableString):
             s = re.sub(r'([\n\t\xa0])', '', child.string)
             if s == '':
                 to_remove.append(child)
     [x.extract() for x in to_remove]
 
-    # 1. rule#1 for heading removal
-    clean_headings_content(content_tag, title_str)
-    _process_lists(content_tag)
-    _preprocessing_headings(content_tag)
-    preprocess_table(content_tag)
+    # 1. heading removal
+    if remove_title_from_chapter:
+        clean_headings_content(chapter_tag, title_str)
+    _process_lists(chapter_tag)
+    _preprocessing_headings(chapter_tag)
+    preprocess_table(chapter_tag)
 
     # 2. class removal
-    for tag in content_tag.find_all(recursive=True):
+    for tag in chapter_tag.find_all(recursive=True):
         if hasattr(tag, 'attrs') and tag.attrs.get('class'):
             del tag.attrs['class']
     # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
     title_str = clean_title_from_numbering(title_str)
-    return title_str, str(content_tag)
+    return title_str, str(chapter_tag)