epub converter: fix heading levels

2021-04-30 15:21:35 +03:00
parent d21b11f99a
commit b472c5b9f7
3 changed files with 49 additions and 23 deletions
--- a/src/data_objects.py
+++ b/src/data_objects.py
@@ -2,7 +2,7 @@ import re
 from typing import Union
 from ebooklib.epub import Section, Link
-
+from livecarta_config import LawCartaConfig
 """
 These are data structures which form mapping from NCX to python data structures.
@@ -16,7 +16,7 @@ class NavPoint:
    @staticmethod
    def parse_href_id(item: Union[Link, Section]):
-        reg = '(.+\..+\#)(.+)'
+        reg = r'(.+\..+\#)(.+)'
        match = re.search(reg, item.href)
        href, div_id = None, None
        if match:
@@ -24,7 +24,7 @@ class NavPoint:
            if match.group(1):
                href = match.group(1)[:-1]
        else:
-            reg2 = '(.+\..+)'
+            reg2 = r'(.+\..+)'
            match2 = re.search(reg2, item.href)
            if match2 and match2.group(1):
                href = match2.group(1)
@@ -39,6 +39,14 @@ class NavPoint:
 These are data structures which form mapping to livecarta json structure.
 """
 atom = lambda x: not isinstance(x, list)
 nil = lambda x: not x
 car = lambda x: x[0]
 cdr = lambda x: x[1:]
 cons = lambda x, y: x + y
 flatten = lambda x: [x] if atom(x) else x if nil(x) else cons(*map(flatten, [car(x), cdr(x)]))
 class ChapterItem:
    def __init__(self, title, content, sub_items):
@@ -46,16 +54,30 @@ class ChapterItem:
        self.content = content
        self.sub_items = sub_items
-    def to_dict(self):
+    def to_dict(self, lvl=1):
-        tmp = []
+        sub_dicts = []
        if self.sub_items:
            for i in self.sub_items:
-                tmp.append(i.to_dict())
+                sub_dicts.append(i.to_dict(lvl + 1))
        if lvl > LawCartaConfig.SUPPORTED_LEVELS:
            return {
                "title": self.title,
                "contents": [self.content] + [x['contents'] for x in sub_dicts],
                "sub_items": []
            }
        if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
            return {
                "title": self.title,
                "contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
                "sub_items": []
            }
        return {
            "title": self.title,
            "contents": [self.content],
-            "sub_items": tmp
+            "sub_items": sub_dicts
        }
    def __str__(self):
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -14,6 +14,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids,
    update_src_links_in_images, preprocess_footnotes
 from css_reader import clean_css, add_inline_style_to_html_soup
 from livecarta_config import LawCartaConfig
 class EpubPostprocessor:
@@ -209,7 +210,7 @@ class EpubPostprocessor:
            for point in nav_points:
                self.build_one_anchored_section(point)
-    def node2livecarta_chapter_item(self, node: NavPoint) -> ChapterItem:
+    def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
        title = node.title
        if node.id:
            content: BeautifulSoup = self.id_anchor2soup[(node.href, node.id)]
@@ -217,13 +218,16 @@ class EpubPostprocessor:
            content: BeautifulSoup = self.href2soup_html[node.href]
        update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
-        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)
+
        is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
                                                                             remove_title_from_chapter=is_chapter)
        sub_nodes = []
        # warning! not EpubHtmlItems won't be added to chapter
        if self.adjacency_list.get(node):
            for sub_node in self.adjacency_list[node]:
-                sub_chapter_item = self.node2livecarta_chapter_item(sub_node)
+                sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl+1)
                sub_nodes.append(sub_chapter_item)
        # print(f'Chapter: {title} is prepared.')
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -1,7 +1,7 @@
 import os
 import pathlib
 import re
-from typing import List
+from typing import List, Tuple
 from bs4 import BeautifulSoup, NavigableString, Tag
@@ -87,7 +87,6 @@ def preprocess_table(body_tag: BeautifulSoup):
        if border_sizes:
            border_size = sum(border_sizes) / len(border_sizes)
            print(border_size)
            table.attrs['border'] = f'{border_size:.2}'
@@ -108,7 +107,7 @@ def clean_headings_content(content: Tag, title: str):
    for child in content.contents:
        if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
            text = re.sub(r'([\n\t\xa0])', ' ', child.text)
-            text = re.sub(r' +', ' ', text).rstrip()
+            text = re.sub(r' +', ' ', text).strip()
            if title == text:
                child.extract()
            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
@@ -294,29 +293,30 @@ def get_tags_between_ids(first_id, href, html_soup):
    return tags
-def prepare_title_and_content(title, content_tag: BeautifulSoup):
+def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    # 0. cleaning \n
    to_remove = []
-    for child in content_tag.contents:
+    for child in chapter_tag.contents:
        if isinstance(child, NavigableString):
            s = re.sub(r'([\n\t\xa0])', '', child.string)
            if s == '':
                to_remove.append(child)
    [x.extract() for x in to_remove]
-    # 1. rule#1 for heading removal
+    # 1. heading removal
-    clean_headings_content(content_tag, title_str)
+    if remove_title_from_chapter:
-    _process_lists(content_tag)
+        clean_headings_content(chapter_tag, title_str)
-    _preprocessing_headings(content_tag)
+    _process_lists(chapter_tag)
-    preprocess_table(content_tag)
+    _preprocessing_headings(chapter_tag)
    preprocess_table(chapter_tag)
    # 2. class removal
-    for tag in content_tag.find_all(recursive=True):
+    for tag in chapter_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class'):
            del tag.attrs['class']
    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)
-    return title_str, str(content_tag)
+    return title_str, str(chapter_tag)