epub converter: add access object for image processing

- update headings cleaning
- add h tag removal
2021-04-21 17:27:50 +03:00
parent dce0f871a8
commit ea0814fb4c
2 changed files with 42 additions and 17 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -6,6 +6,7 @@ from typing import List
 from bs4 import BeautifulSoup, NavigableString, Tag

 from src.access import Access
+from src.config import LawCartaConfig


 def save_image_locally(img_file_path, img_content, book_id):
@@ -54,10 +55,6 @@ def preprocess_table():
    pass


-def preprocess_quote():
-    pass
-
-
 def _process_lists(body_tag):
    """
    Function to process tags <li>.
@@ -71,14 +68,39 @@ def _process_lists(body_tag):
            il_tag.p.unwrap()


-def clean_heading_in_content(content: Tag, title: str):
+def clean_headings_content(content: Tag, title: str):  # inspects only the first child whose text is not just \n, \t or nbsp
    for child in content.contents:
        if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
-            if title == child.text:
+            text = re.sub(r'([\n\t\xa0])', ' ', child.text)  # newline / tab / nbsp -> single space
+            text = re.sub(r' +', ' ', text).rstrip()  # collapse runs of spaces; trims the right side only
+            if title == text:  # child text is exactly the title -> remove it from the content
+                child.extract()
+            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):  # an h1-h3 that merely contains the title is removed too
                child.extract()
            break


+def _preprocessing_headings(body_tag):
+    """
+    Rename, in place, every heading tag from h(LawCartaConfig.SUPPORTED_LEVELS + 1) up to h9 found under body_tag to <p>.
+    """
+    pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'  # assumes SUPPORTED_LEVELS is an int below 9, else the [N-9] class is wrong -- TODO confirm
+    header_tags = body_tag.find_all(re.compile(pattern))  # compiled regex is used as a tag-name filter
+    for tag in header_tags:
+        tag.name = 'p'  # only the name is reassigned; nothing else on the tag is touched here
+
+
+def clean_title_from_numbering(title: str):
+    """
+    Return title without leading whitespace, leading digit numbering ("1.2. ") and leading single-letter markers ("A. ").
+    """
+    title = re.sub(r'^(\s+)+', '', title)  # drop leading whitespace
+    title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)  # drop "1. ", "1.2." or ".3 "; NOTE(review): also eats a real leading number ("1984 ...")
+    # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title)  # delete chapter numbering from the title
+    title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)  # drop "A. " / "b." markers; NOTE(review): "U.S. ..." loses "U.S. " as well
+    return title
+
+
 def replace_with_livecarta_anchor_tag(anchor, i):
    new_tag = BeautifulSoup(features='lxml').new_tag('sup')
    new_tag['class'] = 'footnote-element'
@@ -164,7 +186,7 @@ def add_fonts():
 def unwrap_structural_tags(body_tag):
    structural_tags_names = [
        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
-        'figure', 'footer', 'iframe', 'span'
+        'figure', 'footer', 'iframe', 'span', 'p'
    ]

    divs = body_tag.find_all("div")
@@ -240,6 +262,8 @@ def get_tags_between_ids(first_id, href, html_soup):

 def prepare_title_and_content(title, content_tag: BeautifulSoup):
    title_str = BeautifulSoup(title, features='lxml').string
+    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
+    title_str = re.sub(r' +', ' ', title_str).rstrip()
    # 0. cleaning \n
    to_remove = []
    for child in content_tag.contents:
@@ -250,9 +274,10 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):

    [x.extract() for x in to_remove]
    # 1. rule#1 for heading removal
-    clean_heading_in_content(content_tag, title_str)
+    clean_headings_content(content_tag, title_str)
    _process_lists(content_tag)
+    _preprocessing_headings(content_tag)

    content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
-    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
+    title_str = clean_title_from_numbering(title_str)
    return title_str, content_str