epub converter: prettified and comments added

2021-09-01 16:46:59 +03:00
parent c4c776ea3e
commit 50193eb25b
1 changed files with 42 additions and 14 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -63,10 +63,6 @@ def update_src_links_in_images(body_tag: Tag,
    return path2aws_path


-def preprocess_figure():
-    pass
-
-
 def preprocess_table(body_tag: BeautifulSoup):
    tables = body_tag.find_all("table")
    for table in tables:
@@ -81,10 +77,7 @@ def preprocess_table(body_tag: BeautifulSoup):
                    units = width_match.group(2)
                    width = size+'px'

-            width = td.get('width') or width
-
-            if width:
-                td.attrs['width'] = width
+            td.attrs['width'] = td.get('width') or width

            if td.attrs.get('style'):
                td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
@@ -151,7 +144,7 @@ def clean_headings_content(content: Tag, title: str):
            break


-def _preprocessing_headings(body_tag):
+def _heading_tag2p_tag(body_tag):
    """
    Function to convert all lower level headings to p tags
    """
@@ -184,8 +177,8 @@ def replace_with_livecarta_anchor_tag(anchor, i):
    return new_tag


-def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[
-    list, list, list]:
+def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
+        -> Tuple[list, list, list]:
    """
    This function should be earlier that adding fonts in pipeline.

@@ -248,6 +241,23 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note


 def unwrap_structural_tags(body_tag):
+    """
+    Main function that works with structure of html.
+    Makes changes in place.
+
+    1. Extracts tags that are not needed
+
+    2. Checks that the marks pointing to the start of a chapter are placed on one level in the html tree.
+    A mark is a tag with 'class': 'converter-chapter-mark', added while the TOC was parsed.
+    This tag must have a body_tag as a parent.
+    Otherwise, it is wrapped with some tags. Like:
+        <p> <span id='123' class='converter-chapter-mark'> </span> </p>
+
+    3. Headings that are not supported by livecarta are converted to <p>
+    4. Wraps NavigableString nodes with <p>
+    :param body_tag: Tag, soup object
+    :return: None
+    """

    def _preserve_class_in_aside_tag(tag_):
        # to save css style inherited from class, copy class to aside tag (which is parent to tag_)
@@ -362,8 +372,9 @@ def unwrap_structural_tags(body_tag):
    parents_marks_are_body = [x.parent == body_tag for x in marks]
    assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'

-    _preprocessing_headings(body_tag)
+    _heading_tag2p_tag(body_tag)

+    # wrap NavigableString with <p>
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
@@ -378,19 +389,28 @@ def unwrap_structural_tags(body_tag):


 def get_tags_between_chapter_marks(first_id, href, html_soup):
+    """
+    Starting from the mark with first_id, which corresponds to the current chapter,
+    extracts all tags that belong to the current chapter from the initial html_soup.
+
+    :param first_id: id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
+    :param href: name of current chapter's file
+    :param html_soup: soup object of the current file
+    :return: list of the chapter's nodes (Tag or NavigableString)
+    """
    marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if marked_tags:
        next_tag = marked_tags.next_sibling
        tags = []
        while next_tag:
-            # TODO: why we hve there NavString
-
            if not isinstance(next_tag, NavigableString) and\
                    (next_tag.attrs.get('class') == 'converter-chapter-mark'):
                break
            tags.append(next_tag)
            next_tag = next_tag.next_sibling

+        # remove tags between first_id and next found id
+        # save them in list for next steps
        tags = [tag.extract() for tag in tags]
        html_soup.smooth()

@@ -513,6 +533,14 @@ def preprocess_code_tags(chapter_tag):


 def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
+    """
+    Final processing/cleaning function.
+
+    :param title: title of the chapter
+    :param chapter_tag: soup object
+    :param remove_title_from_chapter: bool
+    :return: tuple[str, str]
+    """
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()