epub converter: prettified and comments added

2021-09-01 16:46:59 +03:00
parent c4c776ea3e
commit 50193eb25b
1 changed files with 42 additions and 14 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -63,10 +63,6 @@ def update_src_links_in_images(body_tag: Tag,
    return path2aws_path
 def preprocess_figure():
    # Placeholder: no figure preprocessing is implemented; the body is a no-op.
    pass
 def preprocess_table(body_tag: BeautifulSoup):
    tables = body_tag.find_all("table")
    for table in tables:
@@ -81,10 +77,7 @@ def preprocess_table(body_tag: BeautifulSoup):
                    units = width_match.group(2)
                    width = size+'px'
-            width = td.get('width') or width
+            td.attrs['width'] = td.get('width') or width
            if width:
                td.attrs['width'] = width
            if td.attrs.get('style'):
                td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
@@ -151,7 +144,7 @@ def clean_headings_content(content: Tag, title: str):
            break
-def _preprocessing_headings(body_tag):
+def _heading_tag2p_tag(body_tag):
    """
    Function to convert all lower level headings to p tags
    """
@@ -184,8 +177,8 @@ def replace_with_livecarta_anchor_tag(anchor, i):
    return new_tag
-def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[
+def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-    list, list, list]:
+        -> Tuple[list, list, list]:
    """
    This function should run earlier than adding fonts in the pipeline.
@@ -248,6 +241,23 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
 def unwrap_structural_tags(body_tag):
    """
    Main function that works with structure of html.
    Make changes inplace.
    1. Extracts tags that are not needed
    2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
    Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
    This tag must have a body_tag as a parent.
    Otherwise, it is wrapped with some tags. Like:
        <p> <span id='123' class='converter-chapter-mark'> </span> </p>
    3. Headings that are not supported by livecarta are converted to <p>
    4. Wrapping NavigableString
    :param body_tag: Tag, soup object
    :return: None
    """
    def _preserve_class_in_aside_tag(tag_):
        # to save css style inherited from class, copy class to aside tag (which is parent to tag_)
@@ -362,8 +372,9 @@ def unwrap_structural_tags(body_tag):
    parents_marks_are_body = [x.parent == body_tag for x in marks]
    assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
-    _preprocessing_headings(body_tag)
+    _heading_tag2p_tag(body_tag)
    # wrap NavigableString with <p>
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
@@ -378,19 +389,28 @@ def unwrap_structural_tags(body_tag):
 def get_tags_between_chapter_marks(first_id, href, html_soup):
    """
    Starting from the mark with first_id, which corresponds to the current chapter,
    extract all tags of that chapter from the initial html_soup.
    :param first_id: id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
    :param href: name of current chapter's file
    :param html_soup: soup object of the current file
    :return: list [Tag, NavigableString]; chapter's tags
    """
    marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if marked_tags:
        next_tag = marked_tags.next_sibling
        tags = []
        while next_tag:
            # TODO: why do we have NavigableString here?
            if not isinstance(next_tag, NavigableString) and\
                    (next_tag.attrs.get('class') == 'converter-chapter-mark'):
                break
            tags.append(next_tag)
            next_tag = next_tag.next_sibling
        # remove tags between first_id and next found id
        # save them in list for next steps
        tags = [tag.extract() for tag in tags]
        html_soup.smooth()
@@ -513,6 +533,14 @@ def preprocess_code_tags(chapter_tag):
 def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    """
    Final processing/cleaning function.
    :param title: title of the chapter
    :param chapter_tag: soup object
    :param remove_title_from_chapter: bool
    :return: tuple[str, str]
    """
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()