Change processing of <pre><code>

2022-06-07 15:46:55 +03:00
parent acb2ce48c2
commit 5caec46f3c
6 changed files with 272 additions and 301 deletions
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -9,178 +9,6 @@ from src.access import Access
 from src.livecarta_config import LiveCartaConfig


-def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
-    """Function saves all images locally"""
-    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    new_path = pathlib.Path(os.path.join(
-        folder_path, f'../json/img_{book_id}/'))
-    new_path.mkdir(exist_ok=True)
-
-    new_img_path = new_path / os.path.basename(img_file_path)
-    f = open(new_img_path, 'wb+')
-    f.write(img_content)
-    f.close()
-
-    return new_img_path
-
-
-def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
-    """Function saves all images to Amazon web service"""
-    link_path = access.send_image(
-        img_file_path, doc_id=book_id, img_content=img_content)
-    return link_path
-
-
-def update_images_src_links(body_tag: BeautifulSoup,
-                            href2img_content: dict,
-                            path_to_html: str,
-                            access=None,
-                            path2aws_path: dict = None,
-                            book_id: str = None) -> dict:
-    """Function makes dictionary image_src_path -> Amazon web service_path"""
-    img_tags = body_tag.find_all('img')
-
-    for img in img_tags:
-        path_to_img_from_html = img.attrs.get('src')
-        html_folder = os.path.dirname(path_to_html)
-        path_to_img_from_root = os.path.normpath(os.path.join(
-            html_folder, path_to_img_from_html)).replace('\\', '/')
-
-        assert path_to_img_from_root in href2img_content, \
-            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
-
-        img_content = href2img_content[path_to_img_from_root]
-        if access is not None:
-            if path_to_img_from_root in path2aws_path:
-                new_folder = path2aws_path[path_to_img_from_root]
-            else:
-                new_folder = save_image_to_aws(
-                    access, path_to_img_from_root, img_content, book_id)
-                path2aws_path[path_to_img_from_root] = new_folder
-        else:
-            new_folder = save_image_locally(
-                path_to_img_from_root, img_content, 'book_id')
-
-        img.attrs['src'] = str(new_folder)
-        if img.attrs.get('width'):
-            del img.attrs['width']
-        if img.attrs.get('height'):
-            del img.attrs['height']
-        if img.attrs.get('style'):
-            del img.attrs['style']
-    return path2aws_path
-
-
-def _preprocess_table(body_tag: BeautifulSoup):
-    """Function to preprocess tables and tags(td|th|tr): style"""
-    tables = body_tag.find_all("table")
-    for table in tables:
-        t_tags = table.find_all(re.compile("td|th|tr"))
-        for t_tag in t_tags:
-            style = t_tag.get('style')
-            width = ''
-            if style:
-                width_match = re.search(
-                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
-                if width_match:
-                    size = width_match.group(1)
-                    width = size+'px'
-
-            t_tag.attrs['width'] = t_tag.get('width') or width
-
-            if t_tag.attrs.get('style'):
-                t_tag.attrs['style'] = t_tag.attrs['style'].replace(
-                    'border:0;', '')
-
-            elif t_tag.attrs.get('style') == '':
-                del t_tag.attrs['style']
-
-        if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
-            table.attrs['border'] = '1'
-
-
-def _process_lists(body_tag: BeautifulSoup):
-    """
-    Function
-    - process tags <li>.
-    - unwrap <p> tags.
-    Parameters
-    ----------
-    body_tag: Tag, soup object
-
-    Returns
-    -------
-    None
-
-    """
-    li_tags = body_tag.find_all("li")
-    for li_tag in li_tags:
-        if li_tag.p:
-            li_tag.attrs.update(li_tag.p.attrs)
-            li_tag.p.unwrap()
-
-
-def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
-    """Function inserts span before tag aren't supported by livecarta"""
-    new_tag = main_tag.new_tag("span")
-    new_tag.attrs['id'] = id_ or ''
-    new_tag.attrs['class'] = class_ or ''
-    new_tag.string = "\xa0"
-    tag.insert_before(new_tag)
-
-
-def _clean_headings_content(content: BeautifulSoup, title: str):
-    def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
-        if tag_to_be_removed.attrs.get('id'):
-            _insert_span_with_attrs_before_tag(body_tag,
-                                               tag_to_be_removed,
-                                               id_=tag_to_be_removed.attrs.get(
-                                                   'id'),
-                                               class_=tag_to_be_removed.attrs.get('class'))
-
-        for sub_tag in tag_to_be_removed.find_all():
-            if sub_tag.attrs.get('id'):
-                _insert_span_with_attrs_before_tag(body_tag,
-                                                   tag_to_be_removed,
-                                                   id_=sub_tag.attrs['id'],
-                                                   class_=sub_tag.attrs.get('class'))
-
-    title = title.lower()
-    for child in content.contents:
-        if isinstance(child, NavigableString):
-            text = child
-        else:
-            text = child.text
-        if text and re.sub(r'([\n\t\xa0])', '', text):
-            text = re.sub(r'([\n\t\xa0])', ' ', text)
-            text = re.sub(r' +', ' ', text).strip()
-            text = text.lower()
-            if title == text:
-                add_span_to_save_ids_for_links(child, content)
-                child.extract()
-            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
-                add_span_to_save_ids_for_links(child, content)
-                child.extract()
-            break
-
-
-def _heading_tag_to_p_tag(body_tag):
-    """Function to convert all lower level headings to p tags"""
-    pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
-    header_tags = body_tag.find_all(re.compile(pattern))
-    for tag in header_tags:
-        tag.name = 'p'
-
-
-def _clean_title_from_numbering(title: str):
-    """Function removes numbering from titles"""
-    title = re.sub(r'^(\s+)+', '', title)
-    # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
-    # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title)  # delete chapter numbering(letters) from the title
-    # title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
-    return title
-
-
 def _replace_with_livecarta_anchor_tag(anchor, i):
    """Function replace noteref_tag(anchor) with new livecarta tag"""
    new_tag = BeautifulSoup(features='lxml').new_tag('sup')
@@ -381,6 +209,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
            _add_span_to_save_ids_for_links(div)
            div.unwrap()

+    def _heading_tag_to_p_tag(body_tag):
+        """Rename every heading deeper than LiveCartaConfig.SUPPORTED_LEVELS (up to h9) to <p>."""
+        first_unsupported = LiveCartaConfig.SUPPORTED_LEVELS + 1
+        too_deep = re.compile(f'^h[{first_unsupported}-9]$')
+        for heading in body_tag.find_all(too_deep):
+            heading.name = 'p'
+
    # comments removal
    for tag in body_tag.find_all():
        for element in tag(text=lambda text: isinstance(text, Comment)):
@@ -497,6 +332,248 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
    return tags


+def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
+    """Pass one image to access.send_image (doc_id=book_id) and return the link path it gives back."""
+    return access.send_image(
+        img_file_path, doc_id=book_id, img_content=img_content
+    )
+
+
+def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
+    """Write img_content under its base file name into img_<book_id>/ and return the new path.
+
+    The target folder is ``../json/img_<book_id>/`` relative to the parent of this
+    file's folder; it is created with any missing parents before the write.
+    """
+    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    new_path = pathlib.Path(os.path.join(folder_path, f'../json/img_{book_id}/'))
+    new_path.mkdir(parents=True, exist_ok=True)
+    new_img_path = new_path / os.path.basename(img_file_path)
+    with open(new_img_path, 'wb') as img_file:  # closed even when the write raises
+        img_file.write(img_content)
+    return new_img_path
+
+
+def update_images_src_links(body_tag: BeautifulSoup,
+                            href2img_content: dict,
+                            path_to_html: str,
+                            access=None,
+                            path2aws_path: dict = None,
+                            book_id: str = None) -> dict:
+    """Store every <img> of body_tag and point its src at the stored copy.
+
+    href2img_content maps image paths (from the book root) to content; relative
+    src values resolve against the folder of path_to_html. With access the image
+    goes through save_image_to_aws and path2aws_path caches the result, otherwise
+    it is written by save_image_locally. Returns path2aws_path ({} when omitted).
+    Raises AssertionError if an image is missing from href2img_content.
+    """
+    if path2aws_path is None:  # the None default used to break the `in` check below
+        path2aws_path = {}
+    html_folder = os.path.dirname(path_to_html)
+    for img in body_tag.find_all('img'):
+        path_to_img_from_html = img.attrs.get('src')
+        if not path_to_img_from_html:
+            continue  # an <img> without src cannot be resolved to a file
+        path_to_img_from_root = os.path.normpath(os.path.join(
+            html_folder, path_to_img_from_html)).replace('\\', '/')
+        assert path_to_img_from_root in href2img_content, \
+            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
+        img_content = href2img_content[path_to_img_from_root]
+        if access is None:
+            # NOTE(review): the literal 'book_id' (not the argument) names the local folder
+            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
+        elif path_to_img_from_root in path2aws_path:
+            new_folder = path2aws_path[path_to_img_from_root]
+        else:
+            new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, book_id)
+            path2aws_path[path_to_img_from_root] = new_folder
+        img.attrs['src'] = str(new_folder)
+        for attr in ('width', 'height', 'style'):  # drop the presentational attributes
+            img.attrs.pop(attr, None)
+    return path2aws_path
+
+
+def _clean_title_from_numbering(title: str):
+    """Return title without its leading whitespace.
+
+    Despite the name, numbering is left alone: the patterns that stripped
+    digits, roman numerals and single letters are disabled in this version.
+    """
+    return title.lstrip()
+
+
+def prepare_title(title_of_chapter: str) -> str:
+    """Return the chapter title as plain text: markup dropped, whitespace runs collapsed."""
+    # get_text() always yields a str; .string is None for empty or multi-node markup
+    title_str = BeautifulSoup(title_of_chapter, features='lxml').get_text()
+    title_str = re.sub(r'[\n\t\xa0]', ' ', title_str)
+    title_str = re.sub(r' +', ' ', title_str).rstrip()
+    return _clean_title_from_numbering(title_str)
+
+
+def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
+    """Put a <span> carrying id_/class_ ('' when falsy) right before tag; main_tag builds it."""
+    placeholder = main_tag.new_tag("span")
+    placeholder.attrs.update({'id': id_ or '', 'class': class_ or ''})
+    # a non-breaking space is the span's only content
+    placeholder.string = "\xa0"
+    tag.insert_before(placeholder)
+
+
+def _clean_headings_content(content: BeautifulSoup, title: str):
+    """Drop the first non-blank child of content when it repeats the chapter title.
+
+    The child goes away if its normalised, lower-cased text equals the title, or
+    contains it while the child is an h1-h3 tag. Ids of a removed tag and of its
+    descendants survive on <span> placeholders inserted before it. Only the
+    first child with non-blank text is examined.
+    """
+    def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
+        own_id = tag_to_be_removed.attrs.get('id')
+        if own_id:
+            _insert_span_with_attrs_before_tag(
+                body_tag, tag_to_be_removed,
+                id_=own_id, class_=tag_to_be_removed.attrs.get('class'))
+        for sub_tag in tag_to_be_removed.find_all():
+            if not sub_tag.attrs.get('id'):
+                continue
+            _insert_span_with_attrs_before_tag(
+                body_tag, tag_to_be_removed,
+                id_=sub_tag.attrs['id'], class_=sub_tag.attrs.get('class'))
+
+    blank_chars = re.compile(r'([\n\t\xa0])')
+    wanted = title.lower()
+    for child in content.contents:
+        raw = child if isinstance(child, NavigableString) else child.text
+        if not (raw and blank_chars.sub('', raw)):
+            continue
+        shown = re.sub(r' +', ' ', blank_chars.sub(' ', raw)).strip().lower()
+        if wanted == shown or (wanted in shown and child.name in ['h1', 'h2', 'h3']):
+            if isinstance(child, Tag):  # a bare string has no attrs to read ids from
+                add_span_to_save_ids_for_links(child, content)
+            child.extract()
+        break
+
+
+def _process_lists(body_tag: BeautifulSoup):
+    """
+    Function
+    - copies the attributes of the first <p> found in each <li> onto that <li>.
+    - unwraps that <p>, leaving its children in place.
+    Parameters
+    ----------
+    body_tag: Tag, soup object
+
+    Returns
+    -------
+    None
+    """
+    for list_item in body_tag.find_all("li"):
+        paragraph = list_item.p
+        if not paragraph:
+            continue
+        list_item.attrs.update(paragraph.attrs)
+        paragraph.unwrap()
+
+
+def _preprocess_table(body_tag: BeautifulSoup):
+    """Normalise width, style and border of every table in body_tag.
+
+    Each td/th/tr gets a width attribute: its own one, else the width found in
+    its style with the unit written as px, else ''. 'border:0;' is cut from
+    styles, empty styles are deleted, and a missing or zero table border is
+    set to '1'.
+    """
+    # (?:^|[^-]) accepts a style that starts with "width" while still skipping
+    # hyphenated properties such as max-width
+    # NOTE: a pt value keeps its number and is only relabelled as px
+    width_in_style = re.compile(r"(?:^|[^-])width: ?(\d+\.?\d*)(p[tx])")
+    for table in body_tag.find_all("table"):
+        # names are listed exactly: a regex filter is applied with search() and
+        # would also select tags such as <thead> or <strong>
+        for t_tag in table.find_all(["td", "th", "tr"]):
+            style = t_tag.get('style')
+            width_match = width_in_style.search(style) if style else None
+            width = width_match.group(1) + 'px' if width_match else ''
+            t_tag.attrs['width'] = t_tag.get('width') or width
+            if style:
+                t_tag.attrs['style'] = style.replace('border:0;', '')
+            elif style == '':
+                del t_tag.attrs['style']
+        if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
+            table.attrs['border'] = '1'
+
+
+def _preprocess_code_tags(chapter_tag: BeautifulSoup):
+    """
+    Function
+    - renames <code>, <kbd>, <var> tags that are not direct children of <pre> to span
+    - adds the monospace style to the ones directly inside <pre> that have no style yet
+    Parameters
+    ----------
+    chapter_tag: Tag, soup object
+
+    Returns
+    -------
+    None
+    """
+    # exact names: a regex filter is matched with search(), i.e. on any part of a tag name
+    for code in chapter_tag.find_all(["code", "kbd", "var"]):
+        if code.parent.name != "pre":
+            code.name = "span"
+            continue
+        # the tag sits directly in <pre>: keep its name, give it a default style
+        if not code.attrs.get('style'):
+            code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
+
+
+def _prepare_formatted(text: str) -> str:
+    """Swap spaces for non-breaking spaces, tabs for four of them, '𝑓' for a fixed 4-char string."""
+    substitutions = str.maketrans({
+        '\t': "\xa0\xa0\xa0\xa0",  # a tab counts as four non-breaking spaces
+        ' ': "\xa0",
+        '𝑓': "\xf0\x9d\x91\x93",
+    })
+    return text.translate(substitutions)
+
+
+def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
+    """
+    Function preprocessing <pre> tags
+    Wrap everything inside the tag with <code> if it holds no <code>, <kbd> or <var> yet
+    Parameters
+    ----------
+    chapter_tag: Tag, soup object
+
+    Returns
+    ----------
+    None
+        chapter_tag is modified in place
+    """
+    for pre in chapter_tag.find_all("pre"):
+        # a list matches the tag names exactly; the single string "code|kbd|var"
+        # is compared as one literal tag name and therefore never matched
+        if pre.find_all(["code", "kbd", "var"]):
+            continue
+        code = chapter_tag.new_tag("code")
+        # move every item of pre into code, keeping the order
+        for content in list(pre.contents):
+            code.append(content.extract())
+        # pre is empty now, code becomes its only child
+        pre.append(code)
+
+
+def _clean_wiley_block(block):
+    """Drop <p> tags whose class matches ".+ hr"; turn the first tag matching h[1-9] into <p> after a <br>."""
+    for rule in block.find_all("p", attrs={"class": re.compile(".+ hr")}):
+        rule.extract()
+    heading = block.find(re.compile("h[1-9]"))
+    if heading:
+        heading.name = "p"
+        heading.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
+
+
 def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
    """Function wraps <block> with <table>"""
    table = main_tag.new_tag("table")
@@ -517,16 +594,6 @@ def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_c
    return table


-def _clean_wiley_block(block):
-    hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
-    for hr in hrs:
-        hr.extract()
-    h = block.find(re.compile("h[1-9]"))
-    if h:
-        h.name = "p"
-        h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
-
-
 def _preprocess_block_tags(chapter_tag: Tag):
    """Function preprocessing <block> tags"""
    for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
@@ -548,114 +615,6 @@ def _preprocess_block_tags(chapter_tag: Tag):
        _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)


-def _prepare_formatted(text: str) -> str:
-    """Function replaces special symbols with their Unicode representation"""
-    text = text.replace("<", "\x3C")
-    text = text.replace(">", "\x3E")
-    text = text.replace('\t', "\xa0 \xa0 ")  # &nbsp; &nbsp;
-    text = text.replace(' ', "\xa0")
-    text = text.replace('𝑓', "\xf0\x9d\x91\x93")
-    return text
-
-
-def _wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
-    """Function wraps <span> with <table>"""
-    table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
-        "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
-    table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;'
-    td.attrs['bgcolor'] = '#f5f5f5'
-    # td.attrs['border-radius'] = '4px'
-    span_tag.wrap(td)
-    td.wrap(tr)
-    tr.wrap(tbody)
-    tbody.wrap(table)
-    return table
-
-
-def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
-    """
-    Function preprocessing <pre> tags
-    Parameters
-    ----------
-    chapter_tag: Tag, soup object
-
-    Steps
-    ----------
-    1. Process NavigableString
-    2. Process Tags and their children
-
-    """
-    for pre in chapter_tag.find_all("pre"):
-        new_tag = BeautifulSoup(features='lxml').new_tag("span")
-        new_tag.attrs = pre.attrs.copy()
-        new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
-                                 "font-size: 14px; white-space: nowrap;"
-        # if in <pre> there are multiple <span>, we need to add <br> after each content
-        to_add_br = len(pre.find_all("span")) > 1
-        copy_contents = pre.contents[:]
-        for child in copy_contents:
-            # Navigable String
-            if isinstance(child, NavigableString):
-                cleaned_text = _prepare_formatted(str(child))
-                sub_strings = re.split('\r\n|\n|\r', cleaned_text)
-                for string in sub_strings[:-1]:
-                    new_tag.append(NavigableString(string))
-                    new_tag.append(BeautifulSoup(
-                        features='lxml').new_tag('br'))
-                new_tag.append(NavigableString(sub_strings[-1]))
-            # Tag
-            else:
-                for sub_child in child.children:
-                    if isinstance(sub_child, NavigableString):
-                        cleaned_text = _prepare_formatted(str(sub_child))
-                        sub_child.replace_with(NavigableString(cleaned_text))
-                    else:
-                        sub_child.string = _prepare_formatted(sub_child.text)
-                cleaned_tag = child.extract()
-                new_tag.append(cleaned_tag)
-                if to_add_br:
-                    new_tag.append(BeautifulSoup(
-                        features='lxml').new_tag('br'))
-        pre.replace_with(new_tag)
-        table = _wrap_preformatted_span_with_table(chapter_tag, new_tag)
-        # add <p> to save brs
-        p_for_br = chapter_tag.new_tag("p")
-        p_for_br.string = "\xa0"
-        table.insert_after(p_for_br)
-
-
-def _preprocess_code_tags(chapter_tag: BeautifulSoup):
-    """
-    Function
-    - transform <code>, <kdb>, <var> tags into span
-    - add code style to this tags
-    Parameters
-    ----------
-    chapter_tag: Tag, soup object
-
-    Returns
-    -------
-    None
-
-    """
-    for code in chapter_tag.find_all(re.compile("code|kbd|var")):
-        code.name = "span"
-        if code.parent.name == "pre":
-            continue
-        # if tags aren't in pre and don't have style
-        if not code.attrs.get('style'):
-            code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
-
-
-def prepare_title(title_of_chapter: str) -> str:
-    """Function finalise processing/cleaning title"""
-    title_str = BeautifulSoup(title_of_chapter, features='lxml').string
-    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
-    title_str = re.sub(r' +', ' ', title_str).rstrip()
-    title_str = _clean_title_from_numbering(title_str)
-    return title_str
-
-
 def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Function finalise processing/cleaning content