epub converter: add <pre>, <code> processing

2021-07-05 13:14:47 +03:00
parent e1ea4f5605
commit e712947d7a
1 changed files with 38 additions and 0 deletions
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -1,6 +1,7 @@
 import os
 import pathlib
 import re
+from html import escape
 from typing import List, Tuple

 from bs4 import BeautifulSoup, NavigableString, Tag
@@ -381,6 +382,41 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
    return tags


+def wrap_text_with_table(main_tag, text, old_tag):
+    """Swap old_tag for a single-cell, full-width table showing text in monospace.
+
+    main_tag is only used as the factory for new tags (new_tag); str(text)
+    becomes the cell's content. Returns the newly created <table> tag.
+    """
+    cell = main_tag.new_tag("td")
+    cell['style'] = 'font-family: courier new,courier,monospace;'
+    cell['bgcolor'] = '#f5f5f5'
+    cell.insert(0, str(text))
+    outer = main_tag.new_tag("table")
+    outer.attrs.update(border='0', style='width:100%;')
+    # Put the cell where old_tag was, then wrap outwards: td -> tr -> tbody -> table.
+    old_tag.replace_with(cell)
+    return cell.wrap(main_tag.new_tag("tr")).wrap(main_tag.new_tag("tbody")).wrap(outer)
+
+
+def preprocess_pre_tags(chapter_tag):
+    """Replace every <pre> in chapter_tag with a table cell holding its escaped text."""
+    # The former `if not pre.children` guard was dead: `children` is an iterator
+    # (always truthy) and `assert 1` can never fail, so every <pre> is converted.
+    for pre in chapter_tag.find_all("pre"):
+        wrap_text_with_table(chapter_tag, escape(pre.text), pre)
+
+
+def preprocess_code_tags(chapter_tag):
+    """Turn every <code> in chapter_tag into a styled <span> holding its escaped text."""
+    # The old `if not code.children` guard never fired (`children` is an iterator,
+    # always truthy) and its `assert 1` was a no-op, so the branch is dropped.
+    for code in chapter_tag.find_all("code"):
+        code.string = escape(code.text)  # replaces all children with one text node
+        code.name = 'span'
+        code.attrs['style'] = 'color:#c7254e; font-family: courier new,courier,monospace;'
+
+
 def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    title_str = BeautifulSoup(title, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
@@ -399,6 +435,8 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr
        clean_headings_content(chapter_tag, title_str)
    _process_lists(chapter_tag)
    preprocess_table(chapter_tag)
+    preprocess_pre_tags(chapter_tag)
+    preprocess_code_tags(chapter_tag)
    # 2. class removal
    for tag in chapter_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']):