from html import escape


def wrap_text_with_table(main_tag, text, old_tag):
    """Replace *old_tag* with a single-cell table that holds *text*.

    Args:
        main_tag: Object exposing ``new_tag``; used to create the
            ``table``/``tbody``/``tr``/``td`` elements.
        text: Cell content; converted with ``str`` before insertion.
        old_tag: Tag that is swapped out of the tree for the new table.

    Returns:
        The outermost ``<table>`` tag that now sits where *old_tag* was.
    """
    table = main_tag.new_tag("table")
    table.attrs['border'] = '0'
    table.attrs['style'] = 'width:100%;'
    tbody = main_tag.new_tag("tbody")
    tr = main_tag.new_tag("tr")
    td = main_tag.new_tag("td")
    td.attrs['style'] = 'font-family: courier new,courier,monospace;'
    td.attrs['bgcolor'] = '#f5f5f5'
    td.insert(0, str(text))
    # Put the cell into the tree first, then build the table outwards
    # around it so the finished table ends up in old_tag's position.
    old_tag.replace_with(td)
    td.wrap(tr)
    tr.wrap(tbody)
    tbody.wrap(table)
    return table


def preprocess_pre_tags(chapter_tag):
    """Convert every ``<pre>`` in *chapter_tag* into a monospace table cell.

    Each ``<pre>`` is flattened to its text (nested markup is dropped),
    passed through ``html.escape`` and wrapped via ``wrap_text_with_table``.
    The tree is modified in place; nothing is returned.
    """
    # The former ``if not pre.children`` guard was removed: ``.children`` is
    # an iterator object and therefore always truthy, so the guarded branch
    # (a no-op ``assert 1``) could never run.
    # NOTE(review): BeautifulSoup's default output formatter also escapes
    # ``&``, ``<`` and ``>`` on serialization, so this text may end up
    # double-escaped - confirm that downstream consumers expect that.
    for pre in chapter_tag.find_all("pre"):
        wrap_text_with_table(chapter_tag, escape(pre.text), pre)


def preprocess_code_tags(chapter_tag):
    """Turn every ``<code>`` in *chapter_tag* into a styled ``<span>``.

    The tag's content is replaced by its escaped text, the tag is renamed
    to ``span`` and an inline monospace/colour style is set. The tree is
    modified in place; nothing is returned.
    """
    # Same dead ``if not code.children`` / ``assert 1`` guard removed here;
    # see the NOTE(review) in preprocess_pre_tags about double escaping.
    for code in chapter_tag.find_all("code"):
        code.string = escape(code.text)
        code.name = 'span'
        code.attrs['style'] = 'color:#c7254e; font-family: courier new,courier,monospace;'
class removal for tag in chapter_tag.find_all(recursive=True): if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']):