epub converter: add <pre>, <code> processing

This commit is contained in:
shirshasa
2021-07-05 13:14:47 +03:00
parent e1ea4f5605
commit e712947d7a

View File

@@ -1,6 +1,7 @@
import os import os
import pathlib import pathlib
import re import re
from html import escape
from typing import List, Tuple from typing import List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
@@ -381,6 +382,41 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
return tags return tags
def wrap_text_with_table(main_tag, text, old_tag):
    """Swap *old_tag* for a single-cell table that holds *text*.

    The resulting markup is ``<table><tbody><tr><td>text</td></tr></tbody></table>``
    where the cell is given a monospace font and a light grey background.

    :param main_tag: object exposing ``new_tag`` (used as the tag factory).
    :param text: cell content; it is converted with ``str()`` before insertion.
    :param old_tag: tag currently in the tree that the table takes the place of.
    :return: the newly created ``<table>`` tag.
    """
    # Create the four nested elements up front.
    table, tbody, row, cell = (
        main_tag.new_tag(name) for name in ("table", "tbody", "tr", "td")
    )

    table.attrs['border'] = '0'
    table.attrs['style'] = 'width:100%;'

    cell.attrs['style'] = 'font-family: courier new,courier,monospace;'
    cell.attrs['bgcolor'] = '#f5f5f5'
    cell.append(str(text))

    # Assemble from the outside in, then put the finished table where
    # the old tag used to be.
    row.append(cell)
    tbody.append(row)
    table.append(tbody)
    old_tag.replace_with(table)
    return table
def preprocess_pre_tags(chapter_tag):
    """Replace every ``<pre>`` inside *chapter_tag* with a one-cell table.

    The flattened text of each ``<pre>`` (``pre.text``) is HTML-escaped and
    handed to ``wrap_text_with_table``, which substitutes a table for the
    original tag. Because only the text is carried over, any markup nested
    inside the ``<pre>`` is not preserved.

    :param chapter_tag: parsed chapter; modified in place.
    """
    # The previous ``if not pre.children: assert 1, ...`` guard was dropped:
    # ``children`` is an iterator object (always truthy), so that branch was
    # unreachable, and ``assert 1`` could never fail in any case.
    for pre in chapter_tag.find_all("pre"):
        wrap_text_with_table(chapter_tag, escape(pre.text), pre)
def preprocess_code_tags(chapter_tag):
    """Turn every ``<code>`` inside *chapter_tag* into a styled ``<span>``.

    Each tag's content is replaced by its own HTML-escaped text, the tag is
    renamed to ``span`` and an inline ``style`` attribute (colour plus
    monospace font) is set on it.

    :param chapter_tag: parsed chapter; modified in place.
    """
    span_style = 'color:#c7254e; font-family: courier new,courier,monospace;'

    # The previous ``if not code.children: assert 1, ...`` guard was dropped:
    # ``children`` is an iterator object (always truthy), so that branch was
    # unreachable, and ``assert 1`` could never fail in any case.
    for code in chapter_tag.find_all("code"):
        # Assigning ``string`` discards any nested tags, keeping text only.
        code.string = escape(code.text)
        code.name = 'span'
        code.attrs['style'] = span_style
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]: def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
title_str = BeautifulSoup(title, features='lxml').string title_str = BeautifulSoup(title, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
@@ -399,6 +435,8 @@ def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_fr
clean_headings_content(chapter_tag, title_str) clean_headings_content(chapter_tag, title_str)
_process_lists(chapter_tag) _process_lists(chapter_tag)
preprocess_table(chapter_tag) preprocess_table(chapter_tag)
preprocess_pre_tags(chapter_tag)
preprocess_code_tags(chapter_tag)
# 2. class removal # 2. class removal
for tag in chapter_tag.find_all(recursive=True): for tag in chapter_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']): if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor']):