forked from LiveCarta/BookConverter
epub converter: fix heading levels
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from typing import List
|
||||
from typing import List, Tuple
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
@@ -87,7 +87,6 @@ def preprocess_table(body_tag: BeautifulSoup):
|
||||
|
||||
if border_sizes:
|
||||
border_size = sum(border_sizes) / len(border_sizes)
|
||||
print(border_size)
|
||||
table.attrs['border'] = f'{border_size:.2}'
|
||||
|
||||
|
||||
@@ -108,7 +107,7 @@ def clean_headings_content(content: Tag, title: str):
|
||||
for child in content.contents:
|
||||
if child.text and re.sub(r'([\n\t\xa0])', '', child.text):
|
||||
text = re.sub(r'([\n\t\xa0])', ' ', child.text)
|
||||
text = re.sub(r' +', ' ', text).rstrip()
|
||||
text = re.sub(r' +', ' ', text).strip()
|
||||
if title == text:
|
||||
child.extract()
|
||||
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
|
||||
@@ -294,29 +293,30 @@ def get_tags_between_ids(first_id, href, html_soup):
|
||||
return tags
|
||||
|
||||
|
||||
def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
||||
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
|
||||
title_str = BeautifulSoup(title, features='lxml').string
|
||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||
# 0. cleaning \n
|
||||
to_remove = []
|
||||
for child in content_tag.contents:
|
||||
for child in chapter_tag.contents:
|
||||
if isinstance(child, NavigableString):
|
||||
s = re.sub(r'([\n\t\xa0])', '', child.string)
|
||||
if s == '':
|
||||
to_remove.append(child)
|
||||
|
||||
[x.extract() for x in to_remove]
|
||||
# 1. rule#1 for heading removal
|
||||
clean_headings_content(content_tag, title_str)
|
||||
_process_lists(content_tag)
|
||||
_preprocessing_headings(content_tag)
|
||||
preprocess_table(content_tag)
|
||||
# 1. heading removal
|
||||
if remove_title_from_chapter:
|
||||
clean_headings_content(chapter_tag, title_str)
|
||||
_process_lists(chapter_tag)
|
||||
_preprocessing_headings(chapter_tag)
|
||||
preprocess_table(chapter_tag)
|
||||
# 2. class removal
|
||||
for tag in content_tag.find_all(recursive=True):
|
||||
for tag in chapter_tag.find_all(recursive=True):
|
||||
if hasattr(tag, 'attrs') and tag.attrs.get('class'):
|
||||
del tag.attrs['class']
|
||||
|
||||
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||
title_str = clean_title_from_numbering(title_str)
|
||||
return title_str, str(content_tag)
|
||||
return title_str, str(chapter_tag)
|
||||
|
||||
Reference in New Issue
Block a user