forked from LiveCarta/BookConverter
fix heading support
This commit is contained in:
27
src/book.py
27
src/book.py
@@ -28,7 +28,11 @@ class Book:
|
||||
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
|
||||
"Verdana": "verdana,geneva,sans-serif"
|
||||
}
|
||||
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
|
||||
|
||||
SUPPPORTED_LEVELS = 3
|
||||
SUPPORTED_HEADERS = {"h1", "h2", "h3"}
|
||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||
|
||||
|
||||
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
||||
self.book_id = book_id
|
||||
@@ -45,6 +49,9 @@ class Book:
|
||||
self.images = list()
|
||||
self.content_dict = dict()
|
||||
|
||||
assert self.SUPPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
||||
"Length of headers doesn't match allowd levels."
|
||||
|
||||
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
||||
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
||||
"""
|
||||
@@ -514,16 +521,16 @@ class Book:
|
||||
"""
|
||||
Function to convert all lower level headings to p tags
|
||||
"""
|
||||
header_tags = self.body_tag.find_all(re.compile("^h[4-6]$"))
|
||||
pattern = f'^h[{self.SUPPPORTED_LEVELS + 1}-9]$'
|
||||
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||
for tag in header_tags:
|
||||
tag.name = 'p'
|
||||
print(tag)
|
||||
|
||||
def _process_headings(self):
|
||||
"""
|
||||
Function to process tags <h>.
|
||||
"""
|
||||
header_tags = self.body_tag.find_all(re.compile("^h[1-6]$"))
|
||||
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
||||
for tag in header_tags:
|
||||
if tag.parent.name == "li":
|
||||
tag.parent.unwrap()
|
||||
@@ -535,18 +542,14 @@ class Book:
|
||||
if title == "":
|
||||
tag.unwrap()
|
||||
else:
|
||||
assert tag.name not in ["h4", "h5", "h6"], 'Preprocessing went wrong, there is still h4-h6 headings.'
|
||||
assert tag.name in self.SUPPORTED_HEADERS, \
|
||||
f'Preprocessing went wrong, there is still h{self.SUPPPORTED_LEVELS+1}-h9 headings.'
|
||||
# if tag.name in ["h4", "h5", "h6"]:
|
||||
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
||||
|
||||
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
||||
new_tag.string = title
|
||||
if new_tag.name == "p":
|
||||
new_tag.attrs = tag.attrs
|
||||
print(tag)
|
||||
print(new_tag)
|
||||
|
||||
tag.replace_with(new_tag)
|
||||
|
||||
def write_html_from_list(self, file_name='url_test.html'):
|
||||
@@ -711,10 +714,6 @@ class Book:
|
||||
with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(self.content_dict, f, ensure_ascii=False)
|
||||
self.log('Data has been saved to .json file.')
|
||||
|
||||
from pprint import pprint
|
||||
|
||||
pprint(self.content_dict)
|
||||
except Exception as exc:
|
||||
self.log('Error has occurred while writing json file.', logging.ERROR)
|
||||
# self.log_error_to_main_log()
|
||||
|
||||
Reference in New Issue
Block a user