forked from LiveCarta/BookConverter
fix heading support
This commit is contained in:
@@ -13,5 +13,7 @@ COPY . /app/
|
|||||||
|
|
||||||
WORKDIR /app/
|
WORKDIR /app/
|
||||||
VOLUME /app/logs
|
VOLUME /app/logs
|
||||||
|
# VOLUME /app/html
|
||||||
|
# VOLUME /app/json
|
||||||
|
|
||||||
CMD python /app/src/util/check_packs.py && python /app/src/util/check_dirs.py && python /app/src/consumer.py
|
CMD python /app/src/util/check_packs.py && python /app/src/util/check_dirs.py && python /app/src/consumer.py
|
||||||
|
|||||||
27
src/book.py
27
src/book.py
@@ -28,7 +28,11 @@ class Book:
|
|||||||
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
|
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
|
||||||
"Verdana": "verdana,geneva,sans-serif"
|
"Verdana": "verdana,geneva,sans-serif"
|
||||||
}
|
}
|
||||||
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
|
|
||||||
|
SUPPPORTED_LEVELS = 3
|
||||||
|
SUPPORTED_HEADERS = {"h1", "h2", "h3"}
|
||||||
|
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
|
||||||
self.book_id = book_id
|
self.book_id = book_id
|
||||||
@@ -45,6 +49,9 @@ class Book:
|
|||||||
self.images = list()
|
self.images = list()
|
||||||
self.content_dict = dict()
|
self.content_dict = dict()
|
||||||
|
|
||||||
|
assert self.SUPPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
|
||||||
|
"Length of headers doesn't match allowd levels."
|
||||||
|
|
||||||
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
|
||||||
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
|
||||||
"""
|
"""
|
||||||
@@ -514,16 +521,16 @@ class Book:
|
|||||||
"""
|
"""
|
||||||
Function to convert all lower level headings to p tags
|
Function to convert all lower level headings to p tags
|
||||||
"""
|
"""
|
||||||
header_tags = self.body_tag.find_all(re.compile("^h[4-6]$"))
|
pattern = f'^h[{self.SUPPPORTED_LEVELS + 1}-9]$'
|
||||||
|
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
tag.name = 'p'
|
tag.name = 'p'
|
||||||
print(tag)
|
|
||||||
|
|
||||||
def _process_headings(self):
|
def _process_headings(self):
|
||||||
"""
|
"""
|
||||||
Function to process tags <h>.
|
Function to process tags <h>.
|
||||||
"""
|
"""
|
||||||
header_tags = self.body_tag.find_all(re.compile("^h[1-6]$"))
|
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
if tag.parent.name == "li":
|
if tag.parent.name == "li":
|
||||||
tag.parent.unwrap()
|
tag.parent.unwrap()
|
||||||
@@ -535,18 +542,14 @@ class Book:
|
|||||||
if title == "":
|
if title == "":
|
||||||
tag.unwrap()
|
tag.unwrap()
|
||||||
else:
|
else:
|
||||||
assert tag.name not in ["h4", "h5", "h6"], 'Preprocessing went wrong, there is still h4-h6 headings.'
|
assert tag.name in self.SUPPORTED_HEADERS, \
|
||||||
|
f'Preprocessing went wrong, there is still h{self.SUPPPORTED_LEVELS+1}-h9 headings.'
|
||||||
# if tag.name in ["h4", "h5", "h6"]:
|
# if tag.name in ["h4", "h5", "h6"]:
|
||||||
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
||||||
|
|
||||||
|
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
|
||||||
new_tag.string = title
|
new_tag.string = title
|
||||||
if new_tag.name == "p":
|
|
||||||
new_tag.attrs = tag.attrs
|
|
||||||
print(tag)
|
|
||||||
print(new_tag)
|
|
||||||
|
|
||||||
tag.replace_with(new_tag)
|
tag.replace_with(new_tag)
|
||||||
|
|
||||||
def write_html_from_list(self, file_name='url_test.html'):
|
def write_html_from_list(self, file_name='url_test.html'):
|
||||||
@@ -711,10 +714,6 @@ class Book:
|
|||||||
with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
|
with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
|
||||||
json.dump(self.content_dict, f, ensure_ascii=False)
|
json.dump(self.content_dict, f, ensure_ascii=False)
|
||||||
self.log('Data has been saved to .json file.')
|
self.log('Data has been saved to .json file.')
|
||||||
|
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
pprint(self.content_dict)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
self.log('Error has occurred while writing json file.', logging.ERROR)
|
self.log('Error has occurred while writing json file.', logging.ERROR)
|
||||||
# self.log_error_to_main_log()
|
# self.log_error_to_main_log()
|
||||||
|
|||||||
Reference in New Issue
Block a user