fix heading support

This commit is contained in:
Jeniamakarchik
2020-04-10 17:04:30 +03:00
parent 66fb1cb5d6
commit 5c56c8c2fa
2 changed files with 15 additions and 14 deletions

View File

@@ -13,5 +13,7 @@ COPY . /app/
WORKDIR /app/ WORKDIR /app/
VOLUME /app/logs VOLUME /app/logs
# VOLUME /app/html
# VOLUME /app/json
CMD python /app/src/util/check_packs.py && python /app/src/util/check_dirs.py && python /app/src/consumer.py CMD python /app/src/util/check_packs.py && python /app/src/util/check_dirs.py && python /app/src/consumer.py

View File

@@ -28,7 +28,11 @@ class Book:
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif", "Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
"Verdana": "verdana,geneva,sans-serif" "Verdana": "verdana,geneva,sans-serif"
} }
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
SUPPPORTED_LEVELS = 3
SUPPORTED_HEADERS = {"h1", "h2", "h3"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None): def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None):
self.book_id = book_id self.book_id = book_id
@@ -45,6 +49,9 @@ class Book:
self.images = list() self.images = list()
self.content_dict = dict() self.content_dict = dict()
assert self.SUPPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowd levels."
def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+', def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+',
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'): logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
""" """
@@ -514,16 +521,16 @@ class Book:
""" """
Function to convert all lower level headings to p tags Function to convert all lower level headings to p tags
""" """
header_tags = self.body_tag.find_all(re.compile("^h[4-6]$")) pattern = f'^h[{self.SUPPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags: for tag in header_tags:
tag.name = 'p' tag.name = 'p'
print(tag)
def _process_headings(self): def _process_headings(self):
""" """
Function to process tags <h>. Function to process tags <h>.
""" """
header_tags = self.body_tag.find_all(re.compile("^h[1-6]$")) header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
for tag in header_tags: for tag in header_tags:
if tag.parent.name == "li": if tag.parent.name == "li":
tag.parent.unwrap() tag.parent.unwrap()
@@ -535,18 +542,14 @@ class Book:
if title == "": if title == "":
tag.unwrap() tag.unwrap()
else: else:
assert tag.name not in ["h4", "h5", "h6"], 'Preprocessing went wrong, there is still h4-h6 headings.' assert tag.name in self.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{self.SUPPPORTED_LEVELS+1}-h9 headings.'
# if tag.name in ["h4", "h5", "h6"]: # if tag.name in ["h4", "h5", "h6"]:
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings # tag.name = "h3" # All the lower level headings will be transformed to h3 headings
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name) new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
new_tag.string = title new_tag.string = title
if new_tag.name == "p":
new_tag.attrs = tag.attrs
print(tag)
print(new_tag)
tag.replace_with(new_tag) tag.replace_with(new_tag)
def write_html_from_list(self, file_name='url_test.html'): def write_html_from_list(self, file_name='url_test.html'):
@@ -711,10 +714,6 @@ class Book:
with codecs.open(self.output_path, 'w', encoding='utf-8') as f: with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
json.dump(self.content_dict, f, ensure_ascii=False) json.dump(self.content_dict, f, ensure_ascii=False)
self.log('Data has been saved to .json file.') self.log('Data has been saved to .json file.')
from pprint import pprint
pprint(self.content_dict)
except Exception as exc: except Exception as exc:
self.log('Error has occurred while writing json file.', logging.ERROR) self.log('Error has occurred while writing json file.', logging.ERROR)
# self.log_error_to_main_log() # self.log_error_to_main_log()