From 5c56c8c2faae8312ad994d30f0f87b4494042c07 Mon Sep 17 00:00:00 2001 From: Jeniamakarchik Date: Fri, 10 Apr 2020 17:04:30 +0300 Subject: [PATCH] fix heading support --- Dockerfile | 2 ++ src/book.py | 27 +++++++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1f21b13..a8afd0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,5 +13,7 @@ COPY . /app/ WORKDIR /app/ VOLUME /app/logs +# VOLUME /app/html +# VOLUME /app/json CMD python /app/src/util/check_packs.py && python /app/src/util/check_dirs.py && python /app/src/consumer.py diff --git a/src/book.py b/src/book.py index 2a7d488..a003e20 100644 --- a/src/book.py +++ b/src/book.py @@ -28,7 +28,11 @@ class Book: "Trebuchet MS": "trebuchet ms,helvetica,sans-serif", "Verdana": "verdana,geneva,sans-serif" } - SUPPORTED_HEADERS = ["h1", "h2", "h3"] + + SUPPPORTED_LEVELS = 3 + SUPPORTED_HEADERS = {"h1", "h2", "h3"} + HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} + def __init__(self, book_id=0, access=None, file_path=None, output_path=None, main_logger=None): self.book_id = book_id @@ -45,6 +49,9 @@ class Book: self.images = list() self.content_dict = dict() + assert self.SUPPPORTED_LEVELS == len(self.SUPPORTED_HEADERS), \ + "Length of headers doesn't match allowd levels." + def configure_file_logger(self, name, attr_name='logger', filename='logs/book_log.log', filemode='w+', logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'): """ @@ -514,16 +521,16 @@ class Book: """ Function to convert all lower level headings to p tags """ - header_tags = self.body_tag.find_all(re.compile("^h[4-6]$")) + pattern = f'^h[{self.SUPPPORTED_LEVELS + 1}-9]$' + header_tags = self.body_tag.find_all(re.compile(pattern)) for tag in header_tags: tag.name = 'p' - print(tag) def _process_headings(self): """ Function to process tags . """ - header_tags = self.body_tag.find_all(re.compile("^h[1-6]$")) + header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) for tag in header_tags: if tag.parent.name == "li": tag.parent.unwrap() @@ -535,18 +542,14 @@ class Book: if title == "": tag.unwrap() else: - assert tag.name not in ["h4", "h5", "h6"], 'Preprocessing went wrong, there is still h4-h6 headings.' + assert tag.name in self.SUPPORTED_HEADERS, \ + f'Preprocessing went wrong, there is still h{self.SUPPPORTED_LEVELS+1}-h9 headings.' # if tag.name in ["h4", "h5", "h6"]: # tag.name = "h3" # All the lower level headings will be transformed to h3 headings new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name) new_tag.string = title - if new_tag.name == "p": - new_tag.attrs = tag.attrs - print(tag) - print(new_tag) - tag.replace_with(new_tag) def write_html_from_list(self, file_name='url_test.html'): @@ -711,10 +714,6 @@ class Book: with codecs.open(self.output_path, 'w', encoding='utf-8') as f: json.dump(self.content_dict, f, ensure_ascii=False) self.log('Data has been saved to .json file.') - - from pprint import pprint - - pprint(self.content_dict) except Exception as exc: self.log('Error has occurred while writing json file.', logging.ERROR) # self.log_error_to_main_log()