From c0ef0b6d6e67c0ce3c314948e67918194c8fd675 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 1 Jun 2022 16:23:53 +0300 Subject: [PATCH] Formatting --- src/access.py | 2 +- src/book_solver.py | 67 ++++--- src/data_objects.py | 2 +- src/docx_converter/html_docx_preprocessor.py | 189 +++++++++++++----- ...verter.py => libre_html2json_converter.py} | 39 +++- src/epub_converter/css_reader.py | 47 +++-- src/epub_converter/epub_solver.py | 20 +- src/epub_converter/html_epub_preprocessor.py | 120 +++++------ src/util/check_dirs.py | 1 + src/util/check_packs.py | 1 + src/util/color_reader.py | 1 - src/util/helpers.py | 8 +- src/util/rgb2closest_color.py | 6 +- 13 files changed, 318 insertions(+), 185 deletions(-) rename src/docx_converter/{libra_html2json_converter.py => libre_html2json_converter.py} (85%) diff --git a/src/access.py b/src/access.py index a906e90..4367c33 100644 --- a/src/access.py +++ b/src/access.py @@ -201,4 +201,4 @@ class Access: pass else: raise Exception( - f'{response.status_code} Bad request: {response.json()["message"]}.') \ No newline at end of file + f'{response.status_code} Bad request: {response.json()["message"]}.') diff --git a/src/book_solver.py b/src/book_solver.py index db14faa..4176280 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -29,12 +29,13 @@ class BookSolver: self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', book_id=book_id, main_logger=main_logger) - self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id) + self.status_wrapper = BookStatusWrapper( + access, self.logger_object, book_id) assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." - def save_book_file(self, content): + def save_book_file(self, content: str): """ Function saves binary content of file to .docx/.epub Parameters @@ -43,17 +44,21 @@ class BookSolver: binary content of the file """ - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - folder_path = os.path.join(folder_path, f'{self.book_type}/{self.book_id}') + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + folder_path = os.path.join( + folder_path, f'{self.book_type}/{self.book_id}') pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) - file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}') + file_path = os.path.join( + folder_path, f'{self.book_id}.{self.book_type}') try: with open(file_path, 'wb+') as file: file.write(content) self.logger_object.log(f'File was saved to folder: {folder_path}.') except Exception as exc: - self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR) + self.logger_object.log( + f"Error in writing {self.book_type} file.", logging.ERROR) self.logger_object.log_error_to_main_log() raise exc @@ -62,12 +67,14 @@ class BookSolver: def get_book_file(self): """Method for getting and saving book from server""" try: - self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') + self.logger_object.log(f'Start receiving file from server. URL:' + f' {self.access.url}/doc-convert/{self.book_id}/file') content = self.access.get_book(self.book_id) self.logger_object.log('File was received from server.') self.save_book_file(content) except FileNotFoundError as f_err: - self.logger_object.log("Can't get file from server.", logging.ERROR) + self.logger_object.log( + "Can't get file from server.", logging.ERROR) self.logger_object.log_error_to_main_log() raise f_err except Exception as exc: @@ -75,14 +82,17 @@ class BookSolver: def check_output_directory(self): if self.output_path is None: - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - output_path = os.path.join(folder_path, f'json/{self.book_id}.json') + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + output_path = os.path.join( + folder_path, f'json/{self.book_id}.json') self.output_path = output_path self.output_path = pathlib.Path(self.output_path) self.logger_object.log(f'Output file path: {self.output_path}') - pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True) + pathlib.Path(self.output_path).parent.mkdir( + parents=True, exist_ok=True) self.output_path.touch(exist_ok=True) def write_to_json(self, content: dict): @@ -90,9 +100,11 @@ class BookSolver: try: with codecs.open(self.output_path, 'w', encoding='utf-8') as f: json.dump(content, f, ensure_ascii=False) - self.logger_object.log(f'Data has been saved to .json file: {self.output_path}') + self.logger_object.log( + f'Data has been saved to .json file: {self.output_path}') except Exception as exc: - self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR) + self.logger_object.log( + 'Error has occurred while writing .json file.' + str(exc), logging.ERROR) def send_json_content_to_server(self, content: dict): """Function sends json_content to site""" @@ -100,14 +112,15 @@ class BookSolver: self.access.send_book(self.book_id, content) self.logger_object.log(f'JSON data has been sent to server.') except Exception as exc: - self.logger_object.log('Error has occurred while sending json content.', logging.ERROR) + self.logger_object.log( + 'Error has occurred while sending json content.', logging.ERROR) self.logger_object.log_error_to_main_log() self.status_wrapper.set_error() raise exc @abstractmethod def get_converted_book(self): - self.logger_object.log('Beginning of processing json output.') + self.logger_object.log('Beginning of processing .json output.') self.status_wrapper.set_generating() return {} @@ -119,21 +132,24 @@ class BookSolver: """ try: - self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.') + self.logger_object.log( + f'Beginning of conversion from .{self.book_type} to .json.') self.get_book_file() self.status_wrapper.set_processing() content_dict = self.get_converted_book() + self.status_wrapper.set_generating() self.write_to_json(content_dict) self.send_json_content_to_server(content_dict) - self.logger_object.log(f'End of the conversion to LiveCarta format. Check {self.output_path}.') - + self.logger_object.log( + f'End of the conversion to LiveCarta format. Check {self.output_path}.') except Exception as exc: self.status_wrapper.set_error() - self.logger_object.log('Error has occurred while conversion.', logging.ERROR) + self.logger_object.log( + 'Error has occurred while conversion.', logging.ERROR) self.logger_object.log_error_to_main_log(str(exc)) raise exc - def conversion_local(self, file_name: str): + def conversion_local(self, file_path: str): """ Function - without downloading book from server (local) @@ -141,13 +157,16 @@ class BookSolver: """ try: - self.logger_object.log(f'Data has been downloaded from {file_name}.json file: ..\converter\json') + self.logger_object.log( + f'Data has been downloaded from {file_path} file') self.status_wrapper.set_processing() - with codecs.open(f'json/{file_name}.json', 'r', encoding='utf-8') as f_json: + with codecs.open(file_path, 'r', encoding='utf-8') as f_json: content_dict = json.load(f_json) + self.status_wrapper.set_generating() self.send_json_content_to_server(content_dict) self.logger_object.log(f'Sent a file to server. Check LiveCarta.') except Exception as exc: self.status_wrapper.set_error() - self.logger_object.log('Error has occurred while reading json file.' + str(exc), logging.ERROR) - self.logger_object.log_error_to_main_log(str(exc)) \ No newline at end of file + self.logger_object.log( + 'Error has occurred while reading json file.' + str(exc), logging.ERROR) + self.logger_object.log_error_to_main_log(str(exc)) diff --git a/src/data_objects.py b/src/data_objects.py index ac04284..110db8d 100644 --- a/src/data_objects.py +++ b/src/data_objects.py @@ -88,4 +88,4 @@ class ChapterItem: } def __str__(self): - return '' % self.title \ No newline at end of file + return '' % self.title diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index a8e76a2..b2b89a1 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -21,13 +21,22 @@ class HTMLDocxPreprocessor: self.top_level_headers = None self.content = list() - def _clean_tag(self, tag, attr_name, attr_value): + def _clean_tag(self, tag: str, attr_name: str, attr_value: re): """ Function to clean tags by its name and attribute value. + Parameters + ---------- + tag: str + tag name to clean + attr_name: str + attribute name + attr_value: [str,re] + attribute value + + Returns + ------- + clean tag - :param tag: Tag name to clean. - :param attr_name: Attribute name. - :param attr_value: Attribute value. """ tags = self.body_tag.find_all(tag, {attr_name: attr_value}) for tag in tags: @@ -56,12 +65,19 @@ class HTMLDocxPreprocessor: return value @classmethod - def convert_font_pt_to_px(cls, style): + def convert_font_pt_to_px(cls, style: str) -> str: """ - Method converts point in the font-size to pixels. + Function converts point in the font-size to pixels. + Parameters + ---------- + style: str + str with style to proces + + Returns + ------- + : str + str with converted style - :param style: Str with style to process. - :return: Str with converted style. """ size = re.search(r"font-size: (\d{1,3})pt", style) @@ -77,7 +93,10 @@ class HTMLDocxPreprocessor: return re.sub(size + "pt", str(new_size) + "px", style) def _font_to_span(self): - """Function to convert tag to . If font style is default, then remove this tag.""" + """ + Function to convert tag to . + If font style is default, then remove this tag. + """ fonts = self.body_tag.find_all("font") for font in fonts: face = font.get("face") @@ -105,7 +124,8 @@ class HTMLDocxPreprocessor: if len(font.attrs) == 0: font.unwrap() - assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more tags + # on this step there should be no more tags + assert len(self.body_tag.find_all("font")) == 0 def delete_content_before_toc(self): # remove all tag upper the only in content !!! body tag is not updated @@ -116,11 +136,15 @@ class HTMLDocxPreprocessor: def clean_trash(self): """Function to remove all styles and tags we don't need.""" - self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$')) - self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages - self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$')) + self._clean_tag('span', 'style', re.compile( + r'^background: #[0-9a-fA-F]{6}$')) + # todo: check for another languages + self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) + self._clean_tag('span', 'style', re.compile( + '^letter-spacing: -?[\d\.]+pt$')) - self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$')) + self._clean_tag('font', 'face', re.compile( + r'^Times New Roman[\w, ]+$')) self._clean_tag("a", "name", "_GoBack") self._clean_underline_links() @@ -128,7 +152,8 @@ class HTMLDocxPreprocessor: self._font_to_span() # replace toc with empty tag - tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) + tables = self.body_tag.find_all( + "div", id=re.compile(r'^Table of Contents\d+')) for table in tables: table.wrap(self.html_soup.new_tag("TOC")) table.decompose() @@ -138,7 +163,7 @@ class HTMLDocxPreprocessor: paragraphs = self.body_tag.find_all('p') for p in paragraphs: - # libra converts some \n into

with 2
+ # libre converts some \n into

with 2
# there we remove 1 unnecessary
brs = p.find_all('br') text = p.text @@ -156,9 +181,11 @@ class HTMLDocxPreprocessor: if style: indent = re.search(r'text-indent: ([\d\.]{1,4})in', style) margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style) - margin_right = re.search(r'margin-right: ([\d\.]{1,4})in', style) + margin_right = re.search( + r'margin-right: ([\d\.]{1,4})in', style) margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style) - margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style) + margin_bottom = re.search( + r'margin-bottom: ([\d\.]{1,4})in', style) else: indent = None margin_left = None @@ -195,6 +222,7 @@ class HTMLDocxPreprocessor: def _process_tables(self): """Function to process tables. Set "border" attribute.""" + tables = self.body_tag.find_all("table") for table in tables: tds = table.find_all("td") @@ -258,21 +286,24 @@ class HTMLDocxPreprocessor: for x in has_i_tag_or_br] if all(has_i_tag_or_br) and is_zero_border: - new_div = BeautifulSoup(features='lxml').new_tag('blockquote') + new_div = BeautifulSoup( + features='lxml').new_tag('blockquote') for p in paragraphs: new_div.append(p) table.replaceWith(new_div) def _process_hrefs(self): - a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^.*http.+')}) + a_tags_with_href = self.body_tag.find_all( + 'a', {'href': re.compile('^.*http.+')}) # remove char=end of file for some editors for tag in a_tags_with_href: tag.string = tag.text.replace('\u200c', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') - a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')}) + a_tags_with_href = self.body_tag.find_all( + 'a', {'href': re.compile('^(?!#sdfootnote)')}) for tag in a_tags_with_href: tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace('\u200b', '') # zero-width-space @@ -286,23 +317,25 @@ class HTMLDocxPreprocessor: def _process_footnotes(self): """Function returns list of footnotes and delete them from html_soup.""" footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') - footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) + footnote_content = self.body_tag.find_all( + 'div', id=re.compile(r'^sdfootnote\d+$')) footnote_amt = len(footnote_anchors) assert footnote_amt == len(footnote_content), \ - 'Something went wrong with footnotes after libra conversion' + 'Something went wrong with footnotes after libre conversion' footnotes = [] for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): - true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0] + true_a_tag = cont_tag.find_all( + 'a', class_=re.compile(r'^sdfootnote.+$'))[0] if true_a_tag.attrs.get('href') is None: cont_tag.a.decompose() continue assert anc_tag['name'] == true_a_tag['href'][1:], \ - 'Something went wrong with footnotes after libra conversion' + 'Something went wrong with footnotes after libre conversion' new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' @@ -355,8 +388,10 @@ class HTMLDocxPreprocessor: if len(img_tags): if access is None: - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/')) + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f'json/img_{book_id}/')) new_path.mkdir(exist_ok=True) for img in img_tags: @@ -370,10 +405,12 @@ class HTMLDocxPreprocessor: if access is not None: link = access.send_image(img_path, doc_id=book_id) img.attrs['src'] = link - self.logger_object.log(f'{img_name} successfully uploaded.') + self.logger_object.log( + f'{img_name} successfully uploaded.') else: img_size = os.path.getsize(img_path) - self.logger_object.log(f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG) + self.logger_object.log( + f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG) new_img_path = new_path / img_name copyfile(img_path, new_img_path) img.attrs["src"] = str(new_img_path) @@ -408,7 +445,8 @@ class HTMLDocxPreprocessor: def _process_toc_links(self): """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" - toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')}) + toc_links = self.body_tag.find_all( + "a", {'name': re.compile(r'^_Toc\d+')}) headers = [link.parent for link in toc_links] outline_level = "1" # All the unknown outlines will be predicted as

for tag in headers: @@ -418,7 +456,8 @@ class HTMLDocxPreprocessor: elif tag.name == "p": exist_in_toc = self._check_parent_link_exist_in_toc(tag) if tag in self.body_tag.find_all("p") and exist_in_toc: - new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level) + new_tag = BeautifulSoup( + features="lxml").new_tag("h" + outline_level) text = tag.text tag.replaceWith(new_tag) new_tag.string = text @@ -440,14 +479,16 @@ class HTMLDocxPreprocessor: @staticmethod def clean_tag_from_tabs(tag: NavigableString): cleaned = re.sub(r'(\s+)+', ' ', tag) - this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) tag.replace_with(this) # print('input: ', repr(tag)) # print('test: ', repr(cleaned)) def clean_tag_from_numbering(self, tag): cleaned = self.clean_title_from_numbering(tag) - this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) tag.replace_with(this) # print('input: ', repr(tag)) # print('test: ', repr(cleaned)) @@ -484,7 +525,8 @@ class HTMLDocxPreprocessor: """ headers_info = [] header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) - headers_outline = [int(re.sub(r"^h", "", tag.name)) for tag in header_tags] + headers_outline = [int(re.sub(r"^h", "", tag.name)) + for tag in header_tags] if headers_outline: top_level_outline = min(headers_outline) top_level_headers = [tag for tag in header_tags @@ -518,13 +560,17 @@ class HTMLDocxPreprocessor: Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header - 2. it is first header from the top level list and it equals to 'introduction' + 2. it is first header from the top level list and it equals to 'introductio + Returns + ------- + None + mark each top-level header with flag should_be_numbered = true/false - Result : - Mark each top-level header with flag should_be_numbered = true/false """ - is_numbered_header = [header['is_numbered'] for header in self.top_level_headers] - is_title = [header['is_introduction'] for header in self.top_level_headers] + is_numbered_header = [header['is_numbered'] + for header in self.top_level_headers] + is_title = [header['is_introduction'] + for header in self.top_level_headers] first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 second_is_numbered_or_not_exist = all(is_numbered_header[1:2]) @@ -539,7 +585,19 @@ class HTMLDocxPreprocessor: self.top_level_headers[i]['should_be_numbered'] = True def _process_headings(self): - """Function to process tags .""" + """ + Function to process tags . + Steps + ---------- + 1. remove , + 2. clean text in header from numbering and \n + + Returns + ------- + None + processed tags + + """ header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) # 1. remove , @@ -581,36 +639,52 @@ class HTMLDocxPreprocessor: for i, item in enumerate(content): if type(content[i]) is NavigableString: cleaned = re.sub(r'(\s+)+', ' ', content[i]) - this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) content[i].replace_with(this) content[i] = this else: - self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs) + self.apply_func_to_last_child( + content[i], self.clean_tag_from_tabs) content[0] = '' if content[0] == ' ' else content[0] content = [item for item in content if item != ''] if type(content[0]) is NavigableString: cleaned = self.clean_title_from_numbering(content[0]) - this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) content[0].replace_with(this) content[0] = this else: - self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering) + self.apply_func_to_last_child( + content[0], self.clean_tag_from_numbering) def _process_lists(self): """ - Function to process tags
  • . - Unwrap

    tags. + Function + - process tags

  • . + - unwrap

    tags. + Parameters + ---------- + body_tag: Tag, soup object + + Returns + ------- + None + uwrap

    tag with li + """ + li_tags = self.body_tag.find_all("li") - for il_tag in li_tags: - il_tag.attrs.update(il_tag.p.attrs) - il_tag.p.unwrap() + for li_tag in li_tags: + li_tag.attrs.update(li_tag.p.attrs) + li_tag.p.unwrap() - def process_html(self, access, html_path, book_id): + def process_html(self, access=None, html_path='', book_id='local'): """Process html code to satisfy LiveCarta formatting.""" + self.logger_object.log('Beginning of processing .html file.') try: self.logger_object.log(f'Processing TOC and headers.') self._process_toc_links() @@ -628,18 +702,22 @@ class HTMLDocxPreprocessor: self.logger_object.log('Tables processing.') self._process_tables() - self.logger_object.log(f'{self.tables_amount} tables have been processed.') + self.logger_object.log( + f'{self.tables_amount} tables have been processed.') self.logger_object.log('Hrefs processing.') self._process_hrefs() self.logger_object.log('Footnotes processing.') self._process_footnotes() - self.logger_object.log(f'{len(self.footnotes)} footnotes have been processed.') + self.logger_object.log( + f'{len(self.footnotes)} footnotes have been processed.') self.logger_object.log('Image processing.') - self._process_images(access=access, html_path=html_path, book_id=book_id) - self.logger_object.log(f'{len(self.images)} images have been processed.') + self._process_images( + access=access, html_path=html_path, book_id=book_id) + self.logger_object.log( + f'{len(self.images)} images have been processed.') self._process_footer() self._process_div() @@ -658,7 +736,8 @@ class HTMLDocxPreprocessor: self.delete_content_before_toc() except Exception as exc: - self.logger_object.log('Error has occurred while processing html.', logging.ERROR) + self.logger_object.log( + 'Error has occurred while processing html.', logging.ERROR) self.logger_object.log_error_to_main_log() if self.status_wrapper: self.status_wrapper.set_error() diff --git a/src/docx_converter/libra_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py similarity index 85% rename from src/docx_converter/libra_html2json_converter.py rename to src/docx_converter/libre_html2json_converter.py index 9366b60..45522da 100644 --- a/src/docx_converter/libra_html2json_converter.py +++ b/src/docx_converter/libre_html2json_converter.py @@ -5,7 +5,7 @@ from copy import copy from src.livecarta_config import LiveCartaConfig -class LibraHTML2JSONConverter: +class LibreHTML2JSONConverter: def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None): self.content_dict = None self.content = content @@ -15,12 +15,19 @@ class LibraHTML2JSONConverter: self.book_api_status = book_api_status @staticmethod - def format_html(html_text): + def format_html(html_text: str) -> str: """ Function to remove useless symbols from html code. + Parameters + ---------- + html_text: str + text to process. + + Returns + ------- + new_text: str + cleaned text - :param html_text: Text to process. - :return: Cleaned text. """ new_text = re.sub(r'([\n\t])', ' ', html_text) return new_text @@ -29,8 +36,15 @@ class LibraHTML2JSONConverter: def header_to_livecarta_chapter_item(self, ind) -> (dict, int): """ Function process header and collects all content for it. + Parameters + ---------- + ind: int + index of header in content list. + + Returns + ------- + result, ind - :param ind: Index of header in content list. """ if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: title = str(self.content[ind]) @@ -38,7 +52,8 @@ class LibraHTML2JSONConverter: title = title.replace(f'', '') title = re.sub(r'^\n', '', title) - curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag + # extract outline from tag + curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) result = { 'title': f'{title}', 'contents': [], @@ -53,7 +68,8 @@ class LibraHTML2JSONConverter: outline = int(re.sub(r"^h", "", self.content[ind].name)) # - recursion step until h_i > h_initial if outline > curr_outline: - header_dict, ind = self.header_to_livecarta_chapter_item(ind) + header_dict, ind = self.header_to_livecarta_chapter_item( + ind) if ch_content: result['contents'].append("".join(ch_content)) ch_content = [] @@ -108,7 +124,8 @@ class LibraHTML2JSONConverter: chapter = [] while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: if not self._is_empty_p_tag(self.content[ind]): - chapter.append(self.format_html(str(self.content[ind]))) + chapter.append(self.format_html( + str(self.content[ind]))) ind += 1 if chapter: res = { @@ -121,9 +138,11 @@ class LibraHTML2JSONConverter: if res: json_strc.append(res) ch_amt += 1 - self.logger_object.log(f'Chapter {ch_amt} has been added to structure.') + self.logger_object.log( + f'Chapter {ch_amt} has been added to structure.') except Exception as exc: - self.logger_object.log('Error has occurred while making json structure.', logging.ERROR) + self.logger_object.log( + 'Error has occurred while making json structure.', logging.ERROR) self.logger_object.log_error_to_main_log() if self.book_api_status: self.book_api_status.set_error() diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index 6665725..7e768b8 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -14,21 +14,23 @@ from src.livecarta_config import LiveCartaConfig cssutils.log.setLevel(CRITICAL) -sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, - 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, - 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] +sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, + 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, + 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, + 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] -sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px', - '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', - '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', - '48px', '49px', '50px', '64px', '72px'] +sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', + '17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px', + '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', + '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', + '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] list_types = ['circle', 'disc', 'armenian', 'decimal', 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] -def convert_tag_values(value: str) -> str: +def convert_tag_style_values(value: str) -> str: """ Function - converts values of tags from em/%/pt to px @@ -42,8 +44,8 @@ def convert_tag_values(value: str) -> str: value: str """ - def find_closest_size(value): - possible_sizes = list(takewhile(lambda x: value > x, sizes_pr)) + def find_closest_size(size_value): + possible_sizes = list(takewhile(lambda x: size_value > x, sizes_pr)) last_possible_size_index = sizes_pr.index(possible_sizes[-1]) return sizes_px[last_possible_size_index] @@ -122,12 +124,13 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou to suit livecarta style convention. """ LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': convert_tag_values, + 'text-indent': convert_tag_style_values, 'font-variant': lambda x: x, 'text-align': lambda x: x, 'font': lambda x: '', - 'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()), - 'font-size': convert_tag_values, + 'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or + LiveCartaConfig.font_correspondence_table.get(x.capitalize()), + 'font-size': convert_tag_style_values, 'color': get_text_color, 'background-color': get_bg_color, 'background': get_bg_color, @@ -140,9 +143,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { 'border-bottom': lambda x: x if x != '0' else '', 'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-image': lambda x: 'disc', - 'margin-left': convert_tag_values, - 'margin-top': convert_tag_values, - 'margin': convert_tag_values, + 'margin-left': convert_tag_style_values, + 'margin-top': convert_tag_style_values, + 'margin': convert_tag_style_values, } """ @@ -269,10 +272,10 @@ class TagStyleConverter: item = item.split(':') if item[0] in ['text-indent', 'margin-left', 'margin']: if len(item[1].split(' ')) == 3: - item[1] = convert_tag_values(item[1].split( + item[1] = convert_tag_style_values(item[1].split( ' ')[-2]) # split returns middle value else: - item[1] = convert_tag_values(item[1].split( + item[1] = convert_tag_style_values(item[1].split( ' ')[-1]) # split returns last value clean_style += item[0] + ': ' + item[1] + '; ' @@ -343,7 +346,8 @@ class TagStyleConverter: split_inline_style: dict = remove_extra_spaces(inline_style) - # repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css + # repetition check - if the tag had already had inline style + # that isn't in the css styles, add this to style parsed from css repeat_styles = list(set(split_ultimate_style.keys()) & set(split_inline_style.keys())) @@ -409,7 +413,8 @@ class TagStyleConverter: if has_p_style_attrs: p_style += item + ';' initial_style = initial_style.replace(item + ';', '') - # here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top' + # here check that this style i exactly the same. + # Not 'align' when we have 'text-align', or 'border' when we have 'border-top' styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( '-' + attr not in initial_style) for attr in styles_cant_be_in_p] if any(styles_to_be_saved_in_span): @@ -549,4 +554,4 @@ if __name__ == '__main__': 'pr01s05.xhtml').get_body_content().decode() html_soup = BeautifulSoup(html_, features='lxml') - print(convert_html_soup_with_css_style(html_soup, css_cleaned)) \ No newline at end of file + print(convert_html_soup_with_css_style(html_soup, css_cleaned)) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 6ec8c53..cb6e080 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,6 +1,7 @@ from src.book_solver import BookSolver from src.epub_converter.epub_converter import EpubConverter + class EpubBook(BookSolver): """Class of .epub type book - child of BookSolver""" @@ -10,10 +11,19 @@ class EpubBook(BookSolver): def get_converted_book(self): """ - 1. Convert epub to html - 2. Parse from line structure to nested structure + Function + Steps + ---------- + 1. Converts .epub to .html + 2. Parses from line structure to nested structure + + Returns + ---------- + content_dict + json for LiveCarta platform + """ - json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object) + json_converter = EpubConverter( + self.file_path, access=self.access, logger=self.logger_object) content_dict = json_converter.convert_to_dict() - self.status_wrapper.set_generating() - return content_dict \ No newline at end of file + return content_dict diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index ff2af37..065481f 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -71,7 +71,7 @@ def update_images_src_links(body_tag: BeautifulSoup, return path2aws_path -def preprocess_table(body_tag: BeautifulSoup): +def _preprocess_table(body_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" tables = body_tag.find_all("table") for table in tables: @@ -99,7 +99,7 @@ def preprocess_table(body_tag: BeautifulSoup): table.attrs['border'] = '1' -def process_lists(body_tag: BeautifulSoup): +def _process_lists(body_tag: BeautifulSoup): """ Function - process tags

  • . @@ -121,7 +121,7 @@ def process_lists(body_tag: BeautifulSoup): li_tag.p.unwrap() -def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): +def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): """Function inserts span before tag aren't supported by livecarta""" new_tag = main_tag.new_tag("span") new_tag.attrs['id'] = id_ or '' @@ -130,21 +130,21 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): tag.insert_before(new_tag) -def clean_headings_content(content: BeautifulSoup, title: str): +def _clean_headings_content(content: BeautifulSoup, title: str): def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): if tag_to_be_removed.attrs.get('id'): - insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=tag_to_be_removed.attrs.get( - 'id'), - class_=tag_to_be_removed.attrs.get('class')) + _insert_span_with_attrs_before_tag(body_tag, + tag_to_be_removed, + id_=tag_to_be_removed.attrs.get( + 'id'), + class_=tag_to_be_removed.attrs.get('class')) for sub_tag in tag_to_be_removed.find_all(): if sub_tag.attrs.get('id'): - insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=sub_tag.attrs['id'], - class_=sub_tag.attrs.get('class')) + _insert_span_with_attrs_before_tag(body_tag, + tag_to_be_removed, + id_=sub_tag.attrs['id'], + class_=sub_tag.attrs.get('class')) title = title.lower() for child in content.contents: @@ -165,7 +165,7 @@ def clean_headings_content(content: BeautifulSoup, title: str): break -def heading_tag_to_p_tag(body_tag): +def _heading_tag_to_p_tag(body_tag): """Function to convert all lower level headings to p tags""" pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = body_tag.find_all(re.compile(pattern)) @@ -173,7 +173,7 @@ def heading_tag_to_p_tag(body_tag): tag.name = 'p' -def clean_title_from_numbering(title: str): +def _clean_title_from_numbering(title: str): """Function removes numbering from titles""" title = re.sub(r'^(\s+)+', '', title) # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title @@ -182,7 +182,7 @@ def clean_title_from_numbering(title: str): return title -def replace_with_livecarta_anchor_tag(anchor, i): +def _replace_with_livecarta_anchor_tag(anchor, i): """Function replace noteref_tag(anchor) with new livecarta tag""" new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' @@ -257,7 +257,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': footnote_tag = footnote_tag.parent new_noterefs_tags.append( - replace_with_livecarta_anchor_tag(noteref_tag, i)) + _replace_with_livecarta_anchor_tag(noteref_tag, i)) content = footnote_tag.text # footnote_tag.decompose() footnotes.append(content) @@ -292,7 +292,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): """ - def preserve_class_in_aside_tag(tag_): + def _preserve_class_in_aside_tag(tag_): """to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" # this is for Wiley books with boxes tag_class = tag_.attrs['class'] if not isinstance( @@ -301,7 +301,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): if not tag_.parent.attrs.get('class'): tag_.parent.attrs['class'] = tag_class - def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: + def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: """ Function saves css style inherited from class, copies class to child

    returns True, if

    could be unwrapped @@ -332,13 +332,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): else: return True - def add_span_to_save_ids_for_links(tag_to_be_removed): + def _add_span_to_save_ids_for_links(tag_to_be_removed): if tag_to_be_removed.attrs.get('id'): - insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, - id_=tag_to_be_removed.attrs['id'], - class_=tag_to_be_removed.attrs.get('class')) + _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, + id_=tag_to_be_removed.attrs['id'], + class_=tag_to_be_removed.attrs.get('class')) - def replace_div_tag_with_table(): + def _replace_div_tag_with_table(): """ Function replace
    with : 1. Convert div with certain classes to tables @@ -350,11 +350,11 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): div_class = div.attrs['class'] if not isinstance( div.attrs['class'], list) else div.attrs['class'][0] if div_class in ['C409', 'C409a']: - wrap_block_tag_with_table( + _wrap_block_tag_with_table( body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9') elif div_class in ['C441', 'C816']: - wrap_block_tag_with_table( + _wrap_block_tag_with_table( body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8') if div.attrs.get('style'): @@ -363,7 +363,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): 'background-color') + len('background-color') start_index_of_color = end_index + 2 bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7] - wrap_block_tag_with_table( + _wrap_block_tag_with_table( body_tag, old_tag=div, width='100', border='', bg_color=bg_color) elif div.attrs.get('style') == '': del div.attrs['style'] @@ -379,7 +379,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): if all(is_not_struct_tag): div.name = 'p' continue - add_span_to_save_ids_for_links(div) + _add_span_to_save_ids_for_links(div) div.unwrap() # comments removal @@ -387,18 +387,18 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): for element in tag(text=lambda text: isinstance(text, Comment)): element.extract() - replace_div_tag_with_table() + _replace_div_tag_with_table() for s in body_tag.find_all("section"): could_be_unwrapped = True if s.attrs.get('class'): - could_be_unwrapped = preserve_class_in_section_tag(s) - add_span_to_save_ids_for_links(s) + could_be_unwrapped = _preserve_class_in_section_tag(s) + _add_span_to_save_ids_for_links(s) if could_be_unwrapped: s.unwrap() for s in body_tag.find_all("article"): - add_span_to_save_ids_for_links(s) + _add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("figure"): @@ -407,22 +407,22 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): s.attrs['style'] = "text-align: center;" for s in body_tag.find_all("figcaption"): - add_span_to_save_ids_for_links(s) + _add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("aside"): s.name = 'blockquote' for s in body_tag.find_all("main"): - add_span_to_save_ids_for_links(s) + _add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("body"): - add_span_to_save_ids_for_links(s) + _add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("html"): - add_span_to_save_ids_for_links(s) + _add_span_to_save_ids_for_links(s) s.unwrap() for s in body_tag.find_all("header"): @@ -442,7 +442,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): assert all( parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' - heading_tag_to_p_tag(body_tag) + _heading_tag_to_p_tag(body_tag) # wrap NavigableString with

    for node in body_tag: @@ -500,7 +500,7 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu return tags -def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): +def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): """Function wraps with

    """ table = main_tag.new_tag("table") table.attrs['border'] = border @@ -520,7 +520,7 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co return table -def clean_wiley_block(block): +def _clean_wiley_block(block): hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) for hr in hrs: hr.extract() @@ -530,30 +530,30 @@ def clean_wiley_block(block): h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) -def preprocess_block_tags(chapter_tag): +def _preprocess_block_tags(chapter_tag): """Function preprocessing tags""" for block in chapter_tag.find_all("blockquote"): if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: - clean_wiley_block(block) + _clean_wiley_block(block) color = '#DDDDDD' if block.attrs.get( 'class') == 'feature1' else None color = '#EEEEEE' if block.attrs.get( 'class') == 'feature2' else color - wrap_block_tag_with_table(chapter_tag, block, bg_color=color) + _wrap_block_tag_with_table(chapter_tag, block, bg_color=color) block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) block.unwrap() for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): - clean_wiley_block(future_block) + _clean_wiley_block(future_block) color = '#DDDDDD' if future_block.attrs.get( 'class') == 'feature1' else None color = '#EEEEEE' if future_block.attrs.get( 'class') == 'feature2' else color - wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) + _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) -def prepare_formatted(text: str) -> str: +def _prepare_formatted(text: str) -> str: """Function replaces special symbols with their Unicode representation""" text = text.replace("<", "\x3C") text = text.replace(">", "\x3E") @@ -563,7 +563,7 @@ def prepare_formatted(text: str) -> str: return text -def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag: +def _wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag: """Function wraps with
    """ table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag( "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") @@ -577,7 +577,7 @@ def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag: return table -def preprocess_pre_tags(chapter_tag: BeautifulSoup): +def _preprocess_pre_tags(chapter_tag: BeautifulSoup): """ Function preprocessing
     tags
         Parameters
    @@ -601,7 +601,7 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
             for child in copy_contents:
                 # Navigable String
                 if isinstance(child, NavigableString):
    -                cleaned_text = prepare_formatted(str(child))
    +                cleaned_text = _prepare_formatted(str(child))
                     sub_strings = re.split('\r\n|\n|\r', cleaned_text)
                     for string in sub_strings[:-1]:
                         new_tag.append(NavigableString(string))
    @@ -612,24 +612,24 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
                 else:
                     for sub_child in child.children:
                         if isinstance(sub_child, NavigableString):
    -                        cleaned_text = prepare_formatted(str(sub_child))
    +                        cleaned_text = _prepare_formatted(str(sub_child))
                             sub_child.replace_with(NavigableString(cleaned_text))
                         else:
    -                        sub_child.string = prepare_formatted(sub_child.text)
    +                        sub_child.string = _prepare_formatted(sub_child.text)
                     cleaned_tag = child.extract()
                     new_tag.append(cleaned_tag)
                     if to_add_br:
                         new_tag.append(BeautifulSoup(
                             features='lxml').new_tag('br'))
             pre.replace_with(new_tag)
    -        table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
    +        table = _wrap_preformatted_span_with_table(chapter_tag, new_tag)
             # add 

    to save brs p_for_br = chapter_tag.new_tag("p") p_for_br.string = "\xa0" table.insert_after(p_for_br) -def preprocess_code_tags(chapter_tag: BeautifulSoup): +def _preprocess_code_tags(chapter_tag: BeautifulSoup): """ Function - transform , , tags into span @@ -658,7 +658,7 @@ def prepare_title(title_of_chapter: str) -> str: title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r' +', ' ', title_str).rstrip() - title_str = clean_title_from_numbering(title_str) + title_str = _clean_title_from_numbering(title_str) return title_str @@ -696,18 +696,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro # 2. heading removal if remove_title_from_chapter: - clean_headings_content(content_tag, title_str) + _clean_headings_content(content_tag, title_str) # 3. processing tags (

  • ,
  • , ,
    , )
    -    process_lists(content_tag)
    -    preprocess_table(content_tag)
    -    preprocess_code_tags(content_tag)
    -    preprocess_pre_tags(content_tag)
    -    preprocess_block_tags(content_tag)
    +    _process_lists(content_tag)
    +    _preprocess_table(content_tag)
    +    _preprocess_code_tags(content_tag)
    +    _preprocess_pre_tags(content_tag)
    +    _preprocess_block_tags(content_tag)
     
         # 4. class removal
         for tag in content_tag.find_all(recursive=True):
             if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                     'footnote-element']):
                 del tag.attrs['class']
    -    return str(content_tag)
    \ No newline at end of file
    +    return str(content_tag)
    diff --git a/src/util/check_dirs.py b/src/util/check_dirs.py
    index 1348865..542763d 100644
    --- a/src/util/check_dirs.py
    +++ b/src/util/check_dirs.py
    @@ -1,6 +1,7 @@
     import os
     import argparse
     
    +
     def parse_args():
         parser = argparse.ArgumentParser(description="Utility for folders's clean up.")
         parser.add_argument('-f', '--folders', type=str, nargs='*', help='Names of the folders to be cleaned.')
    diff --git a/src/util/check_packs.py b/src/util/check_packs.py
    index a0f1f6c..1b2f2e9 100644
    --- a/src/util/check_packs.py
    +++ b/src/util/check_packs.py
    @@ -3,6 +3,7 @@ import sys
     import argparse
     import subprocess
     
    +
     def parse_args():
         parser = argparse.ArgumentParser(description="Utility for checking installed packages.")
         parser.add_argument('-p', '--packages', type=str, nargs='*', help='Names of the packages.')
    diff --git a/src/util/color_reader.py b/src/util/color_reader.py
    index 08db998..706da61 100644
    --- a/src/util/color_reader.py
    +++ b/src/util/color_reader.py
    @@ -4,7 +4,6 @@ from colorsys import hls_to_rgb
     from webcolors import html4_hex_to_names, hex_to_rgb, rgb_to_name, rgb_percent_to_hex, rgb_to_hex, css3_names_to_hex
     
     
    -
     def closest_colour_rgb(requested_color):
         """ Function finds closes colour rgb """
         min_colours = {}
    diff --git a/src/util/helpers.py b/src/util/helpers.py
    index 26728c5..aafb632 100644
    --- a/src/util/helpers.py
    +++ b/src/util/helpers.py
    @@ -20,15 +20,15 @@ class ColoredFormatter(logging.Formatter):
     
         def format(self, record):
             seq = self.MAPPING.get(record.levelname, 37)  # default white
    -        record.levelname = ('{0}{1}m{2}{3}') \
    +        record.levelname = '{0}{1}m{2}{3}' \
                 .format(self.PREFIX, seq, record.levelname, self.SUFFIX)
             return logging.Formatter.format(self, record)
     
     
     class BookLogger:
         def __init__(self, name, book_id, main_logger=None,
    -                 filemode='w+', logging_level=logging.INFO, logging_format=
    -                 '%(asctime)s - %(levelname)s - %(message)s [%(filename)s:%(lineno)d in %(funcName)s]'):
    +                 filemode='w+', logging_level=logging.INFO,
    +                 logging_format='%(asctime)s - %(levelname)s - %(message)s [%(filename)s:%(lineno)d in %(funcName)s]'):
             """
             Method for Logger configuration. Logger will write to file.
             :param name: name of the Logger.
    @@ -107,4 +107,4 @@ class BookStatusWrapper:
             self.set_status('[GENERATE]')
     
         def set_error(self):
    -        self.set_status('[ERROR]')
    \ No newline at end of file
    +        self.set_status('[ERROR]')
    diff --git a/src/util/rgb2closest_color.py b/src/util/rgb2closest_color.py
    index a088bd5..6770684 100644
    --- a/src/util/rgb2closest_color.py
    +++ b/src/util/rgb2closest_color.py
    @@ -82,7 +82,7 @@ def rgb2closest_html_color_name(color):
                 pass
     
             if hue_diff in diff2base_color_dict:
    -            dist_cur_color =(hue_request - hue_html) ** 2 + (s_request - s_html) ** 2 + (v_request - v_html) ** 2
    +            dist_cur_color = (hue_request - hue_html) ** 2 + (s_request - s_html) ** 2 + (v_request - v_html) ** 2
                 hue_prev, s_prev, v_prev = HTML_COLORS_HSV[diff2base_color_dict[hue_diff]]
                 dist_prev_color = (hue_request - hue_prev) ** 2 + (s_request - s_prev) ** 2 + (v_request - v_prev) ** 2
                 if dist_cur_color < dist_prev_color:
    @@ -95,7 +95,7 @@ def rgb2closest_html_color_name(color):
     if __name__ == '__main__':
     
         hex_colors = [
    -        #'#945893',
    +        # '#945893',
             # '#96F',
             # '#000', # black
             # '#4C4C4C', # black
    @@ -115,5 +115,5 @@ if __name__ == '__main__':
     
         for c in hex_colors:
             n = rgb2closest_html_color_name(c)
    -        print(n) # "Actual colour:", c, ", closest colour name:",
    +        print(n)    # "Actual colour:", c, ", closest colour name:",
             # print()