Make todos & refactor code

2021-11-02 12:06:34 +03:00
parent 8c37482616
commit 479695e185
5 changed files with 314 additions and 242 deletions
--- a/src/access.py
+++ b/src/access.py
@@ -95,19 +95,19 @@ class Access:
        else:
            raise Exception(f'{response.status_code}')
-    def get_doc(self, doc_id):
+    def get_book(self, book_id):
        if self.is_time_for_refreshing():
            self.refresh_token()
        self.refreshing.wait()
-        response = requests.get(f'{self.url}/doc-convert/{doc_id}/file', headers=self.headers)
+        response = requests.get(f'{self.url}/doc-convert/{book_id}/file', headers=self.headers)
        if response.status_code == 404:
            raise FileNotFoundError('404 Not Found: file have not found.')
        elif response.status_code == 200:
            content = response.content
        else:
-            raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{doc_id}/file, '
+            raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{book_id}/file, '
                            f'status code:{response.status_code}')
        return content
--- a/src/book_solver.py
+++ b/src/book_solver.py
@@ -5,11 +5,10 @@ In parallel it updates status of a book conversion on admin panel.
 Finally sends result to server.
 Result is a json, JSON schema in book_schema.json
 """
 import codecs
 import json
 import logging
 import os
 import json
 import codecs
 import logging
 import pathlib
 from abc import abstractmethod, ABCMeta
@@ -61,11 +60,11 @@ class BookSolver:
        """
        try:
            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
-            content = self.access.get_doc(self.book_id)
+            content = self.access.get_book(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_book_file(content)
        except FileNotFoundError as f_err:
-            self.logger_object.log("Can't get docx from server.", logging.ERROR)
+            self.logger_object.log("Can't get file from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise f_err
        except Exception as exc:
@@ -109,8 +108,9 @@ class BookSolver:
        return {}
    def test_conversion(self):
        '''Function
        without sending to server'''
        self.logger_object.log('Beginning of the test.')
        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        folder_path = os.path.join(folder_path, f'{self.book_type}')
        file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
@@ -121,6 +121,9 @@ class BookSolver:
        self.logger_object.log('End of the test.')
    def conversion(self):
        '''Function
        with downloading book from server
        with sending to server'''
        try:
            self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
            self.get_book_file()
@@ -137,14 +140,14 @@ class BookSolver:
            raise exc
    def conversion_local(self):
        '''Function
        without downloading book from server (local)
        with sending to server'''
        try:
-            with open('tmp.json') as f:
+            self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.output_path}')
-                d = json.load(f)
+            with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json:
-            self.send_json_content_to_server(d)
+                content_dict = json.load(f_json)
-            self.logger_object.log(f'End of the conversion to LiveCarta format. Check {self.output_path}.')
+            self.send_json_content_to_server(content_dict)
        except Exception as exc:
-            self.status_wrapper.set_error()
+            self.logger_object.log('Error has occurred while reading json file.' + str(exc), logging.ERROR)
-            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
+
            self.logger_object.log_error_to_main_log(str(exc))
            raise exc
--- a/src/epub_converter/css_reader.py
+++ b/src/epub_converter/css_reader.py
@@ -8,8 +8,9 @@ from bs4 import BeautifulSoup
 from premailer import transform
 from itertools import takewhile
 from src.livecarta_config import LiveCartaConfig
 from src.util.color_reader import str2hex
 from src.livecarta_config import LiveCartaConfig
 cssutils.log.setLevel(CRITICAL)
@@ -211,9 +212,9 @@ def build_css_content(css_content):
 class TagStyleConverter:
-    def __init__(self, tag_with_initial_style, tag_with_ultimate_style):
+    def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
-        self.tag_with_initial_style = tag_with_initial_style  # tag with inline style to be updated with style attribute
+        self.tag_with_inline_style = tag_with_inline_style  # tag with inline style to be updated with style attribute
-        self.tag_initial_name = tag_with_initial_style.name
+        self.tag_initial_name = tag_with_inline_style.name
        self.tag_with_ultimate_style = tag_with_ultimate_style  # tag with inline style + style parsed from css file
        self.style = self.preprocess_style()
@@ -293,32 +294,39 @@ class TagStyleConverter:
        ultimate_style = ultimate_style.replace('background:', 'background-color:')
        ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
-        split_ultimate_style = ultimate_style.split(';') # make for repetition check and convert to px
+        split_ultimate_style = ultimate_style.replace('; ',';').split(';')
-        # check for another ; in style string in preprocess_style()
+        # splitting on ';' leaves '' entries in the list (e.g. when the style ends with ';'), so drop them
        while '' in split_ultimate_style:
            split_ultimate_style.remove('')
        ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
-        if self.tag_with_initial_style.attrs.get('style'):
+        # strip the whitespace that follows ':' so that 'name: value' becomes 'name:value'
        split_ultimate_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style]
-            initial_style = self.tag_with_initial_style.attrs['style']
+        if self.tag_with_inline_style.attrs.get('style'):
-            split_initial_style = initial_style.split(';')
+            inline_style = self.tag_with_inline_style.attrs['style']
-            # check for another ; in style string in preprocess_style()
+            split_inline_style = inline_style.replace('; ',';').split(';')
            while '' in split_initial_style:
                split_initial_style.remove('')
-            # repetition check - if tag had already had inline style, add this to style parsed from css
+            # splitting on ';' leaves '' entries in the list (e.g. when the style ends with ';'), so drop them
-            repeat_styles = list(set(split_ultimate_style) & set(split_initial_style))
+            while '' in split_inline_style:
                split_inline_style.remove('')
            # strip the whitespace that follows ':' so that 'name: value' becomes 'name:value'
            split_inline_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style]
            # repetition check - drop inline declarations that already appear in the css-derived style; whatever remains is appended to it
            repeat_styles = list(set(split_ultimate_style) & set(split_inline_style))
            for item in repeat_styles:
-                split_initial_style.remove(item)
+                split_inline_style.remove(item)
-            if split_initial_style:
+            if split_inline_style:
-                # if initial style is not empty - start convert and add to ultimate style
+                # if any inline declarations remain - convert them and append to the ultimate style
                print('we enter repetition check', '\n')
-                initial_style: str = self.process_indents_to_px(split_initial_style)
+                inline_style: str = self.process_indents_to_px(split_inline_style)
-                ultimate_style += initial_style
+                ultimate_style += inline_style
        ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
        return ultimate_style
    def change_attrs_with_corresponding_tags(self):
@@ -330,15 +338,15 @@ class TagStyleConverter:
            self.style = self.style.replace(s, '')
            self.style = self.style.strip()
            if i == 0:
-                self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
+                self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
-                new_tags.append(self.tag_with_initial_style)
+                new_tags.append(self.tag_with_inline_style)
            else:
                name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
                new_tag = BeautifulSoup(features='lxml').new_tag(name)
                new_tags[-1].wrap(new_tag)
                new_tags.append(new_tag)
-        top_tag = self.tag_with_initial_style
+        top_tag = self.tag_with_inline_style
        if new_tags:
            tmp_attrs = top_tag.attrs.copy()
@@ -355,10 +363,12 @@ class TagStyleConverter:
    @staticmethod
    def wrap_span_in_p_to_save_style_attrs(tag):
-        styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
+        '''Function designed to save style attrs that cannot be in p -> span
-                               if attr not in ['text-align', 'text-indent', 'border-bottom']]
+                                                 that cannot be in span -> p'''
        if tag.name == 'p' and tag.attrs.get('style'):
            styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
                                   if attr not in ['text-align', 'text-indent', 'border-bottom']]
            styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
            if any(styles_to_be_saved):
                tag.name = 'span'
@@ -388,83 +398,81 @@ class TagStyleConverter:
                tag.wrap(p_tag)
    @staticmethod
-    def add_span_to_save_style_attrs_in_li(t):
+    def wrap_span_in_li_to_save_style_attrs(tag):
-        if t.name == 'li' and t.attrs.get('style'):
+        if tag.name == 'li' and tag.attrs.get('style'):
            styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
-                                    attr not in ['text-align', 'list-style-type', 'border-bottom']]
+                                    attr not in ['text-align', 'list-style-type']]
-            check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_li]
+            styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_li]
-            if any(check):
+            if any(styles_to_be_saved):
-                t.name = 'span'
+                tag.name = 'span'
                li_tag = BeautifulSoup(features='lxml').new_tag('li')
-                old_style = t.attrs['style']
+                span_style = tag.attrs['style']
-                new_style = ''
+                li_style = ''
                for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
                                                 re.compile(r'(list-style-type:(\w+);)')]:
-                    has_li_style_attrs = re.search(possible_li_attrs_regexp, old_style)
+                    has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style)
                    if has_li_style_attrs and has_li_style_attrs.group(1):
-                        new_style += has_li_style_attrs.group(1)
+                        li_style += has_li_style_attrs.group(1)
-                        old_style = old_style.replace(has_li_style_attrs.group(1), '')
+                        span_style = span_style.replace(has_li_style_attrs.group(1), '')
-                li_tag.attrs['style'] = new_style
+                li_tag.attrs['style'] = li_style
-                t.attrs['style'] = old_style
+                tag.attrs['style'] = span_style
-                t.wrap(li_tag)
+                tag.wrap(li_tag)
    @staticmethod
-    def add_span_to_save_style_attrs_in_ul_ol(t):
+    def wrap_span_in_ul_ol_to_save_style_attrs(tag):
-        if t.name in ['ul', 'ol'] and t.attrs.get('style'):
+        if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
            styles_cant_be_in_ul_ol = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
-            check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_ul_ol]
+            check = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_ul_ol]
            if any(check):
-                t.name = 'span'
+                tag.name = 'span'
                li_tag = BeautifulSoup(features='lxml').new_tag('ul')
-                old_style = t.attrs['style']
+                span_style = tag.attrs['style']
                possible_li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
-                has_li_style_attrs = re.search(possible_li_attrs_regexp, old_style)
+                has_li_style_attrs = re.search(possible_li_attrs_regexp, span_style)
                if has_li_style_attrs and has_li_style_attrs.group(1):
-                    new_style = has_li_style_attrs.group(1)
+                    oul_style = has_li_style_attrs.group(1)
-                    old_style = old_style.replace(new_style, '')
+                    span_style = span_style.replace(oul_style, '')
-                    li_tag.attrs['style'] = new_style
+                    li_tag.attrs['style'] = oul_style
-                t.attrs['style'] = old_style
+                tag.attrs['style'] = span_style
-                t.wrap(li_tag)
+                tag.wrap(li_tag)
    @staticmethod
-    def add_span_to_save_style_attrs(t):
+    def wrap_span_in_h_to_save_style_attrs(tag):
-        no_style_in_livecarta_regexp = re.compile('(^h[1-9]$)')
+        h_regexp = re.compile('(^h[1-9]$)')
-        if re.search(no_style_in_livecarta_regexp, t.name) and t.attrs.get('style'):
+        if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
-            new_tag = BeautifulSoup(features='lxml').new_tag(t.name)
+            h_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
-            t.name = 'span'
+            tag.name = 'span'
-            t.wrap(new_tag)
+            tag.wrap(h_tag)
-            style = t.attrs['style']
+            style = tag.attrs['style']
            li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
            has_li_style_attr = re.search(li_attrs_regexp, style)
-            t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
+            tag.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
    def convert_initial_tag(self):
-        self.tag_with_initial_style = self.change_attrs_with_corresponding_tags()
+        self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
-        self.wrap_span_in_p_to_save_style_attrs(self.tag_with_initial_style)
+        self.wrap_span_in_p_to_save_style_attrs(self.tag_with_inline_style)
-        self.add_span_to_save_style_attrs_in_li(self.tag_with_initial_style)
+        self.wrap_span_in_li_to_save_style_attrs(self.tag_with_inline_style)
-        self.add_span_to_save_style_attrs_in_ul_ol(self.tag_with_initial_style)
+        self.wrap_span_in_ul_ol_to_save_style_attrs(self.tag_with_inline_style)
-        self.add_span_to_save_style_attrs(self.tag_with_initial_style)
+        self.wrap_span_in_h_to_save_style_attrs(self.tag_with_inline_style)
-        return self.tag_with_initial_style
+        return self.tag_with_inline_style
 def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
    css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
    livecarta_tmp_ids = []
-    h_regex = f'(^h[1-9]$)'
+    could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
    tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp)
    for i, x in enumerate(tags_with_possible_style_attr):
        x.attrs['livecarta_id'] = i
        livecarta_tmp_ids.append(i)
    # here we add css styles to inline style
    # sometimes in html_with_css_styles
    html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
                                          remove_classes=False,
                                          external_styles=False,
@@ -474,6 +482,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
    inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
    # go through tags with possible style attrs
    for i in livecarta_tmp_ids:
        tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i})
        tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -1,27 +1,28 @@
 import os
 import re
 import json
 import codecs
 import logging
 import os
 from os.path import dirname, normpath, join
 from itertools import chain
 from collections import defaultdict
 from typing import Dict, Union, List
-from os.path import dirname, normpath, join
+
 import ebooklib
 from ebooklib import epub
 from bs4 import BeautifulSoup, Tag
 from ebooklib.epub import Link, Section
 from bs4 import BeautifulSoup, Tag
 from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
 from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
-from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
+from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
    update_src_links_in_images, preprocess_footnotes
 class EpubConverter:
    def __init__(self, file, access=None, logger=None):
        self.file = file
@@ -29,9 +30,9 @@ class EpubConverter:
        self.logger: BookLogger = logger
        self.ebooklib_book = epub.read_epub(file)
-        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {}  # main container for all epub .xhtml files
+        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
-        self.href2subchapter_ids = defaultdict(list)  # enumerate all subchapter id for each file
+        self.html_href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
-        self.hrefs_added_to_toc = set()  # enumerate all file paths that where added to TOC
+        self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
        # toc tree structure stored as adj.list (NavPoint to list of NavPoints)
        # key = -1 for top level NavPoints
@@ -42,8 +43,8 @@ class EpubConverter:
        self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
        self.internal_anchors = set()
-        self.id_anchor_exist_in_nav_points = False  # flag to be updated while ebooklib.toc is parsed
+        self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
-        self.href2img_bytes = {}  # file path to bytes
+        self.img_href2img_bytes = {}  # file path to bytes
        self.old_image_path2aws_path = {}  # file path from <a> to generated aws path
        self.footnotes_contents: List[str] = []  # to be sent on server as is
        self.noterefs: List[Tag] = []  # start of the footnote
@@ -54,11 +55,11 @@ class EpubConverter:
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
            content = x.content
-            self.href2img_bytes[file_name] = content
+            self.img_href2img_bytes[file_name] = content
        self.logger.log('HTML files reading.')
-        self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()
+        self.html_href2html_body_soup: Dict[str,
-
+                                            BeautifulSoup] = self.build_href2soup_content()
        self.logger.log('CSS files processing.')
        self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
@@ -84,12 +85,14 @@ class EpubConverter:
        # build simple toc from spine if needed
        if self.is_toc_empty():
            self.build_adjacency_list_from_spine()
-        not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
+        not_added = [
            x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
        self.logger.log(f'Html documents not added to TOC: {not_added}.')
        self.add_not_added_files_to_adjacency_list(not_added)
        self.logger.log(f'Html internal links and structure processing.')
        self.label_chapters_ids_with_tmp_id()
-        self.process_html_soup_structure_to_line()  # used only after parsed toc, ids from toc needed
+        # used only after parsed toc, ids from toc needed
        self.process_html_soup_structure_to_line()
        self.process_internal_links()
        self.logger.log(f'Building chapters content.')
        self.define_chapters_content()
@@ -110,7 +113,8 @@ class EpubConverter:
        path_to_css_from_html = css_href
        html_folder = dirname(html_href)
-        path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
+        path_to_css_from_root = normpath(
            join(html_folder, path_to_css_from_html)).replace('\\', '/')
        css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
        assert css_obj, f'Css style {css_href} was not in manifest.'
        css_content: str = css_obj.get_content().decode()
@@ -124,14 +128,16 @@ class EpubConverter:
        ...2... = key2value
        '''
-        html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files
+        # dictionary: href of html to related css files
        html_href2css_href: defaultdict = defaultdict(list)
        css_href2css_content: dict = {}
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            html_content = item.content
            html_href = item.file_name
            soup_html_content = BeautifulSoup(html_content, features='lxml')
-            for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file
+            # check if file links to css file
            for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
                if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
                    continue
                css_href = tag.attrs.get('href')
@@ -144,7 +150,8 @@ class EpubConverter:
            for i, tag in enumerate(soup_html_content.find_all('style')):
                css_content = tag.string
                html_href2css_href[html_href].append(f'href{i}')
-                css_href2css_content[f'href{i}'] = build_css_content(css_content)
+                css_href2css_content[f'href{i}'] = build_css_content(
                    css_content)
        return html_href2css_href, css_href2css_content,
@@ -153,14 +160,14 @@ class EpubConverter:
        This function is designed to update html_href2html_body_soup
        And add to html_inline_style css_style_content
        '''
-        for href in self.html_href2html_body_soup:
+        for html_href in self.html_href2html_body_soup:
-            if self.html_href2css_href.get(href):
+            if self.html_href2css_href.get(html_href):
-                css =''
+                css = ''
-                for key in self.html_href2css_href[href]:
+                for css_href in self.html_href2css_href[html_href]:
-                    css += self.css_href2css_content[key]
+                    css += self.css_href2css_content[css_href]
-                content: BeautifulSoup = self.html_href2html_body_soup[href]
+                content: BeautifulSoup = self.html_href2html_body_soup[html_href]
                content = convert_html_soup_with_css_style(content, css)
-                self.html_href2html_body_soup[href] = content
+                self.html_href2html_body_soup[html_href] = content
    def build_manifest_id2html_href(self):
        links = dict()
@@ -173,18 +180,18 @@ class EpubConverter:
        """
        self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
-        key = -1 if root, value = None if leaf
+        key = -1 if root (top-level chapters),
        value = None if leaf (lowest-level chapters)
-        :param element: [Link, tuple, list] - element that appears in TOC( usually parsed from nav.ncx)
+        :param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
-        :param lvl: level of  depth
+        :param lvl: level of depth
        """
        if isinstance(element, Link):
            # todo: check if link exists
            nav_point = NavPoint(element)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
-                self.href2subchapter_ids[nav_point.href].append(nav_point.id)
+                self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
            self.adjacency_list[nav_point] = None
            self.hrefs_added_to_toc.add(nav_point.href)
            return nav_point
@@ -195,11 +202,12 @@ class EpubConverter:
            nav_point = NavPoint(first)
            if nav_point.id:
                self.id_anchor_exist_in_nav_points = True
-                self.href2subchapter_ids[nav_point.href].append(nav_point.id)
+                self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
            sub_nodes = []
            for i in second:
-                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
+                sub_nodes.append(
                    self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[nav_point] = sub_nodes
            self.hrefs_added_to_toc.add(nav_point.href)
@@ -208,39 +216,43 @@ class EpubConverter:
        elif isinstance(element, list) and (lvl == 0):
            sub_nodes = []
            for i in element:
-                sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
+                sub_nodes.append(
                    self.build_adjacency_list_from_toc(i, lvl + 1))
            self.adjacency_list[-1] = sub_nodes
        else:
-            assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
+            assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
    def is_toc_empty(self):
        # there is no toc in ebook or no top chapters
        if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
            return True
        return False
    def build_adjacency_list_from_spine(self):
-        manifest_id2href = self.build_manifest_id2html_href()
+        manifest_id2html_href = self.build_manifest_id2html_href()
        self.adjacency_list = {
            -1: []
        }
        for id_, _ in self.ebooklib_book.spine:
-            nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
+            nav_point = NavPoint(
                Section(manifest_id2html_href[id_], manifest_id2html_href[id_]))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(nav_point.href)
    def add_not_added_files_to_adjacency_list(self, not_added):
        for i, file in enumerate(not_added):
-            nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
+            nav_point = NavPoint(
                Section(f'To check #{i}, filename: {file}', file))
            self.adjacency_list[-1].append(nav_point)
            self.hrefs_added_to_toc.add(file)
    def label_chapters_ids_with_tmp_id(self):
-        for href in self.html_href2html_body_soup:
+        for html_href in self.html_href2html_body_soup:
-            ids = self.href2subchapter_ids[href]
+            ids = self.html_href2subchapter_ids[html_href]
            for i in ids:
-                soup = self.html_href2html_body_soup[href]
+                soup = self.html_href2html_body_soup[html_href]
                tag = soup.find(id=i)
                new_h = soup.new_tag('tmp')
                new_h.attrs['class'] = 'converter-chapter-mark'
@@ -249,9 +261,9 @@ class EpubConverter:
    def process_html_soup_structure_to_line(self):
        # go to line structure
-        for href in self.html_href2html_body_soup:
+        for html_href in self.html_href2html_body_soup:
-            soup = self.html_href2html_body_soup[href]
+            soup = self.html_href2html_body_soup[html_href]
-            self.html_href2html_body_soup[href] = unwrap_structural_tags(soup)
+            self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
    @staticmethod
    def create_unique_id(href, id_):
@@ -280,8 +292,10 @@ class EpubConverter:
        :return:
        """
        dir_name = os.path.dirname(cur_file_path)
-        normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
+        normed_path = os.path.normpath(os.path.join(
-        full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
+            dir_name, href_in_link)).replace('\\', '/')
        full_path = [
            path for path in self.hrefs_added_to_toc if normed_path in path]
        if not full_path:
            self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
                            f'While processing href in {internal_link_tag}.')
@@ -291,7 +305,7 @@ class EpubConverter:
        if len(full_path) > 1:
            self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
                            f' while {internal_link_tag} processing. The first one will be chosen.')
-            
+
        return full_path[0]
    def process_internal_links(self):
@@ -308,13 +322,15 @@ class EpubConverter:
                tag.attrs['id'] = new_id
        # 2.a) process anchor which is a whole xhtml file
-        internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)')
+        internal_link_reg1 = re.compile(
            r'(^(?!https?://).+\.(htm|html|xhtml)$)')
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
                a_tag_href = internal_link_tag.attrs['href']
                # find full path
-                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
+                a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
                    toc_href, a_tag_href, internal_link_tag)
                if not a_tag_href_matched_to_toc:
                    continue
                new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
@@ -322,7 +338,8 @@ class EpubConverter:
                if new_id not in self.internal_anchors:
                    anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                    new_anchor_span = self.create_new_anchor_span(soup, new_id)
-                    anchor_soup.insert(0, new_anchor_span)  # insert a new span to the begin of the file
+                    # insert a new span at the beginning of the file
                    anchor_soup.insert(0, new_anchor_span)
                    self.internal_anchors.add(new_id)
                del internal_link_tag.attrs['href']
@@ -332,20 +349,26 @@ class EpubConverter:
        for toc_href in self.hrefs_added_to_toc:
            soup = self.html_href2html_body_soup[toc_href]
            for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
-                a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
+                a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
                    '#')
                # find full path
                if a_tag_href:
                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
                                                                                 internal_link_tag)
                else:
-                    a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
+                    a_tag_href_matched_to_toc = os.path.normpath(
                        toc_href).replace('\\', '/')
                if not a_tag_href_matched_to_toc:
                    continue
-                new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
+
                new_id = self.create_unique_id(
                    a_tag_href_matched_to_toc, a_tag_id)
                anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
-                anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
+                anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
-                anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id})  # if link is a footnote
+                anchor_tags = anchor_tags or anchor_soup.find_all(
                    attrs={'id': a_tag_id})  # if link is a footnote
                if anchor_tags:
                    if len(anchor_tags) > 1:
@@ -359,7 +382,8 @@ class EpubConverter:
                    internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
                    # create span to have cyclic links, link has 1 type of class, anchor another
                    if anchor_tag.attrs['id'] not in self.internal_anchors:
-                        new_anchor_span = self.create_new_anchor_span(soup, new_id)
+                        new_anchor_span = self.create_new_anchor_span(
                            soup, new_id)
                        anchor_tag.insert_before(new_anchor_span)
                        self.internal_anchors.add(new_id)
                        del anchor_tag.attrs['id']
@@ -386,11 +410,13 @@ class EpubConverter:
        """
        if nav_point.id:
            soup = self.html_href2html_body_soup[nav_point.href]
-            chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
+            chapter_tags = get_tags_between_chapter_marks(
                first_id=nav_point.id, href=nav_point.href, html_soup=soup)
            new_tree = BeautifulSoup('', 'html.parser')
            for tag in chapter_tags:
                new_tree.append(tag)
-            self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
+            self.href_chapter_id2soup_html[(
                nav_point.href, nav_point.id)] = new_tree
        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
@@ -405,25 +431,27 @@ class EpubConverter:
    def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
        title = nav_point.title
        if nav_point.id:
-            content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
+            content: BeautifulSoup = self.href_chapter_id2soup_html[(
                nav_point.href, nav_point.id)]
        else:
            content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
        self.old_image_path2aws_path = update_src_links_in_images(content,
-                                                                  self.href2img_bytes,
+                                                                  self.img_href2img_bytes,
                                                                  path_to_html=nav_point.href,
                                                                  access=self.access,
                                                                  path2aws_path=self.old_image_path2aws_path)
        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
-        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
+        title_preprocessed = prepare_title(title)
-                                                                             remove_title_from_chapter=is_chapter)
+        content_preprocessed = prepare_content(title_preprocessed, content,
-
+                                                                 remove_title_from_chapter=is_chapter)
        sub_nodes = []
-        # warning! not EpubHtmlItems won;t be added to chapter
+        # warning! items that are not EpubHtmlItems won't be added to the chapter
        if self.adjacency_list.get(nav_point):
            for sub_node in self.adjacency_list[nav_point]:
-                sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
+                sub_chapter_item = self.node_to_livecarta_chapter_item(
                    sub_node, lvl + 1)
                sub_nodes.append(sub_chapter_item)
        if self.logger:
@@ -451,16 +479,16 @@ class EpubConverter:
 if __name__ == "__main__":
    logger = logging.getLogger('epub')
-    file_handler = logging.StreamHandler()
+    stream_handler = logging.StreamHandler()
-    logger.addHandler(file_handler)
+    logger.addHandler(stream_handler)
-    file_handler = logging.FileHandler('../epub.log', mode='w+')
+    file_handler = logging.FileHandler('../../epub.log', mode='w+')
    logger.addHandler(file_handler)
    logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
-    json_converter = EpubConverter('../../epub/Cook.epub',
+    json_converter = EpubConverter('../../epub/9781634259804.epub',
                                   logger=logger_object)
    tmp = json_converter.convert_to_dict()
-    with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
+    with codecs.open('../../json/tmp.json', 'w', encoding='utf-8') as f:
-        json.dump(tmp, f, ensure_ascii=False)
+        json.dump(tmp, f, ensure_ascii=False)
--- a/src/epub_converter/html_epub_preprocessor.py
+++ b/src/epub_converter/html_epub_preprocessor.py
@@ -11,7 +11,8 @@ from src.livecarta_config import LiveCartaConfig
 def save_image_locally(img_file_path, img_content, book_id):
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    new_path = pathlib.Path(os.path.join(folder_path, f'../json/img_{book_id}/'))
+    new_path = pathlib.Path(os.path.join(
        folder_path, f'../json/img_{book_id}/'))
    new_path.mkdir(exist_ok=True)
    new_img_path = new_path / os.path.basename(img_file_path)
@@ -23,7 +24,8 @@ def save_image_locally(img_file_path, img_content, book_id):
 def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
-    link = access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
+    link = access.send_image(
        img_file_path, doc_id=book_id, img_content=img_content)
    return link
@@ -37,7 +39,8 @@ def update_src_links_in_images(body_tag: Tag,
    for img in img_tags:
        path_to_img_from_html = img.attrs.get('src')
        html_folder = os.path.dirname(path_to_html)
-        path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html)).replace('\\', '/')
+        path_to_img_from_root = os.path.normpath(os.path.join(
            html_folder, path_to_img_from_html)).replace('\\', '/')
        assert path_to_img_from_root in href2img_content, \
            f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
@@ -47,10 +50,12 @@ def update_src_links_in_images(body_tag: Tag,
            if path_to_img_from_root in path2aws_path:
                new_folder = path2aws_path[path_to_img_from_root]
            else:
-                new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
+                new_folder = save_image_to_aws(
                    access, path_to_img_from_root, img_content, 'book_id')
                path2aws_path[path_to_img_from_root] = new_folder
        else:
-            new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
+            new_folder = save_image_locally(
                path_to_img_from_root, img_content, 'book_id')
        img.attrs['src'] = str(new_folder)
        if img.attrs.get('width'):
@@ -71,7 +76,8 @@ def preprocess_table(body_tag: BeautifulSoup):
            style = td.get('style')
            width = ''
            if style:
-                width_match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
+                width_match = re.search(
                    r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
                if width_match:
                    size = width_match.group(1)
                    units = width_match.group(2)
@@ -96,10 +102,10 @@ def process_lists(body_tag):
    """
    li_tags = body_tag.find_all("li")
-    for il_tag in li_tags:
+    for li_tag in li_tags:
-        if il_tag.p:
+        if li_tag.p:
-            il_tag.attrs.update(il_tag.p.attrs)
+            li_tag.attrs.update(li_tag.p.attrs)
-            il_tag.p.unwrap()
+            li_tag.p.unwrap()
 def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
@@ -111,11 +117,12 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
 def clean_headings_content(content: Tag, title: str):
-    def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
+    def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(body_tag,
                                              tag_to_be_removed,
-                                              id_=tag_to_be_removed.attrs.get('id'),
+                                              id_=tag_to_be_removed.attrs.get(
                                                  'id'),
                                              class_=tag_to_be_removed.attrs.get('class'))
        for sub_tag in tag_to_be_removed.find_all():
@@ -136,10 +143,10 @@ def clean_headings_content(content: Tag, title: str):
            text = re.sub(r' +', ' ', text).strip()
            text = text.lower()
            if title == text:
-                _add_span_to_save_ids_for_links(child, content)
+                add_span_to_save_ids_for_links(child, content)
                child.extract()
            elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
-                _add_span_to_save_ids_for_links(child, content)
+                add_span_to_save_ids_for_links(child, content)
                child.extract()
            break
@@ -187,9 +194,12 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
    """
    footnotes = []
-    noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
+    noterefs_tags = source_html_tag.find_all(
-    bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')])
+        attrs={noteref_attr_name: 'noteref'})
-    noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
+    bad_noterefs_tags = set(
        [tag for tag in noterefs_tags if not tag.attrs.get('href')])
    noterefs_tags = [
        tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
    new_noterefs_tags = []
    new_footnotes_tags = []
    [tag.decompose() for tag in bad_noterefs_tags]
@@ -204,7 +214,8 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
        if len(tags) == 0:
            anchored_tags = list(target_html_tag.find_all(id=element_id))
            if len(anchored_tags):
-                print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
+                print(
                    f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
                return anchored_tags
            else:
                assert 0, f'Error, No element with id: {href} found.'
@@ -219,7 +230,8 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
        else:
            target_html_tag = href2soup_html.get(file)
            if not target_html_tag:
-                print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
+                print(
                    f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
                continue
        possible_footnote = 'note|footnote|endnote|rearenote'
@@ -230,11 +242,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
        footnote_tag = expected_footnote_tags[0]
        if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
            footnote_tag = footnote_tag.parent
-        new_noterefs_tags.append(replace_with_livecarta_anchor_tag(noteref_tag, i))
+        new_noterefs_tags.append(
            replace_with_livecarta_anchor_tag(noteref_tag, i))
        content = footnote_tag.text
        # footnote_tag.decompose()
        footnotes.append(content)
-        footnote_tag = footnote_tag.find(attrs={'role': 'doc-backlink'}) or footnote_tag
+        footnote_tag = footnote_tag.find(
            attrs={'role': 'doc-backlink'}) or footnote_tag
        new_footnotes_tags.append(footnote_tag)
    return footnotes, new_noterefs_tags, new_footnotes_tags
@@ -262,7 +276,8 @@ def unwrap_structural_tags(body_tag):
    def _preserve_class_in_aside_tag(tag_):
        # to save css style inherited from class, copy class to aside tag (which is parent to tag_)
        # this is for Wiley books with boxes
-        tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
+        tag_class = tag_.attrs['class'] if not isinstance(
            tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if tag_.parent.name == 'aside':
            if not tag_.parent.attrs.get('class'):
                tag_.parent.attrs['class'] = tag_class
@@ -272,7 +287,8 @@ def unwrap_structural_tags(body_tag):
        # this is for Wiley books with boxes
        # returns True, if <section> could be unwrapped
-        tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
+        tag_class = tag_.attrs['class'] if not isinstance(
            tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if 'feature' not in tag_class:
            return True
        child_p_tags = tag_.find_all("p")
@@ -288,51 +304,56 @@ def unwrap_structural_tags(body_tag):
        else:
            return True
    def add_table_to_abc_books(tag_, border, bg_color):
        wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
    def add_span_to_save_ids_for_links(tag_to_be_removed):
        if tag_to_be_removed.attrs.get('id'):
            insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
                                              id_=tag_to_be_removed.attrs['id'],
                                              class_=tag_to_be_removed.attrs.get('class'))
-    structural_tags_names = [
+    def replace_div_tag_with_table():
-        'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
+        for div in body_tag.find_all("div"):
-        'figure', 'footer', 'iframe', 'span', 'p'
+            if div.attrs.get('class'):
-    ]
+                div_class = div.attrs['class'] if not isinstance(
                    div.attrs['class'], list) else div.attrs['class'][0]
                if div_class in ['C409', 'C409a']:
                    wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
                elif div_class in ['C441', 'C816']:
                    wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
            if div.attrs.get('style'):
                if 'background-color' in div.attrs['style']:
                    end_index = div.attrs['style'].find(
                        'background-color') + len('background-color')
                    start_index_of_color = end_index + 2
                    bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
                    wrap_block_tag_with_table(
                        body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
            elif div.attrs.get('style') == '':
                del div.attrs['style']
            structural_tags_names = [
                'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
                'figure', 'footer', 'iframe', 'span', 'p'
            ]
            if div.contents:
                is_not_struct_tag = [
                    child.name not in structural_tags_names for child in div.contents]
                if all(is_not_struct_tag):
                    div.name = 'p'
                    continue
            add_span_to_save_ids_for_links(div)
            div.unwrap()
    # comments removal
    for tag in body_tag.find_all():
        for element in tag(text=lambda text: isinstance(text, Comment)):
            element.extract()
    for div in body_tag.find_all("div"):
        if div.attrs.get('class'):
            div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
            if div_class in ['C409', 'C409a']:
                add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9')
-            elif div_class in ['C441', 'C816']:
+    replace_div_tag_with_table()
                add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8')
        if div.attrs.get('style'):
            if 'background-color' in div.attrs['style']:
                end_index = div.attrs['style'].find('background-color') + len('background-color')
                start_index_of_color = end_index + 2
                bg_color = div.attrs['style'][start_index_of_color:start_index_of_color+7]
                add_table_to_abc_books(div, border='', bg_color=bg_color)
        if div.attrs.get('style') == '':
            del div.attrs['style']
        if div.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
            if all(is_not_struct_tag):
                div.name = 'p'
                continue
        add_span_to_save_ids_for_links(div)
        div.unwrap()
    for s in body_tag.find_all("section"):
        could_be_unwrapped = True
@@ -348,7 +369,8 @@ def unwrap_structural_tags(body_tag):
    for s in body_tag.find_all("figure"):
        s.name = 'p'
-        s.attrs['style'] = "text-align: center;"  # to center image inside this tag
+        # to center image inside this tag
        s.attrs['style'] = "text-align: center;"
    for s in body_tag.find_all("figcaption"):
        add_span_to_save_ids_for_links(s)
@@ -383,7 +405,8 @@ def unwrap_structural_tags(body_tag):
                x.parent.unwrap()  # todo warning! could reflect on formatting/internal links in some cases
    parents_marks_are_body = [x.parent == body_tag for x in marks]
-    assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
+    assert all(
        parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
    heading_tag_to_p_tag(body_tag)
@@ -411,7 +434,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
    :param html_soup: soup object of current  file
    :return: list [Tag, NavigableString]; chapter's tags
    """
-    marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
+    marked_tags = html_soup.find(
        attrs={'id': first_id, 'class': 'converter-chapter-mark'})
    if marked_tags:
        next_tag = marked_tags.next_sibling
        tags = []
@@ -484,16 +508,20 @@ def preprocess_block_tags(chapter_tag):
        if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
            clean_wiley_block(block)
-            color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
+            color = '#DDDDDD' if block.attrs.get(
-            color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
+                'class') == 'feature1' else None
            color = '#EEEEEE' if block.attrs.get(
                'class') == 'feature2' else color
            wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
            block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
            block.unwrap()
    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
        clean_wiley_block(future_block)
-        color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
+        color = '#DDDDDD' if future_block.attrs.get(
-        color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
+            'class') == 'feature1' else None
        color = '#EEEEEE' if future_block.attrs.get(
            'class') == 'feature2' else color
        wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
@@ -512,7 +540,8 @@ def preprocess_pre_tags(chapter_tag):
        new_tag = BeautifulSoup(features='lxml').new_tag("span")
        new_tag.attrs = pre.attrs.copy()
        spans = pre.find_all("span")
-        to_add_br = len(spans) > 1  # if in <pre> there are multiple <span>, we need to add <br> after each content
+        # if in <pre> there are multiple <span>, we need to add <br> after each content
        to_add_br = len(spans) > 1
        for child in pre.children:
            if isinstance(child, NavigableString):
@@ -520,7 +549,8 @@ def preprocess_pre_tags(chapter_tag):
                sub_strings = re.split('\r\n|\n|\r', cleaned_text)
                for string in sub_strings:
                    new_tag.append(NavigableString(string))
-                    new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
+                    new_tag.append(BeautifulSoup(
                        features='lxml').new_tag('br'))
            else:
                for sub_child in child.children:
                    if isinstance(sub_child, NavigableString):
@@ -531,7 +561,8 @@ def preprocess_pre_tags(chapter_tag):
                cleaned_tag = child.extract()
                new_tag.append(cleaned_tag)
                if to_add_br:
-                    new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
+                    new_tag.append(BeautifulSoup(
                        features='lxml').new_tag('br'))
        new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
                                 "font-size: 14px; white-space: nowrap;"
@@ -551,40 +582,41 @@ def preprocess_code_tags(chapter_tag):
        code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
-def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
+def prepare_title(title_of_chapter: str) -> str:
    """
    Final processing/cleaning function.
    :param title: title of the chapter
    :param chapter_tag: soup object
    :param remove_title_from_chapter: bool
    :return: tuple[str, str]
    """
-    title_str = BeautifulSoup(title, features='lxml').string
+    title_str = BeautifulSoup(title_of_chapter, features='lxml').string
    title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
    title_str = re.sub(r' +', ' ', title_str).rstrip()
    title_str = clean_title_from_numbering(title_str)
    return title_str
 def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Final processing/cleaning function.
    """
    # 0. cleaning \n
    to_remove = []
-    for child in chapter_tag.contents:
+    for child in content_tag.contents:
        if isinstance(child, NavigableString):
            s = re.sub(r'([\n\t])', '', child.string)
            if s == '':
                to_remove.append(child)
    [x.extract() for x in to_remove]
    # 1. heading removal
    if remove_title_from_chapter:
-        clean_headings_content(chapter_tag, title_str)
+        clean_headings_content(content_tag, title_str)
-    process_lists(chapter_tag)
+    process_lists(content_tag)
-    preprocess_table(chapter_tag)
+    preprocess_table(content_tag)
-    preprocess_code_tags(chapter_tag)
+    preprocess_code_tags(content_tag)
-    preprocess_pre_tags(chapter_tag)
+    preprocess_pre_tags(content_tag)
-    preprocess_block_tags(chapter_tag)
+    preprocess_block_tags(content_tag)
    # 2. class removal
-    for tag in chapter_tag.find_all(recursive=True):
+    for tag in content_tag.find_all(recursive=True):
        if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                'footnote-element']):
            del tag.attrs['class']
-    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
+    return str(content_tag)
    title_str = clean_title_from_numbering(title_str)
    return title_str, str(chapter_tag)