From af466cbc27d2f43af6b317bf6a5e13b8881d2055 Mon Sep 17 00:00:00 2001
From: Kiryl <kiryl.miatselitsa@teqniksoft.com>
Date: Wed, 27 Jul 2022 20:44:19 +0300
Subject: [PATCH] Change paths to books

---
 consumer.py                                   |  2 +-
 presets/.gitignore                            |  2 +
 src/docx_converter/docx_solver.py             |  8 ++--
 src/docx_converter/footnotes_processing.py    | 44 +++++++++---------
 src/docx_converter/image_processing.py        | 12 ++---
 .../libre_html2json_converter.py              | 46 +++++++++----------
 src/epub_converter/epub_converter.py          |  2 +-
 src/epub_converter/footnotes_processing.py    |  4 +-
 src/epub_converter/image_processing.py        |  2 +-
 9 files changed, 62 insertions(+), 60 deletions(-)
 create mode 100644 presets/.gitignore

diff --git a/consumer.py b/consumer.py
index 095facf..dfa0b16 100644
--- a/consumer.py
+++ b/consumer.py
@@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+",
 def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
     logger.info(f"Start processing book-{book_id}.")
     try:
-        json_file_path = "json/9781614382264.json"
+        json_file_path = "books/json/9781614382264.json"
         book = book_type(book_id=book_id, main_logger=logger, **params)
         book.conversion_local(json_file_path)
     except Exception as exc:
diff --git a/presets/.gitignore b/presets/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/presets/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py
index 6260edb..5edeb46 100644
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -14,7 +14,7 @@ class DocxBook(BookSolver):
 
     def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
         super().__init__(book_id, access, main_logger)
-        self.book_type = 'docx'
+        self.book_type = "docx"
         # critical section for occupying libreoffice by one thread
         self.libre_locker: Event() = libre_locker
 
@@ -53,9 +53,9 @@ class DocxBook(BookSolver):
 
 
 if __name__ == "__main__":
-    docx_file_path = '../../books/docx/music_inquiry.docx'
+    docx_file_path = "../../books/docx/music_inquiry.docx"
     logger_object = BookLogger(
-        name='docx', book_id=docx_file_path.split('/')[-1])
+        name="docx", book_id=docx_file_path.split("/")[-1])
     locker = Event()
     locker.set()
 
@@ -70,5 +70,5 @@ if __name__ == "__main__":
         content, footnotes, top_level_headers, logger_object)
     content_dict = json_converter.convert_to_dict()
 
-    with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
+    with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f:
         json.dump(content_dict, f, ensure_ascii=False)
diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py
index c269b73..bda6733 100644
--- a/src/docx_converter/footnotes_processing.py
+++ b/src/docx_converter/footnotes_processing.py
@@ -9,58 +9,58 @@ def _clean_footnote_content(content):
 
 def process_footnotes(body_tag):
     """Function returns list of footnotes and delete them from html_soup."""
-    footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
+    footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc")
     footnote_content = body_tag.find_all(
-        'div', id=re.compile(r'^sdfootnote\d+$'))
+        "div", id=re.compile(r"^sdfootnote\d+$"))
     footnote_amt = len(footnote_anchors)
 
     assert footnote_amt == len(footnote_content), \
-        'Something went wrong with footnotes after libre conversion'
+        "Something went wrong with footnotes after libre conversion"
 
     footnotes = []
 
     for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
         true_a_tag = cont_tag.find_all(
-            'a', class_=re.compile(r'^sdfootnote.+$'))[0]
+            "a", class_=re.compile(r"^sdfootnote.+$"))[0]
 
-        if true_a_tag.attrs.get('href') is None:
+        if true_a_tag.attrs.get("href") is None:
             cont_tag.a.decompose()
             continue
 
-        assert anc_tag['name'] == true_a_tag['href'][1:], \
-            'Something went wrong with footnotes after libre conversion'
+        assert anc_tag["name"] == true_a_tag["href"][1:], \
+            "Something went wrong with footnotes after libre conversion"
 
-        new_tag = BeautifulSoup(features='lxml').new_tag('sup')
-        new_tag['class'] = 'footnote-element'
-        new_tag['data-id'] = i + 1
-        new_tag['id'] = f'footnote-{i + 1}'
-        new_tag.string = '*'
+        new_tag = BeautifulSoup(features="lxml").new_tag("sup")
+        new_tag["class"] = "footnote-element"
+        new_tag["data-id"] = i + 1
+        new_tag["id"] = f"footnote-{i + 1}"
+        new_tag.string = "*"
         anc_tag.replace_with(new_tag)
 
         # extra digits in footnotes from documents downloaded from livecarta
         a_text = true_a_tag.text
-        if len(cont_tag.find_all('p')):
-            sup = cont_tag.find_all('p')[0].find('sup')
+        if len(cont_tag.find_all("p")):
+            sup = cont_tag.find_all("p")[0].find("sup")
             if sup and sup.text == a_text:
                 sup.decompose()
 
-        for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
+        for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}):
             tag_a.decompose()
 
         # remove font-size
-        for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
-            style = span.get('style')
+        for span in cont_tag.find_all("span", {"style": re.compile("font-size")}):
+            style = span.get("style")
             style = re.sub(r"font-size: \d+px", "", style)
-            if style == '':
-                del span.attrs['style']
+            if style == "":
+                del span.attrs["style"]
             else:
-                span.attrs['style'] = style
+                span.attrs["style"] = style
 
-        unicode_string = ''
+        unicode_string = ""
         for child in cont_tag.children:
             if type(child) is NavigableString:
                 continue
-            if child.name == 'blockquote':
+            if child.name == "blockquote":
                 unicode_string += str(child)
             else:
                 unicode_string += child.decode_contents()
diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py
index 0eab671..9c5fdab 100644
--- a/src/docx_converter/image_processing.py
+++ b/src/docx_converter/image_processing.py
@@ -10,23 +10,23 @@ def process_images(access, html_path, book_id, body_tag):
     For now images are moved to one folder.
 
     """
-    img_tags = body_tag.find_all('img')
+    img_tags = body_tag.find_all("img")
     for img in img_tags:
-        img_name = img.attrs.get('src')
+        img_name = img.attrs.get("src")
         # quick fix for bad links
-        if (len(img_name) >= 3) and img_name[:3] == '../':
+        if (len(img_name) >= 3) and img_name[:3] == "../":
             img_name = img_name[3:]
-        img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
+        img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}")
 
         if access is not None:
             link = access.send_image(img_path, doc_id=book_id)
-            img.attrs['src'] = link
+            img.attrs["src"] = link
         else:
             if img_tags.index(img) == 0:
                 folder_path = os.path.dirname(
                     os.path.dirname(os.path.abspath(__file__)))
                 new_path = pathlib.Path(os.path.join(
-                    folder_path, f'../books/json/img_{book_id}/'))
+                    folder_path, f"../books/json/img_{book_id}/"))
                 new_path.mkdir(exist_ok=True)
             new_img_path = new_path / img_name
             copyfile(img_path, new_img_path)
diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py
index 0cd92fa..eb5f0a2 100644
--- a/src/docx_converter/libre_html2json_converter.py
+++ b/src/docx_converter/libre_html2json_converter.py
@@ -29,7 +29,7 @@ class LibreHTML2JSONConverter:
             cleaned text
 
         """
-        new_text = re.sub(r'([\n\t])', ' ', html_text)
+        new_text = re.sub(r"([\n\t])", " ", html_text)
         return new_text
 
     # TODO: rethink the function structure without indexes.
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter:
         """
         if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
             title = str(self.content[ind])
-            title = title.replace(f'<{self.content[ind].name}>', '')
-            title = title.replace(f'</{self.content[ind].name}>', '')
-            title = re.sub(r'^\n', '', title)
+            title = title.replace(f"<{self.content[ind].name}>", "")
+            title = title.replace(f"</{self.content[ind].name}>", "")
+            title = re.sub(r"^\n", "", title)
 
             # extract outline from tag
             curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
             result = {
-                'title': f'{title}',
-                'contents': [],
-                'sub_items': []
+                "title": f"{title}",
+                "contents": [],
+                "sub_items": []
             }
             ch_content = []
             ind += 1
@@ -71,9 +71,9 @@ class LibreHTML2JSONConverter:
                         header_dict, ind = self.header_to_livecarta_chapter_item(
                             ind)
                         if ch_content:
-                            result['contents'].append("".join(ch_content))
+                            result["contents"].append("".join(ch_content))
                         ch_content = []
-                        result['sub_items'].append(header_dict)
+                        result["sub_items"].append(header_dict)
                     # - current h_i <= h_initial, end of recursion
                     else:
                         # return result, ind
@@ -85,21 +85,21 @@ class LibreHTML2JSONConverter:
                     ind += 1
 
             if ch_content:
-                result['contents'].append("".join(ch_content))
+                result["contents"].append("".join(ch_content))
             return result, ind
-        return ''
+        return ""
 
     @staticmethod
     def _is_empty_p_tag(tag):
-        if tag.name != 'p':
+        if tag.name != "p":
             return False
 
         temp_tag = copy(tag)
-        brs = temp_tag.find_all('br')
+        brs = temp_tag.find_all("br")
         for br in brs:
             br.decompose()
 
-        text = re.sub(r'\s+', '', temp_tag.text)
+        text = re.sub(r"\s+", "", temp_tag.text)
         if text:
             return False
 
@@ -117,7 +117,7 @@ class LibreHTML2JSONConverter:
                     res, ind = self.header_to_livecarta_chapter_item(ind)
 
                 else:
-                    chapter_title = f'Untitled chapter {ch_num}'
+                    chapter_title = f"Untitled chapter {ch_num}"
                     chapter = []
                     while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
                         if not self._is_empty_p_tag(self.content[ind]):
@@ -126,9 +126,9 @@ class LibreHTML2JSONConverter:
                         ind += 1
                     if chapter:
                         res = {
-                            'title': chapter_title,
-                            'contents': ["".join(chapter)],
-                            'sub_items': []
+                            "title": chapter_title,
+                            "contents": ["".join(chapter)],
+                            "sub_items": []
                         }
                         ch_num += 1
 
@@ -136,10 +136,10 @@ class LibreHTML2JSONConverter:
                     json_strc.append(res)
                     ch_amt += 1
                     self.logger_object.log(
-                        f'Chapter {ch_amt} has been added to structure.')
+                        f"Chapter {ch_amt} has been added to structure.")
         except Exception as exc:
             self.logger_object.log(
-                'Error has occurred while making json structure.', logging.ERROR)
+                "Error has occurred while making json structure.", logging.ERROR)
             self.logger_object.log_error_to_main_log()
             if self.book_api_status:
                 self.book_api_status.set_error()
@@ -148,10 +148,10 @@ class LibreHTML2JSONConverter:
         # Add is_introduction field to json structure
         # after deleting content before toc, some chapters can be deleted
         if self.top_level_headers:
-            same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
-            is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
+            same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
+            is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]
 
-            json_strc[0]['is_introduction'] = is_first_header_introduction
+            json_strc[0]["is_introduction"] = is_first_header_introduction
 
         self.content_dict = {
             "content": json_strc,
diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py
index 4a09481..b8bccf2 100644
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -633,7 +633,7 @@ class EpubConverter:
 
 
 if __name__ == "__main__":
-    epub_file_path = "../../epub/9780763774134.epub"
+    epub_file_path = "../../books/epub/9780763774134.epub"
     logger_object = BookLogger(
         name="epub", book_id=epub_file_path.split("/")[-1])
 
diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py
index f82f073..34cd1fb 100644
--- a/src/epub_converter/footnotes_processing.py
+++ b/src/epub_converter/footnotes_processing.py
@@ -72,7 +72,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
 
         expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
         footnote_tag = expected_footnote_tags[0]
-        if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
+        if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
             footnote_tag = footnote_tag.parent
         new_noterefs_tags.append(
             _replace_with_livecarta_anchor_tag(noteref_tag, i))
@@ -80,7 +80,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
         # footnote_tag.decompose()
         footnotes.append(content)
         footnote_tag = footnote_tag.find(
-            attrs={"role": "doc-backlink"}) or footnote_tag
+            attrs={"role": "doc-backlink"}) or footnote_tag
         new_footnotes_tags.append(footnote_tag)
 
     for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):
diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py
index e568aaa..6f35c3a 100644
--- a/src/epub_converter/image_processing.py
+++ b/src/epub_converter/image_processing.py
@@ -16,7 +16,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
     """Function saves all images locally"""
     folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     new_path = pathlib.Path(os.path.join(
-        folder_path, f"../json/img_{book_id}/"))
+        folder_path, f"../books/json/img_{book_id}/"))
     new_path.mkdir(exist_ok=True)
 
     new_img_path = new_path / os.path.basename(img_file_path)