Change paths to books

2022-07-27 20:44:19 +03:00
parent 253c4ebe26
commit af466cbc27
9 changed files with 62 additions and 60 deletions
--- a/consumer.py
+++ b/consumer.py
@@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+",
 def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict):
    logger.info(f"Start processing book-{book_id}.")
    try:
-        json_file_path = "json/9781614382264.json"
+        json_file_path = "books/json/9781614382264.json"
        book = book_type(book_id=book_id, main_logger=logger, **params)
        book.conversion_local(json_file_path)
    except Exception as exc:
--- a/presets/.gitignore
+++ b/presets/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -14,7 +14,7 @@ class DocxBook(BookSolver):

    def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None):
        super().__init__(book_id, access, main_logger)
-        self.book_type = 'docx'
+        self.book_type = "docx"
        # critical section for occupying libreoffice by one thread
        self.libre_locker: Event() = libre_locker

@@ -53,9 +53,9 @@ class DocxBook(BookSolver):


 if __name__ == "__main__":
-    docx_file_path = '../../books/docx/music_inquiry.docx'
+    docx_file_path = "../../books/docx/music_inquiry.docx"
    logger_object = BookLogger(
-        name='docx', book_id=docx_file_path.split('/')[-1])
+        name="docx", book_id=docx_file_path.split("/")[-1])
    locker = Event()
    locker.set()

@@ -70,5 +70,5 @@ if __name__ == "__main__":
        content, footnotes, top_level_headers, logger_object)
    content_dict = json_converter.convert_to_dict()

-    with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f:
+    with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f:
        json.dump(content_dict, f, ensure_ascii=False)
--- a/src/docx_converter/footnotes_processing.py
+++ b/src/docx_converter/footnotes_processing.py
@@ -9,58 +9,58 @@ def _clean_footnote_content(content):

 def process_footnotes(body_tag):
    """Function returns list of footnotes and delete them from html_soup."""
-    footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
+    footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc")
    footnote_content = body_tag.find_all(
-        'div', id=re.compile(r'^sdfootnote\d+$'))
+        "div", id=re.compile(r"^sdfootnote\d+$"))
    footnote_amt = len(footnote_anchors)

    assert footnote_amt == len(footnote_content), \
-        'Something went wrong with footnotes after libre conversion'
+        "Something went wrong with footnotes after libre conversion"

    footnotes = []

    for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
        true_a_tag = cont_tag.find_all(
-            'a', class_=re.compile(r'^sdfootnote.+$'))[0]
+            "a", class_=re.compile(r"^sdfootnote.+$"))[0]

-        if true_a_tag.attrs.get('href') is None:
+        if true_a_tag.attrs.get("href") is None:
            cont_tag.a.decompose()
            continue

-        assert anc_tag['name'] == true_a_tag['href'][1:], \
-            'Something went wrong with footnotes after libre conversion'
+        assert anc_tag["name"] == true_a_tag["href"][1:], \
+            "Something went wrong with footnotes after libre conversion"

-        new_tag = BeautifulSoup(features='lxml').new_tag('sup')
-        new_tag['class'] = 'footnote-element'
-        new_tag['data-id'] = i + 1
-        new_tag['id'] = f'footnote-{i + 1}'
-        new_tag.string = '*'
+        new_tag = BeautifulSoup(features="lxml").new_tag("sup")
+        new_tag["class"] = "footnote-element"
+        new_tag["data-id"] = i + 1
+        new_tag["id"] = f"footnote-{i + 1}"
+        new_tag.string = "*"
        anc_tag.replace_with(new_tag)

        # extra digits in footnotes from documents downloaded from livecarta
        a_text = true_a_tag.text
-        if len(cont_tag.find_all('p')):
-            sup = cont_tag.find_all('p')[0].find('sup')
+        if len(cont_tag.find_all("p")):
+            sup = cont_tag.find_all("p")[0].find("sup")
            if sup and sup.text == a_text:
                sup.decompose()

-        for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
+        for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}):
            tag_a.decompose()

        # remove font-size
-        for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
-            style = span.get('style')
+        for span in cont_tag.find_all("span", {"style": re.compile("font-size")}):
+            style = span.get("style")
            style = re.sub(r"font-size: \d+px", "", style)
-            if style == '':
-                del span.attrs['style']
+            if style == "":
+                del span.attrs["style"]
            else:
-                span.attrs['style'] = style
+                span.attrs["style"] = style

-        unicode_string = ''
+        unicode_string = ""
        for child in cont_tag.children:
            if type(child) is NavigableString:
                continue
-            if child.name == 'blockquote':
+            if child.name == "blockquote":
                unicode_string += str(child)
            else:
                unicode_string += child.decode_contents()
--- a/src/docx_converter/image_processing.py
+++ b/src/docx_converter/image_processing.py
@@ -10,23 +10,23 @@ def process_images(access, html_path, book_id, body_tag):
    For now images are moved to one folder.

    """
-    img_tags = body_tag.find_all('img')
+    img_tags = body_tag.find_all("img")
    for img in img_tags:
-        img_name = img.attrs.get('src')
+        img_name = img.attrs.get("src")
        # quick fix for bad links
-        if (len(img_name) >= 3) and img_name[:3] == '../':
+        if (len(img_name) >= 3) and img_name[:3] == "../":
            img_name = img_name[3:]
-        img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
+        img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}")

        if access is not None:
            link = access.send_image(img_path, doc_id=book_id)
-            img.attrs['src'] = link
+            img.attrs["src"] = link
        else:
            if img_tags.index(img) == 0:
                folder_path = os.path.dirname(
                    os.path.dirname(os.path.abspath(__file__)))
                new_path = pathlib.Path(os.path.join(
-                    folder_path, f'../books/json/img_{book_id}/'))
+                    folder_path, f"../books/json/img_{book_id}/"))
                new_path.mkdir(exist_ok=True)
            new_img_path = new_path / img_name
            copyfile(img_path, new_img_path)
--- a/src/docx_converter/libre_html2json_converter.py
+++ b/src/docx_converter/libre_html2json_converter.py
@@ -29,7 +29,7 @@ class LibreHTML2JSONConverter:
            cleaned text

        """
-        new_text = re.sub(r'([\n\t])', ' ', html_text)
+        new_text = re.sub(r"([\n\t])", " ", html_text)
        return new_text

    # TODO: rethink the function structure without indexes.
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter:
        """
        if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
            title = str(self.content[ind])
-            title = title.replace(f'<{self.content[ind].name}>', '')
-            title = title.replace(f'</{self.content[ind].name}>', '')
-            title = re.sub(r'^\n', '', title)
+            title = title.replace(f"<{self.content[ind].name}>", "")
+            title = title.replace(f"</{self.content[ind].name}>", "")
+            title = re.sub(r"^\n", "", title)

            # extract outline from tag
            curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
            result = {
-                'title': f'{title}',
-                'contents': [],
-                'sub_items': []
+                "title": f"{title}",
+                "contents": [],
+                "sub_items": []
            }
            ch_content = []
            ind += 1
@@ -71,9 +71,9 @@ class LibreHTML2JSONConverter:
                        header_dict, ind = self.header_to_livecarta_chapter_item(
                            ind)
                        if ch_content:
-                            result['contents'].append("".join(ch_content))
+                            result["contents"].append("".join(ch_content))
                        ch_content = []
-                        result['sub_items'].append(header_dict)
+                        result["sub_items"].append(header_dict)
                    # - current h_i <= h_initial, end of recursion
                    else:
                        # return result, ind
@@ -85,21 +85,21 @@ class LibreHTML2JSONConverter:
                    ind += 1

            if ch_content:
-                result['contents'].append("".join(ch_content))
+                result["contents"].append("".join(ch_content))
            return result, ind
-        return ''
+        return ""

    @staticmethod
    def _is_empty_p_tag(tag):
-        if tag.name != 'p':
+        if tag.name != "p":
            return False

        temp_tag = copy(tag)
-        brs = temp_tag.find_all('br')
+        brs = temp_tag.find_all("br")
        for br in brs:
            br.decompose()

-        text = re.sub(r'\s+', '', temp_tag.text)
+        text = re.sub(r"\s+", "", temp_tag.text)
        if text:
            return False

@@ -117,7 +117,7 @@ class LibreHTML2JSONConverter:
                    res, ind = self.header_to_livecarta_chapter_item(ind)

                else:
-                    chapter_title = f'Untitled chapter {ch_num}'
+                    chapter_title = f"Untitled chapter {ch_num}"
                    chapter = []
                    while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
@@ -126,9 +126,9 @@ class LibreHTML2JSONConverter:
                        ind += 1
                    if chapter:
                        res = {
-                            'title': chapter_title,
-                            'contents': ["".join(chapter)],
-                            'sub_items': []
+                            "title": chapter_title,
+                            "contents": ["".join(chapter)],
+                            "sub_items": []
                        }
                        ch_num += 1

@@ -136,10 +136,10 @@ class LibreHTML2JSONConverter:
                    json_strc.append(res)
                    ch_amt += 1
                    self.logger_object.log(
-                        f'Chapter {ch_amt} has been added to structure.')
+                        f"Chapter {ch_amt} has been added to structure.")
        except Exception as exc:
            self.logger_object.log(
-                'Error has occurred while making json structure.', logging.ERROR)
+                "Error has occurred while making json structure.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.book_api_status:
                self.book_api_status.set_error()
@@ -148,10 +148,10 @@ class LibreHTML2JSONConverter:
        # Add is_introduction field to json structure
        # after deleting content before toc, some chapters can be deleted
        if self.top_level_headers:
-            same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title']
-            is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered']
+            same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"]
+            is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"]

-            json_strc[0]['is_introduction'] = is_first_header_introduction
+            json_strc[0]["is_introduction"] = is_first_header_introduction

        self.content_dict = {
            "content": json_strc,
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -633,7 +633,7 @@ class EpubConverter:


 if __name__ == "__main__":
-    epub_file_path = "../../epub/9780763774134.epub"
+    epub_file_path = "../../books/epub/9780763774134.epub"
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])

--- a/src/epub_converter/footnotes_processing.py
+++ b/src/epub_converter/footnotes_processing.py
@@ -72,7 +72,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note

        expected_footnote_tags = verify_footnote_tag(expected_footnote_tags)
        footnote_tag = expected_footnote_tags[0]
-        if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
+        if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote":
            footnote_tag = footnote_tag.parent
        new_noterefs_tags.append(
            _replace_with_livecarta_anchor_tag(noteref_tag, i))
@@ -80,7 +80,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
        # footnote_tag.decompose()
        footnotes.append(content)
        footnote_tag = footnote_tag.find(
-            attrs={"role": "doc-backlink"}) or footnote_tag
+            attrs={"role": "doc-backlink"}) or footnote_tag
        new_footnotes_tags.append(footnote_tag)

    for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)):
--- a/src/epub_converter/image_processing.py
+++ b/src/epub_converter/image_processing.py
@@ -16,7 +16,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
    """Function saves all images locally"""
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    new_path = pathlib.Path(os.path.join(
-        folder_path, f"../json/img_{book_id}/"))
+        folder_path, f"../books/json/img_{book_id}/"))
    new_path.mkdir(exist_ok=True)

    new_img_path = new_path / os.path.basename(img_file_path)