From af466cbc27d2f43af6b317bf6a5e13b8881d2055 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:44:19 +0300 Subject: [PATCH] Change paths to books --- consumer.py | 2 +- presets/.gitignore | 2 + src/docx_converter/docx_solver.py | 8 ++-- src/docx_converter/footnotes_processing.py | 44 +++++++++--------- src/docx_converter/image_processing.py | 12 ++--- .../libre_html2json_converter.py | 46 +++++++++---------- src/epub_converter/epub_converter.py | 2 +- src/epub_converter/footnotes_processing.py | 4 +- src/epub_converter/image_processing.py | 2 +- 9 files changed, 62 insertions(+), 60 deletions(-) create mode 100644 presets/.gitignore diff --git a/consumer.py b/consumer.py index 095facf..dfa0b16 100644 --- a/consumer.py +++ b/consumer.py @@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+", def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict): logger.info(f"Start processing book-{book_id}.") try: - json_file_path = "json/9781614382264.json" + json_file_path = "books/json/9781614382264.json" book = book_type(book_id=book_id, main_logger=logger, **params) book.conversion_local(json_file_path) except Exception as exc: diff --git a/presets/.gitignore b/presets/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/presets/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 6260edb..5edeb46 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -14,7 +14,7 @@ class DocxBook(BookSolver): def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None): super().__init__(book_id, access, main_logger) - self.book_type = 'docx' + self.book_type = "docx" # critical section for occupying libreoffice by one thread self.libre_locker: Event() = libre_locker @@ -53,9 +53,9 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path 
= '../../books/docx/music_inquiry.docx' + docx_file_path = "../../books/docx/music_inquiry.docx" logger_object = BookLogger( - name='docx', book_id=docx_file_path.split('/')[-1]) + name="docx", book_id=docx_file_path.split("/")[-1]) locker = Event() locker.set() @@ -70,5 +70,5 @@ if __name__ == "__main__": content, footnotes, top_level_headers, logger_object) content_dict = json_converter.convert_to_dict() - with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f: + with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f: json.dump(content_dict, f, ensure_ascii=False) diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index c269b73..bda6733 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -9,58 +9,58 @@ def _clean_footnote_content(content): def process_footnotes(body_tag): """Function returns list of footnotes and delete them from html_soup.""" - footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc') + footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc") footnote_content = body_tag.find_all( - 'div', id=re.compile(r'^sdfootnote\d+$')) + "div", id=re.compile(r"^sdfootnote\d+$")) footnote_amt = len(footnote_anchors) assert footnote_amt == len(footnote_content), \ - 'Something went wrong with footnotes after libre conversion' + "Something went wrong with footnotes after libre conversion" footnotes = [] for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): true_a_tag = cont_tag.find_all( - 'a', class_=re.compile(r'^sdfootnote.+$'))[0] + "a", class_=re.compile(r"^sdfootnote.+$"))[0] - if true_a_tag.attrs.get('href') is None: + if true_a_tag.attrs.get("href") is None: cont_tag.a.decompose() continue - assert anc_tag['name'] == true_a_tag['href'][1:], \ - 'Something went wrong with footnotes after libre conversion' + assert anc_tag["name"] == 
true_a_tag["href"][1:], \ + "Something went wrong with footnotes after libre conversion" - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' + new_tag = BeautifulSoup(features="lxml").new_tag("sup") + new_tag["class"] = "footnote-element" + new_tag["data-id"] = i + 1 + new_tag["id"] = f"footnote-{i + 1}" + new_tag.string = "*" anc_tag.replace_with(new_tag) # extra digits in footnotes from documents downloaded from livecarta a_text = true_a_tag.text - if len(cont_tag.find_all('p')): - sup = cont_tag.find_all('p')[0].find('sup') + if len(cont_tag.find_all("p")): + sup = cont_tag.find_all("p")[0].find("sup") if sup and sup.text == a_text: sup.decompose() - for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): + for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}): tag_a.decompose() # remove font-size - for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): - style = span.get('style') + for span in cont_tag.find_all("span", {"style": re.compile("font-size")}): + style = span.get("style") style = re.sub(r"font-size: \d+px", "", style) - if style == '': - del span.attrs['style'] + if style == "": + del span.attrs["style"] else: - span.attrs['style'] = style + span.attrs["style"] = style - unicode_string = '' + unicode_string = "" for child in cont_tag.children: if type(child) is NavigableString: continue - if child.name == 'blockquote': + if child.name == "blockquote": unicode_string += str(child) else: unicode_string += child.decode_contents() diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py index 0eab671..9c5fdab 100644 --- a/src/docx_converter/image_processing.py +++ b/src/docx_converter/image_processing.py @@ -10,23 +10,23 @@ def process_images(access, html_path, book_id, body_tag): For now images are moved to one folder. 
""" - img_tags = body_tag.find_all('img') + img_tags = body_tag.find_all("img") for img in img_tags: - img_name = img.attrs.get('src') + img_name = img.attrs.get("src") # quick fix for bad links - if (len(img_name) >= 3) and img_name[:3] == '../': + if (len(img_name) >= 3) and img_name[:3] == "../": img_name = img_name[3:] - img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') + img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}") if access is not None: link = access.send_image(img_path, doc_id=book_id) - img.attrs['src'] = link + img.attrs["src"] = link else: if img_tags.index(img) == 0: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( - folder_path, f'../books/json/img_{book_id}/')) + folder_path, f"../books/json/img_{book_id}/")) new_path.mkdir(exist_ok=True) new_img_path = new_path / img_name copyfile(img_path, new_img_path) diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py index 0cd92fa..eb5f0a2 100644 --- a/src/docx_converter/libre_html2json_converter.py +++ b/src/docx_converter/libre_html2json_converter.py @@ -29,7 +29,7 @@ class LibreHTML2JSONConverter: cleaned text """ - new_text = re.sub(r'([\n\t])', ' ', html_text) + new_text = re.sub(r"([\n\t])", " ", html_text) return new_text # TODO: rethink the function structure without indexes. 
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter: """ if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: title = str(self.content[ind]) - title = title.replace(f'<{self.content[ind].name}>', '') - title = title.replace(f'</{self.content[ind].name}>', '') - title = re.sub(r'^\n', '', title) + title = title.replace(f"<{self.content[ind].name}>", "") + title = title.replace(f"</{self.content[ind].name}>", "") + title = re.sub(r"^\n", "", title) # extract outline from tag curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) result = { - 'title': f'{title}', - 'contents': [], - 'sub_items': [] + "title": f"{title}", + "contents": [], + "sub_items": [] } ch_content = [] ind += 1 @@ -71,9 +71,9 @@ class LibreHTML2JSONConverter: header_dict, ind = self.header_to_livecarta_chapter_item( ind) if ch_content: - result['contents'].append("".join(ch_content)) + result["contents"].append("".join(ch_content)) ch_content = [] - result['sub_items'].append(header_dict) + result["sub_items"].append(header_dict) # - current h_i <= h_initial, end of recursion else: # return result, ind @@ -85,21 +85,21 @@ class LibreHTML2JSONConverter: ind += 1 if ch_content: - result['contents'].append("".join(ch_content)) + result["contents"].append("".join(ch_content)) return result, ind - return '' + return "" @staticmethod def _is_empty_p_tag(tag): - if tag.name != 'p': + if tag.name != "p": return False temp_tag = copy(tag) - brs = temp_tag.find_all('br') + brs = temp_tag.find_all("br") for br in brs: br.decompose() - text = re.sub(r'\s+', '', temp_tag.text) + text = re.sub(r"\s+", "", temp_tag.text) if text: return False @@ -117,7 +117,7 @@ class LibreHTML2JSONConverter: res, ind = self.header_to_livecarta_chapter_item(ind) else: - chapter_title = f'Untitled chapter {ch_num}' + chapter_title = f"Untitled chapter {ch_num}" chapter = [] while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: if not self._is_empty_p_tag(self.content[ind]): @@ -126,9 +126,9 @@ class
LibreHTML2JSONConverter: ind += 1 if chapter: res = { - 'title': chapter_title, - 'contents': ["".join(chapter)], - 'sub_items': [] + "title": chapter_title, + "contents": ["".join(chapter)], + "sub_items": [] } ch_num += 1 @@ -136,10 +136,10 @@ class LibreHTML2JSONConverter: json_strc.append(res) ch_amt += 1 self.logger_object.log( - f'Chapter {ch_amt} has been added to structure.') + f"Chapter {ch_amt} has been added to structure.") except Exception as exc: self.logger_object.log( - 'Error has occurred while making json structure.', logging.ERROR) + "Error has occurred while making json structure.", logging.ERROR) self.logger_object.log_error_to_main_log() if self.book_api_status: self.book_api_status.set_error() @@ -148,10 +148,10 @@ class LibreHTML2JSONConverter: # Add is_introduction field to json structure # after deleting content before toc, some chapters can be deleted if self.top_level_headers: - same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title'] - is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] + same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"] + is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"] - json_strc[0]['is_introduction'] = is_first_header_introduction + json_strc[0]["is_introduction"] = is_first_header_introduction self.content_dict = { "content": json_strc, diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 4a09481..b8bccf2 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -633,7 +633,7 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9780763774134.epub" + epub_file_path = "../../books/epub/9780763774134.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index 
f82f073..34cd1fb 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -72,7 +72,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) footnote_tag = expected_footnote_tags[0] - if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote": + if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote": footnote_tag = footnote_tag.parent new_noterefs_tags.append( _replace_with_livecarta_anchor_tag(noteref_tag, i)) @@ -80,7 +80,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note # footnote_tag.decompose() footnotes.append(content) footnote_tag = footnote_tag.find( - attrs={"role": "doc-backlink"}) or footnote_tag + attrs={"role": "doc-backlink"}) or footnote_tag new_footnotes_tags.append(footnote_tag) for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)): diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index e568aaa..6f35c3a 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -16,7 +16,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): """Function saves all images locally""" folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( - folder_path, f"../json/img_{book_id}/")) + folder_path, f"../books/json/img_{book_id}/")) new_path.mkdir(exist_ok=True) new_img_path = new_path / os.path.basename(img_file_path)