From b1ccd796c9cbd6f580ac4e9cf50ac679f757b3dc Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:13:34 +0300 Subject: [PATCH] Set up local docx_converter --- src/docx_converter/docx_solver.py | 11 +++++++---- src/epub_converter/footnotes_processing.py | 4 +--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index b4aa9b3..9f1735b 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -35,7 +35,7 @@ class DocxBook(BookSolver): """ # 1. Converts docx to html with LibreOffice html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, - self.logger_object, self.status_wrapper, self.libre_locker) + self.logger_object, self.libre_locker) # TODO presets # 2. Parses and cleans html, gets list of tags, gets footnotes @@ -46,7 +46,7 @@ class DocxBook(BookSolver): # 3. Parses from line structure to nested structure with JSONConverter json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers, - self.logger_object, self.status_wrapper) + self.logger_object) content_dict = json_converter.convert_to_dict() return content_dict @@ -56,12 +56,15 @@ if __name__ == "__main__": docx_file_path = '../../docx/music_inquiry.docx' logger_object = BookLogger( name='docx', book_id=docx_file_path.split('/')[-1]) + locker = Event() + locker.set() - html_converter = Docx2LibreHTML(file_path=docx_file_path) + html_converter = Docx2LibreHTML(file_path=docx_file_path, + logger=logger_object, libre_locker=locker) parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) content, footnotes, top_level_headers = parser.process_html( - html_converter.html_path) + html_path=html_converter.html_path, book_id=html_converter.book_id) json_converter = LibreHTML2JSONConverter( content, footnotes, top_level_headers, logger_object) diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index ae568e0..f82f073 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -26,15 +26,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note """ - footnotes = [] + footnotes, new_noterefs_tags, new_footnotes_tags = [], [], [] noterefs_tags = source_html_tag.find_all( attrs={noteref_attr_name: "noteref"}) bad_noterefs_tags = set( [tag for tag in noterefs_tags if not tag.attrs.get("href")]) noterefs_tags = [ tag for tag in noterefs_tags if tag not in bad_noterefs_tags] - new_noterefs_tags = [] - new_footnotes_tags = [] [tag.decompose() for tag in bad_noterefs_tags] def parse_a_tag_href(s: str) -> Tuple[str, str]: