forked from LiveCarta/BookConverter
Set up local docx_converter
This commit is contained in:
@@ -35,7 +35,7 @@ class DocxBook(BookSolver):
|
|||||||
"""
|
"""
|
||||||
# 1. Converts docx to html with LibreOffice
|
# 1. Converts docx to html with LibreOffice
|
||||||
html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
|
html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
|
||||||
self.logger_object, self.status_wrapper, self.libre_locker)
|
self.logger_object, self.libre_locker)
|
||||||
# TODO presets
|
# TODO presets
|
||||||
|
|
||||||
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
||||||
@@ -46,7 +46,7 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
# 3. Parses from line structure to nested structure with JSONConverter
|
# 3. Parses from line structure to nested structure with JSONConverter
|
||||||
json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
|
json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
|
||||||
self.logger_object, self.status_wrapper)
|
self.logger_object)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
return content_dict
|
return content_dict
|
||||||
@@ -56,12 +56,15 @@ if __name__ == "__main__":
|
|||||||
docx_file_path = '../../docx/music_inquiry.docx'
|
docx_file_path = '../../docx/music_inquiry.docx'
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name='docx', book_id=docx_file_path.split('/')[-1])
|
name='docx', book_id=docx_file_path.split('/')[-1])
|
||||||
|
locker = Event()
|
||||||
|
locker.set()
|
||||||
|
|
||||||
html_converter = Docx2LibreHTML(file_path=docx_file_path)
|
html_converter = Docx2LibreHTML(file_path=docx_file_path,
|
||||||
|
logger=logger_object, libre_locker=locker)
|
||||||
|
|
||||||
parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
|
parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
|
||||||
content, footnotes, top_level_headers = parser.process_html(
|
content, footnotes, top_level_headers = parser.process_html(
|
||||||
html_converter.html_path)
|
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
||||||
|
|
||||||
json_converter = LibreHTML2JSONConverter(
|
json_converter = LibreHTML2JSONConverter(
|
||||||
content, footnotes, top_level_headers, logger_object)
|
content, footnotes, top_level_headers, logger_object)
|
||||||
|
|||||||
@@ -26,15 +26,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
||||||
|
|
||||||
"""
|
"""
|
||||||
footnotes = []
|
footnotes, new_noterefs_tags, new_footnotes_tags = [], [], []
|
||||||
noterefs_tags = source_html_tag.find_all(
|
noterefs_tags = source_html_tag.find_all(
|
||||||
attrs={noteref_attr_name: "noteref"})
|
attrs={noteref_attr_name: "noteref"})
|
||||||
bad_noterefs_tags = set(
|
bad_noterefs_tags = set(
|
||||||
[tag for tag in noterefs_tags if not tag.attrs.get("href")])
|
[tag for tag in noterefs_tags if not tag.attrs.get("href")])
|
||||||
noterefs_tags = [
|
noterefs_tags = [
|
||||||
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
|
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
|
||||||
new_noterefs_tags = []
|
|
||||||
new_footnotes_tags = []
|
|
||||||
[tag.decompose() for tag in bad_noterefs_tags]
|
[tag.decompose() for tag in bad_noterefs_tags]
|
||||||
|
|
||||||
def parse_a_tag_href(s: str) -> Tuple[str, str]:
|
def parse_a_tag_href(s: str) -> Tuple[str, str]:
|
||||||
|
|||||||
Reference in New Issue
Block a user