forked from LiveCarta/BookConverter
minor fixes in book.py
This commit is contained in:
51
src/book.py
51
src/book.py
@@ -1,16 +1,12 @@
|
|||||||
import argparse
|
|
||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
from shutil import copyfile
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
# from src.header_detection import HeaderDetector
|
|
||||||
|
|
||||||
|
|
||||||
class Book:
|
class Book:
|
||||||
# Main constant values
|
# Main constant values
|
||||||
@@ -110,8 +106,9 @@ class Book:
|
|||||||
Method for converting .docx document to .html file.
|
Method for converting .docx document to .html file.
|
||||||
"""
|
"""
|
||||||
self.log(f'File - {self.file_path}.')
|
self.log(f'File - {self.file_path}.')
|
||||||
print(self.file_path)
|
print(f'{self.file_path}')
|
||||||
self.log('Beginning of conversion from .docx to .html.')
|
self.log('Beginning of conversion from .docx to .html.')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
f = open(self.file_path)
|
f = open(self.file_path)
|
||||||
f.close()
|
f.close()
|
||||||
@@ -131,7 +128,7 @@ class Book:
|
|||||||
self.logger.error('Conversion has gone wrong.')
|
self.logger.error('Conversion has gone wrong.')
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
self.log("End of conversion from .docx to .html.")
|
self.log('End of conversion from .docx to .html.')
|
||||||
self.log(f'Input file path after conversion: {self.file_path}.')
|
self.log(f'Input file path after conversion: {self.file_path}.')
|
||||||
|
|
||||||
def check_output_directory(self):
|
def check_output_directory(self):
|
||||||
@@ -140,15 +137,10 @@ class Book:
|
|||||||
self.output_path = f'json/{filename}'
|
self.output_path = f'json/{filename}'
|
||||||
|
|
||||||
self.output_path = pathlib.Path(self.output_path)
|
self.output_path = pathlib.Path(self.output_path)
|
||||||
self.logger.info(f'Output file path: {self.output_path}')
|
self.log(f'Output file path: {self.output_path}')
|
||||||
|
|
||||||
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
|
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
try:
|
self.output_path.touch(exist_ok=True)
|
||||||
self.output_path.touch(exist_ok=self.recreate)
|
|
||||||
except FileExistsError as e:
|
|
||||||
self.logger.error('Output file already exists! '
|
|
||||||
'Either change the name of output file or use --recreate switch.')
|
|
||||||
raise e
|
|
||||||
|
|
||||||
def read_html(self):
|
def read_html(self):
|
||||||
"""
|
"""
|
||||||
@@ -163,10 +155,6 @@ class Book:
|
|||||||
self.html_soup = BeautifulSoup(html_text, features='lxml')
|
self.html_soup = BeautifulSoup(html_text, features='lxml')
|
||||||
self.body_tag = self.html_soup.body
|
self.body_tag = self.html_soup.body
|
||||||
|
|
||||||
# head_tag = self.html_soup.head
|
|
||||||
# styles = parse_styles(head_tag.style)
|
|
||||||
# head_tag.decompose()
|
|
||||||
|
|
||||||
def _clean_tag(self, tag, attr_name, attr_value):
|
def _clean_tag(self, tag, attr_name, attr_value):
|
||||||
"""
|
"""
|
||||||
Function to clean tags by its name and attribute value.
|
Function to clean tags by its name and attribute value.
|
||||||
@@ -307,13 +295,13 @@ class Book:
|
|||||||
"""
|
"""
|
||||||
Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
|
Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
|
||||||
"""
|
"""
|
||||||
dls = self.body_tag.find_all("dl")
|
dls = self.body_tag.find_all('dl')
|
||||||
|
|
||||||
for dl in dls:
|
for dl in dls:
|
||||||
pars = dl.find_all("p")
|
pars = dl.find_all('p')
|
||||||
for p in pars:
|
for p in pars:
|
||||||
p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
|
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
|
||||||
new_div = BeautifulSoup(features="lxml").new_tag("div")
|
new_div = BeautifulSoup(features='lxml').new_tag('div')
|
||||||
for p in pars:
|
for p in pars:
|
||||||
new_div.append(p.parent)
|
new_div.append(p.parent)
|
||||||
dl.replaceWith(new_div)
|
dl.replaceWith(new_div)
|
||||||
@@ -328,8 +316,8 @@ class Book:
|
|||||||
"""
|
"""
|
||||||
Function returns list of footnotes and deletes them from html_soup.
|
Function returns list of footnotes and deletes them from html_soup.
|
||||||
"""
|
"""
|
||||||
footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc")
|
footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
||||||
footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$"))
|
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
|
||||||
footnote_amt = len(footnote_ancors)
|
footnote_amt = len(footnote_ancors)
|
||||||
|
|
||||||
assert footnote_amt == len(footnote_content)
|
assert footnote_amt == len(footnote_content)
|
||||||
@@ -338,7 +326,7 @@ class Book:
|
|||||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
|
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
|
||||||
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
|
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
|
||||||
|
|
||||||
new_tag = BeautifulSoup(features="lxml").new_tag('sup')
|
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||||
new_tag['class'] = 'footnote-element'
|
new_tag['class'] = 'footnote-element'
|
||||||
new_tag['data-id'] = i+1
|
new_tag['data-id'] = i+1
|
||||||
new_tag['id'] = f'footnote-{i+1}'
|
new_tag['id'] = f'footnote-{i+1}'
|
||||||
@@ -363,7 +351,7 @@ class Book:
|
|||||||
Function to process <img> tag. Img should be sent to Amazon S3 and then return new tag with valid link.
|
Function to process <img> tag. Img should be sent to Amazon S3 and then return new tag with valid link.
|
||||||
For now images are moved to one folder.
|
For now images are moved to one folder.
|
||||||
"""
|
"""
|
||||||
imgs = self.body_tag.find_all("img")
|
imgs = self.body_tag.find_all('img')
|
||||||
|
|
||||||
if len(imgs):
|
if len(imgs):
|
||||||
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
|
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
|
||||||
@@ -518,6 +506,7 @@ class Book:
|
|||||||
result = {title: []}
|
result = {title: []}
|
||||||
ch_content = []
|
ch_content = []
|
||||||
ind += 1
|
ind += 1
|
||||||
|
|
||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||||
@@ -535,6 +524,7 @@ class Book:
|
|||||||
# result[title].append(res)
|
# result[title].append(res)
|
||||||
ch_content.append(res)
|
ch_content.append(res)
|
||||||
ind += 1
|
ind += 1
|
||||||
|
|
||||||
if ch_content:
|
if ch_content:
|
||||||
result[title].append("".join(ch_content))
|
result[title].append("".join(ch_content))
|
||||||
return result, ind
|
return result, ind
|
||||||
@@ -547,6 +537,7 @@ class Book:
|
|||||||
json_strc = []
|
json_strc = []
|
||||||
ind = 0
|
ind = 0
|
||||||
ch_num = 0
|
ch_num = 0
|
||||||
|
|
||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||||
res, ind = self.header_to_json(ind)
|
res, ind = self.header_to_json(ind)
|
||||||
@@ -584,13 +575,3 @@ class Book:
|
|||||||
self.convert_to_json()
|
self.convert_to_json()
|
||||||
self.write_json()
|
self.write_json()
|
||||||
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
|
|
||||||
|
|
||||||
book = Book(file_path="", recreate=True)
|
|
||||||
book.parse_args()
|
|
||||||
book.conversion(logging_format)
|
|
||||||
|
|
||||||
print('Script has finished.')
|
|
||||||
|
|||||||
Reference in New Issue
Block a user