forked from LiveCarta/BookConverter
minor fixes in book.py
This commit is contained in:
51
src/book.py
51
src/book.py
@@ -1,16 +1,12 @@
|
||||
import argparse
|
||||
import codecs
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from shutil import copyfile
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# from src.header_detection import HeaderDetector
|
||||
|
||||
|
||||
class Book:
|
||||
# Main constant values
|
||||
@@ -110,8 +106,9 @@ class Book:
|
||||
Method for converting a .docx document to an .html file.
|
||||
"""
|
||||
self.log(f'File - {self.file_path}.')
|
||||
print(self.file_path)
|
||||
print(f'{self.file_path}')
|
||||
self.log('Beginning of conversion from .docx to .html.')
|
||||
|
||||
try:
|
||||
f = open(self.file_path)
|
||||
f.close()
|
||||
@@ -131,7 +128,7 @@ class Book:
|
||||
self.logger.error('Conversion has gone wrong.')
|
||||
raise e
|
||||
|
||||
self.log("End of conversion from .docx to .html.")
|
||||
self.log('End of conversion from .docx to .html.')
|
||||
self.log(f'Input file path after conversion: {self.file_path}.')
|
||||
|
||||
def check_output_directory(self):
|
||||
@@ -140,15 +137,10 @@ class Book:
|
||||
self.output_path = f'json/{filename}'
|
||||
|
||||
self.output_path = pathlib.Path(self.output_path)
|
||||
self.logger.info(f'Output file path: {self.output_path}')
|
||||
self.log(f'Output file path: {self.output_path}')
|
||||
|
||||
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
self.output_path.touch(exist_ok=self.recreate)
|
||||
except FileExistsError as e:
|
||||
self.logger.error('Output file already exists! '
|
||||
'Either change the name of output file or use --recreate switch.')
|
||||
raise e
|
||||
self.output_path.touch(exist_ok=True)
|
||||
|
||||
def read_html(self):
|
||||
"""
|
||||
@@ -163,10 +155,6 @@ class Book:
|
||||
self.html_soup = BeautifulSoup(html_text, features='lxml')
|
||||
self.body_tag = self.html_soup.body
|
||||
|
||||
# head_tag = self.html_soup.head
|
||||
# styles = parse_styles(head_tag.style)
|
||||
# head_tag.decompose()
|
||||
|
||||
def _clean_tag(self, tag, attr_name, attr_value):
|
||||
"""
|
||||
Function to clean tags by its name and attribute value.
|
||||
@@ -307,13 +295,13 @@ class Book:
|
||||
"""
|
||||
Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
|
||||
"""
|
||||
dls = self.body_tag.find_all("dl")
|
||||
dls = self.body_tag.find_all('dl')
|
||||
|
||||
for dl in dls:
|
||||
pars = dl.find_all("p")
|
||||
pars = dl.find_all('p')
|
||||
for p in pars:
|
||||
p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
|
||||
new_div = BeautifulSoup(features="lxml").new_tag("div")
|
||||
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
|
||||
new_div = BeautifulSoup(features='lxml').new_tag('div')
|
||||
for p in pars:
|
||||
new_div.append(p.parent)
|
||||
dl.replaceWith(new_div)
|
||||
@@ -328,8 +316,8 @@ class Book:
|
||||
"""
|
||||
Function returns the list of footnotes and deletes them from html_soup.
|
||||
"""
|
||||
footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc")
|
||||
footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$"))
|
||||
footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
||||
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
|
||||
footnote_amt = len(footnote_ancors)
|
||||
|
||||
assert footnote_amt == len(footnote_content)
|
||||
@@ -338,7 +326,7 @@ class Book:
|
||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
|
||||
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
|
||||
|
||||
new_tag = BeautifulSoup(features="lxml").new_tag('sup')
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||
new_tag['class'] = 'footnote-element'
|
||||
new_tag['data-id'] = i+1
|
||||
new_tag['id'] = f'footnote-{i+1}'
|
||||
@@ -363,7 +351,7 @@ class Book:
|
||||
Function to process <img> tag. Img should be sent to Amazon S3 and then return new tag with valid link.
|
||||
For now images are moved to one folder.
|
||||
"""
|
||||
imgs = self.body_tag.find_all("img")
|
||||
imgs = self.body_tag.find_all('img')
|
||||
|
||||
if len(imgs):
|
||||
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
|
||||
@@ -518,6 +506,7 @@ class Book:
|
||||
result = {title: []}
|
||||
ch_content = []
|
||||
ind += 1
|
||||
|
||||
while ind < len(self.content):
|
||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||
@@ -535,6 +524,7 @@ class Book:
|
||||
# result[title].append(res)
|
||||
ch_content.append(res)
|
||||
ind += 1
|
||||
|
||||
if ch_content:
|
||||
result[title].append("".join(ch_content))
|
||||
return result, ind
|
||||
@@ -547,6 +537,7 @@ class Book:
|
||||
json_strc = []
|
||||
ind = 0
|
||||
ch_num = 0
|
||||
|
||||
while ind < len(self.content):
|
||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||
res, ind = self.header_to_json(ind)
|
||||
@@ -584,13 +575,3 @@ class Book:
|
||||
self.convert_to_json()
|
||||
self.write_json()
|
||||
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
|
||||
|
||||
book = Book(file_path="", recreate=True)
|
||||
book.parse_args()
|
||||
book.conversion(logging_format)
|
||||
|
||||
print('Script has finished.')
|
||||
|
||||
Reference in New Issue
Block a user