minor fixes in book.py

This commit is contained in:
Jeniamakarchik
2020-02-05 16:40:10 +03:00
parent d951682054
commit d13dab2d9e

View File

@@ -1,16 +1,12 @@
import argparse
import codecs
import json
import logging
import os
import pathlib
import re
from shutil import copyfile
from bs4 import BeautifulSoup
# from src.header_detection import HeaderDetector
class Book:
# Main constant values
@@ -110,8 +106,9 @@ class Book:
Method to convert a .docx document to a .html file.
"""
self.log(f'File - {self.file_path}.')
print(self.file_path)
print(f'{self.file_path}')
self.log('Beginning of conversion from .docx to .html.')
try:
f = open(self.file_path)
f.close()
@@ -131,7 +128,7 @@ class Book:
self.logger.error('Conversion has gone wrong.')
raise e
self.log("End of conversion from .docx to .html.")
self.log('End of conversion from .docx to .html.')
self.log(f'Input file path after conversion: {self.file_path}.')
def check_output_directory(self):
@@ -140,15 +137,10 @@ class Book:
self.output_path = f'json/{filename}'
self.output_path = pathlib.Path(self.output_path)
self.logger.info(f'Output file path: {self.output_path}')
self.log(f'Output file path: {self.output_path}')
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
try:
self.output_path.touch(exist_ok=self.recreate)
except FileExistsError as e:
self.logger.error('Output file already exists! '
'Either change the name of output file or use --recreate switch.')
raise e
self.output_path.touch(exist_ok=True)
def read_html(self):
"""
@@ -163,10 +155,6 @@ class Book:
self.html_soup = BeautifulSoup(html_text, features='lxml')
self.body_tag = self.html_soup.body
# head_tag = self.html_soup.head
# styles = parse_styles(head_tag.style)
# head_tag.decompose()
def _clean_tag(self, tag, attr_name, attr_value):
"""
Function to clean tags by their name and attribute value.
@@ -307,13 +295,13 @@ class Book:
"""
Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
"""
dls = self.body_tag.find_all("dl")
dls = self.body_tag.find_all('dl')
for dl in dls:
pars = dl.find_all("p")
pars = dl.find_all('p')
for p in pars:
p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
new_div = BeautifulSoup(features="lxml").new_tag("div")
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
new_div = BeautifulSoup(features='lxml').new_tag('div')
for p in pars:
new_div.append(p.parent)
dl.replaceWith(new_div)
@@ -328,8 +316,8 @@ class Book:
"""
Function returns the list of footnotes and deletes them from html_soup.
"""
footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc")
footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$"))
footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_ancors)
assert footnote_amt == len(footnote_content)
@@ -338,7 +326,7 @@ class Book:
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
new_tag = BeautifulSoup(features="lxml").new_tag('sup')
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i+1
new_tag['id'] = f'footnote-{i+1}'
@@ -363,7 +351,7 @@ class Book:
Function to process <img> tags. Each image should be sent to Amazon S3, after which a new tag with a valid link is returned.
For now images are moved to one folder.
"""
imgs = self.body_tag.find_all("img")
imgs = self.body_tag.find_all('img')
if len(imgs):
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
@@ -518,6 +506,7 @@ class Book:
result = {title: []}
ch_content = []
ind += 1
while ind < len(self.content):
if self.content[ind].name in self.SUPPORTED_HEADERS:
outline = int(re.sub(r"^h", "", self.content[ind].name))
@@ -535,6 +524,7 @@ class Book:
# result[title].append(res)
ch_content.append(res)
ind += 1
if ch_content:
result[title].append("".join(ch_content))
return result, ind
@@ -547,6 +537,7 @@ class Book:
json_strc = []
ind = 0
ch_num = 0
while ind < len(self.content):
if self.content[ind].name in self.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind)
@@ -584,13 +575,3 @@ class Book:
self.convert_to_json()
self.write_json()
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
if __name__ == '__main__':
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
book = Book(file_path="", recreate=True)
book.parse_args()
book.conversion(logging_format)
print('Script has finished.')