minor fixes in book.py

This commit is contained in:
Jeniamakarchik
2020-02-05 16:40:10 +03:00
parent d951682054
commit d13dab2d9e

View File

@@ -1,16 +1,12 @@
import argparse
import codecs import codecs
import json import json
import logging import logging
import os import os
import pathlib import pathlib
import re import re
from shutil import copyfile
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# from src.header_detection import HeaderDetector
class Book: class Book:
# Main constant values # Main constant values
@@ -110,8 +106,9 @@ class Book:
Method for converting a .docx document to an .html file. Method for converting a .docx document to an .html file.
""" """
self.log(f'File - {self.file_path}.') self.log(f'File - {self.file_path}.')
print(self.file_path) print(f'{self.file_path}')
self.log('Beginning of conversion from .docx to .html.') self.log('Beginning of conversion from .docx to .html.')
try: try:
f = open(self.file_path) f = open(self.file_path)
f.close() f.close()
@@ -131,7 +128,7 @@ class Book:
self.logger.error('Conversion has gone wrong.') self.logger.error('Conversion has gone wrong.')
raise e raise e
self.log("End of conversion from .docx to .html.") self.log('End of conversion from .docx to .html.')
self.log(f'Input file path after conversion: {self.file_path}.') self.log(f'Input file path after conversion: {self.file_path}.')
def check_output_directory(self): def check_output_directory(self):
@@ -140,15 +137,10 @@ class Book:
self.output_path = f'json/{filename}' self.output_path = f'json/{filename}'
self.output_path = pathlib.Path(self.output_path) self.output_path = pathlib.Path(self.output_path)
self.logger.info(f'Output file path: {self.output_path}') self.log(f'Output file path: {self.output_path}')
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True) pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
try: self.output_path.touch(exist_ok=True)
self.output_path.touch(exist_ok=self.recreate)
except FileExistsError as e:
self.logger.error('Output file already exists! '
'Either change the name of output file or use --recreate switch.')
raise e
def read_html(self): def read_html(self):
""" """
@@ -163,10 +155,6 @@ class Book:
self.html_soup = BeautifulSoup(html_text, features='lxml') self.html_soup = BeautifulSoup(html_text, features='lxml')
self.body_tag = self.html_soup.body self.body_tag = self.html_soup.body
# head_tag = self.html_soup.head
# styles = parse_styles(head_tag.style)
# head_tag.decompose()
def _clean_tag(self, tag, attr_name, attr_value): def _clean_tag(self, tag, attr_name, attr_value):
""" """
Function to clean tags by its name and attribute value. Function to clean tags by its name and attribute value.
@@ -307,13 +295,13 @@ class Book:
""" """
Function to process <dl> tags. All tags will be replaced with <blockquote> tags. Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
""" """
dls = self.body_tag.find_all("dl") dls = self.body_tag.find_all('dl')
for dl in dls: for dl in dls:
pars = dl.find_all("p") pars = dl.find_all('p')
for p in pars: for p in pars:
p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote")) p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
new_div = BeautifulSoup(features="lxml").new_tag("div") new_div = BeautifulSoup(features='lxml').new_tag('div')
for p in pars: for p in pars:
new_div.append(p.parent) new_div.append(p.parent)
dl.replaceWith(new_div) dl.replaceWith(new_div)
@@ -328,8 +316,8 @@ class Book:
""" """
Function returns the list of footnotes and deletes them from html_soup. Function returns the list of footnotes and deletes them from html_soup.
""" """
footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc") footnote_ancors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$")) footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_ancors) footnote_amt = len(footnote_ancors)
assert footnote_amt == len(footnote_content) assert footnote_amt == len(footnote_content)
@@ -338,7 +326,7 @@ class Book:
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)): for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
assert anc_tag['name'] == cont_tag.find('a')['href'][1:] assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
new_tag = BeautifulSoup(features="lxml").new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i+1 new_tag['data-id'] = i+1
new_tag['id'] = f'footnote-{i+1}' new_tag['id'] = f'footnote-{i+1}'
@@ -363,7 +351,7 @@ class Book:
Function to process <img> tag. Img should be sent to Amazon S3 and then return new tag with valid link. Function to process <img> tag. Img should be sent to Amazon S3 and then return new tag with valid link.
For now images are moved to one folder. For now images are moved to one folder.
""" """
imgs = self.body_tag.find_all("img") imgs = self.body_tag.find_all('img')
if len(imgs): if len(imgs):
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/') new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
@@ -518,6 +506,7 @@ class Book:
result = {title: []} result = {title: []}
ch_content = [] ch_content = []
ind += 1 ind += 1
while ind < len(self.content): while ind < len(self.content):
if self.content[ind].name in self.SUPPORTED_HEADERS: if self.content[ind].name in self.SUPPORTED_HEADERS:
outline = int(re.sub(r"^h", "", self.content[ind].name)) outline = int(re.sub(r"^h", "", self.content[ind].name))
@@ -535,6 +524,7 @@ class Book:
# result[title].append(res) # result[title].append(res)
ch_content.append(res) ch_content.append(res)
ind += 1 ind += 1
if ch_content: if ch_content:
result[title].append("".join(ch_content)) result[title].append("".join(ch_content))
return result, ind return result, ind
@@ -547,6 +537,7 @@ class Book:
json_strc = [] json_strc = []
ind = 0 ind = 0
ch_num = 0 ch_num = 0
while ind < len(self.content): while ind < len(self.content):
if self.content[ind].name in self.SUPPORTED_HEADERS: if self.content[ind].name in self.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind) res, ind = self.header_to_json(ind)
@@ -584,13 +575,3 @@ class Book:
self.convert_to_json() self.convert_to_json()
self.write_json() self.write_json()
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
if __name__ == '__main__':
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
book = Book(file_path="", recreate=True)
book.parse_args()
book.conversion(logging_format)
print('Script has finished.')