forked from LiveCarta/BookConverter
Update book.py
Add solution for skipping everything before the table of contents
This commit is contained in:
115
src/book.py
115
src/book.py
@@ -4,6 +4,8 @@ import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
from copy import copy
|
||||
from shutil import copyfile
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -28,12 +30,12 @@ class Book:
|
||||
}
|
||||
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
|
||||
|
||||
def __init__(self, book_id, access=None):
|
||||
def __init__(self, book_id=0, access=None, file_path=None, output_path=None):
|
||||
self.book_id = book_id
|
||||
self.access = access
|
||||
self.file_path = file_path
|
||||
self.output_path = output_path
|
||||
|
||||
self.file_path = None
|
||||
self.output_path = None
|
||||
self.logger = None
|
||||
self.html_soup = None
|
||||
self.body_tag = None
|
||||
@@ -268,6 +270,19 @@ class Book:
|
||||
for table in tables:
|
||||
table.decompose()
|
||||
|
||||
def _change_table_of_contents(self):
    """
    Function to swap every table-of-contents div for an empty TOC marker tag.

    Looks up all ``div`` tags inside ``self.body_tag`` whose ``id`` matches
    ``Table of Contents<digits>``. Each one is wrapped in a new ``TOC`` tag
    and then destroyed, so only the empty ``TOC`` tag stays at its position.
    """
    toc_id_pattern = re.compile(r'^Table of Contents\d+')

    for toc_div in self.body_tag.find_all("div", id=toc_id_pattern):
        marker = self.html_soup.new_tag("TOC")
        # Wrapping puts the marker where the div was; decomposing the div
        # afterwards leaves the marker in the tree with no children.
        toc_div.wrap(marker)
        toc_div.decompose()
|
||||
|
||||
def delete_content_before_toc(self):
    """
    Function to drop every top-level node up to and including the TOC marker.

    Scans ``self.content`` for the first node whose tag name is ``TOC``
    (the tag created by ``_change_table_of_contents``). When one is found,
    ``self.content`` is replaced by the nodes that follow it and
    ``self.write_html_from_list()`` is called. When there is no marker,
    ``self.content`` is left untouched and nothing is written.
    """
    # Match on the tag name in a single pass. Comparing against a freshly
    # built empty tag only matches a marker that has no attributes and no
    # children, and needs two scans of the list (`in` followed by `.index`).
    # The comparison is case-insensitive in case the marker name has been
    # lower-cased by an HTML parser round-trip (NOTE(review): confirm whether
    # that can happen in this pipeline).
    toc_ind = next(
        (
            ind
            for ind, tag in enumerate(self.content)
            if (getattr(tag, 'name', None) or '').lower() == 'toc'
        ),
        None,
    )
    if toc_ind is None:
        return

    self.content = self.content[toc_ind + 1:]
    self.write_html_from_list()
|
||||
|
||||
def clean_trash(self):
|
||||
"""
|
||||
Function to remove all styles and tags we don't need.
|
||||
@@ -283,7 +298,8 @@ class Book:
|
||||
self._clean_underline_links()
|
||||
|
||||
self._font_to_span()
|
||||
self._remove_table_of_contents()
|
||||
# self._remove_table_of_contents()
|
||||
self._change_table_of_contents()
|
||||
|
||||
def _process_paragraph(self):
|
||||
"""
|
||||
@@ -359,8 +375,8 @@ class Book:
|
||||
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||
new_tag['class'] = 'footnote-element'
|
||||
new_tag['data-id'] = i+1
|
||||
new_tag['id'] = f'footnote-{i+1}'
|
||||
new_tag['data-id'] = i + 1
|
||||
new_tag['id'] = f'footnote-{i + 1}'
|
||||
new_tag.string = '*'
|
||||
anc_tag.replace_with(new_tag)
|
||||
|
||||
@@ -385,21 +401,24 @@ class Book:
|
||||
imgs = self.body_tag.find_all('img')
|
||||
|
||||
if len(imgs):
|
||||
# new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
|
||||
# new_path.mkdir(exist_ok=True)
|
||||
if self.access is None:
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
|
||||
new_path.mkdir(exist_ok=True)
|
||||
|
||||
for img in imgs:
|
||||
img_name = img.attrs.get('src')
|
||||
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
||||
|
||||
link = self.access.send_image(img_path, self.book_id)
|
||||
img.attrs['src'] = link
|
||||
|
||||
# img_size = os.path.getsize(img_path)
|
||||
# print(f'{img_name} successfully loaded. Image size: {img_size}.')
|
||||
# new_img_path = new_path / img_name
|
||||
# copyfile(img_path, new_img_path)
|
||||
# img.attrs["src"] = str(new_img_path)
|
||||
if self.access is not None:
|
||||
link = self.access.send_image(img_path, self.book_id)
|
||||
img.attrs['src'] = link
|
||||
else:
|
||||
img_size = os.path.getsize(img_path)
|
||||
print(f'{img_name} successfully loaded. Image size: {img_size}.')
|
||||
new_img_path = new_path / img_name
|
||||
copyfile(img_path, new_img_path)
|
||||
img.attrs["src"] = str(new_img_path)
|
||||
|
||||
self.images = imgs
|
||||
|
||||
@@ -472,7 +491,10 @@ class Book:
|
||||
tag.replace_with(new_tag)
|
||||
|
||||
def write_html_from_list(self, file_name='url_test.html'):
|
||||
with open(file_name, 'w', encoding='utf-8') as f_out:
|
||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
file_path = pathlib.Path(os.path.join(folder_path, file_name))
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f_out:
|
||||
# f_out.write("".join([tag.prettify() for tag in self.content]))
|
||||
f_out.write(self.body_tag.prettify())
|
||||
self.logger.info(f'Check test file - url_test.html.')
|
||||
@@ -502,17 +524,14 @@ class Book:
|
||||
|
||||
self.content = self.body_tag.find_all(recursive=False)
|
||||
|
||||
# if self.train_mode:
|
||||
# self.model.train_model(self.content)
|
||||
# else:
|
||||
# self.model.predict_headers(self.content)
|
||||
|
||||
self.write_html_from_list()
|
||||
|
||||
self._process_toc_links()
|
||||
self._process_headings()
|
||||
|
||||
self.content = self.body_tag.find_all(recursive=False)
|
||||
|
||||
# delete text before table of content if exists
|
||||
self.delete_content_before_toc()
|
||||
|
||||
self.logger.info('End of processing .html file.')
|
||||
|
||||
@staticmethod
|
||||
@@ -563,6 +582,22 @@ class Book:
|
||||
return result, ind
|
||||
return ''
|
||||
|
||||
@staticmethod
def _is_empty_p_tag(tag):
    """
    Function to tell whether a tag is a paragraph without any visible text.

    Returns False for every tag whose name is not ``p``. For a ``p`` tag the
    check runs on a copy with all ``br`` descendants removed, so the tag
    passed in is not modified. The paragraph counts as empty when its text
    contains nothing but whitespace.

    NOTE(review): only text is inspected, so a paragraph whose sole content
    is a non-text element (for example an image) is also reported as empty.
    Confirm that this is intended.
    """
    if tag.name != 'p':
        return False

    # Strip line breaks from a throwaway copy, never from the real tree.
    clone = copy(tag)
    for line_break in clone.find_all('br'):
        line_break.decompose()

    visible_text = re.sub(r'\s+', '', clone.text)
    return not visible_text
|
||||
|
||||
def convert_to_json(self):
|
||||
"""
|
||||
Function which convert list of html nodes to appropriate json structure.
|
||||
@@ -572,17 +607,22 @@ class Book:
|
||||
ch_num = 0
|
||||
|
||||
while ind < len(self.content):
|
||||
res = {}
|
||||
|
||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||
res, ind = self.header_to_json(ind)
|
||||
else:
|
||||
chapter_title = f'Untitled chapter {ch_num}'
|
||||
chapter = []
|
||||
while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:
|
||||
chapter.append(self.format_html(str(self.content[ind])))
|
||||
if not self._is_empty_p_tag(self.content[ind]):
|
||||
chapter.append(self.format_html(str(self.content[ind])))
|
||||
ind += 1
|
||||
res = {chapter_title: ["".join(chapter)]}
|
||||
ch_num += 1
|
||||
json_strc.append(res)
|
||||
if chapter:
|
||||
res = {chapter_title: ["".join(chapter)]}
|
||||
ch_num += 1
|
||||
if res:
|
||||
json_strc.append(res)
|
||||
|
||||
self.content_dict = {
|
||||
"content": json_strc,
|
||||
@@ -599,6 +639,13 @@ class Book:
|
||||
except Exception as exc:
|
||||
raise exc
|
||||
|
||||
def convert_from_html(self, logging_format):
    """
    Function to run the conversion starting from an already existing .html file.

    Configures the file logger in ``'w+'`` mode with the given
    ``logging_format``, then calls ``read_html``, ``process_html``,
    ``convert_to_json`` and ``write_json`` in that order.
    """
    self.configure_file_logger(
        __name__,
        logging_format=logging_format,
        filemode='w+',
    )

    # Each stage is looked up and invoked one after another, in this order.
    stages = ('read_html', 'process_html', 'convert_to_json', 'write_json')
    for stage in stages:
        getattr(self, stage)()
|
||||
|
||||
def conversion(self, logging_format, filemode='w+'):
|
||||
self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
|
||||
self.log('Beginning of conversion from .docx to .json.')
|
||||
@@ -607,10 +654,20 @@ class Book:
|
||||
self.convert_doc_to_html()
|
||||
self.check_output_directory()
|
||||
self.read_html()
|
||||
self.clean_trash()
|
||||
self.process_html()
|
||||
self.set_generate_status()
|
||||
self.convert_to_json()
|
||||
self.write_json()
|
||||
self.send_json_content()
|
||||
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual run: convert the sample book 11 from its .html file to .json.
    # All paths are resolved relative to the directory one level above the
    # directory that holds this file.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    folder_path = os.path.dirname(module_dir)

    file_path = pathlib.Path(folder_path, 'html/11/11.html')
    out_path = pathlib.Path(folder_path, 'json/11.json')

    logging_format = '%(asctime)s - %(levelname)s - %(message)s'

    book = Book(file_path=file_path, output_path=out_path)
    book.convert_from_html(logging_format=logging_format)
|
||||
|
||||
Reference in New Issue
Block a user