forked from LiveCarta/BookConverter
Update book.py
Add solution for skipping everything before the table of contents
This commit is contained in:
99
src/book.py
99
src/book.py
@@ -4,6 +4,8 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
|
from copy import copy
|
||||||
|
from shutil import copyfile
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
@@ -28,12 +30,12 @@ class Book:
|
|||||||
}
|
}
|
||||||
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
|
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
|
||||||
|
|
||||||
def __init__(self, book_id, access=None):
|
def __init__(self, book_id=0, access=None, file_path=None, output_path=None):
|
||||||
self.book_id = book_id
|
self.book_id = book_id
|
||||||
self.access = access
|
self.access = access
|
||||||
|
self.file_path = file_path
|
||||||
|
self.output_path = output_path
|
||||||
|
|
||||||
self.file_path = None
|
|
||||||
self.output_path = None
|
|
||||||
self.logger = None
|
self.logger = None
|
||||||
self.html_soup = None
|
self.html_soup = None
|
||||||
self.body_tag = None
|
self.body_tag = None
|
||||||
@@ -268,6 +270,19 @@ class Book:
|
|||||||
for table in tables:
|
for table in tables:
|
||||||
table.decompose()
|
table.decompose()
|
||||||
|
|
||||||
|
def _change_table_of_contents(self):
    """
    Function to swap every table-of-contents block for an empty <TOC> marker.

    Each <div> whose id starts with "Table of Contents" followed by a digit
    is wrapped in a newly created TOC tag and then destroyed, which leaves
    only the empty marker where the block used to be.
    """
    toc_id_pattern = re.compile(r'^Table of Contents\d+')
    for toc_div in self.body_tag.find_all("div", id=toc_id_pattern):
        marker = self.html_soup.new_tag("TOC")
        toc_div.wrap(marker)
        toc_div.decompose()
|
||||||
|
|
||||||
|
def delete_content_before_toc(self):
    """
    Function to drop every entry of self.content up to and including the <TOC> marker.

    Only the self.content list is trimmed here. When it holds no entry
    equal to an empty TOC tag, nothing is changed.
    """
    toc_tag = self.html_soup.new_tag('TOC')
    try:
        # One scan of the list instead of an `in` test followed by `index`.
        toc_pos = self.content.index(toc_tag)
    except ValueError:
        # No marker: keep the content as it is.
        return

    self.content = self.content[toc_pos + 1:]
    # NOTE(review): the dump below is assumed to belong to the "marker found"
    # branch; the diff view this was taken from lost its indentation - confirm.
    self.write_html_from_list()
|
||||||
|
|
||||||
def clean_trash(self):
|
def clean_trash(self):
|
||||||
"""
|
"""
|
||||||
Function to remove all styles and tags we don't need.
|
Function to remove all styles and tags we don't need.
|
||||||
@@ -283,7 +298,8 @@ class Book:
|
|||||||
self._clean_underline_links()
|
self._clean_underline_links()
|
||||||
|
|
||||||
self._font_to_span()
|
self._font_to_span()
|
||||||
self._remove_table_of_contents()
|
# self._remove_table_of_contents()
|
||||||
|
self._change_table_of_contents()
|
||||||
|
|
||||||
def _process_paragraph(self):
|
def _process_paragraph(self):
|
||||||
"""
|
"""
|
||||||
@@ -385,21 +401,24 @@ class Book:
|
|||||||
imgs = self.body_tag.find_all('img')
|
imgs = self.body_tag.find_all('img')
|
||||||
|
|
||||||
if len(imgs):
|
if len(imgs):
|
||||||
# new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
|
if self.access is None:
|
||||||
# new_path.mkdir(exist_ok=True)
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/'))
|
||||||
|
new_path.mkdir(exist_ok=True)
|
||||||
|
|
||||||
for img in imgs:
|
for img in imgs:
|
||||||
img_name = img.attrs.get('src')
|
img_name = img.attrs.get('src')
|
||||||
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}')
|
||||||
|
|
||||||
|
if self.access is not None:
|
||||||
link = self.access.send_image(img_path, self.book_id)
|
link = self.access.send_image(img_path, self.book_id)
|
||||||
img.attrs['src'] = link
|
img.attrs['src'] = link
|
||||||
|
else:
|
||||||
# img_size = os.path.getsize(img_path)
|
img_size = os.path.getsize(img_path)
|
||||||
# print(f'{img_name} successfully loaded. Image size: {img_size}.')
|
print(f'{img_name} successfully loaded. Image size: {img_size}.')
|
||||||
# new_img_path = new_path / img_name
|
new_img_path = new_path / img_name
|
||||||
# copyfile(img_path, new_img_path)
|
copyfile(img_path, new_img_path)
|
||||||
# img.attrs["src"] = str(new_img_path)
|
img.attrs["src"] = str(new_img_path)
|
||||||
|
|
||||||
self.images = imgs
|
self.images = imgs
|
||||||
|
|
||||||
@@ -472,7 +491,10 @@ class Book:
|
|||||||
tag.replace_with(new_tag)
|
tag.replace_with(new_tag)
|
||||||
|
|
||||||
def write_html_from_list(self, file_name='url_test.html'):
|
def write_html_from_list(self, file_name='url_test.html'):
|
||||||
with open(file_name, 'w', encoding='utf-8') as f_out:
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
file_path = pathlib.Path(os.path.join(folder_path, file_name))
|
||||||
|
|
||||||
|
with open(file_path, 'w', encoding='utf-8') as f_out:
|
||||||
# f_out.write("".join([tag.prettify() for tag in self.content]))
|
# f_out.write("".join([tag.prettify() for tag in self.content]))
|
||||||
f_out.write(self.body_tag.prettify())
|
f_out.write(self.body_tag.prettify())
|
||||||
self.logger.info(f'Check test file - url_test.html.')
|
self.logger.info(f'Check test file - url_test.html.')
|
||||||
@@ -502,17 +524,14 @@ class Book:
|
|||||||
|
|
||||||
self.content = self.body_tag.find_all(recursive=False)
|
self.content = self.body_tag.find_all(recursive=False)
|
||||||
|
|
||||||
# if self.train_mode:
|
|
||||||
# self.model.train_model(self.content)
|
|
||||||
# else:
|
|
||||||
# self.model.predict_headers(self.content)
|
|
||||||
|
|
||||||
self.write_html_from_list()
|
|
||||||
|
|
||||||
self._process_toc_links()
|
self._process_toc_links()
|
||||||
self._process_headings()
|
self._process_headings()
|
||||||
|
|
||||||
self.content = self.body_tag.find_all(recursive=False)
|
self.content = self.body_tag.find_all(recursive=False)
|
||||||
|
|
||||||
|
# delete text before table of content if exists
|
||||||
|
self.delete_content_before_toc()
|
||||||
|
|
||||||
self.logger.info('End of processing .html file.')
|
self.logger.info('End of processing .html file.')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -563,6 +582,22 @@ class Book:
|
|||||||
return result, ind
|
return result, ind
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
@staticmethod
def _is_empty_p_tag(tag):
    """
    Function to tell whether a tag is a <p> with no visible text.

    A <p> counts as empty when, after its <br> tags are removed, the
    remaining text is nothing but whitespace. Any tag that is not a <p>
    is reported as not empty. The check works on a copy, so the tag
    passed in is left untouched.
    """
    if tag.name != 'p':
        return False

    # Work on a copy: decompose() would otherwise strip <br> from the real tag.
    probe = copy(tag)
    for line_break in probe.find_all('br'):
        line_break.decompose()

    return not re.sub(r'\s+', '', probe.text)
|
||||||
|
|
||||||
def convert_to_json(self):
|
def convert_to_json(self):
|
||||||
"""
|
"""
|
||||||
Function which convert list of html nodes to appropriate json structure.
|
Function which convert list of html nodes to appropriate json structure.
|
||||||
@@ -572,16 +607,21 @@ class Book:
|
|||||||
ch_num = 0
|
ch_num = 0
|
||||||
|
|
||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
|
res = {}
|
||||||
|
|
||||||
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
if self.content[ind].name in self.SUPPORTED_HEADERS:
|
||||||
res, ind = self.header_to_json(ind)
|
res, ind = self.header_to_json(ind)
|
||||||
else:
|
else:
|
||||||
chapter_title = f'Untitled chapter {ch_num}'
|
chapter_title = f'Untitled chapter {ch_num}'
|
||||||
chapter = []
|
chapter = []
|
||||||
while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:
|
while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:
|
||||||
|
if not self._is_empty_p_tag(self.content[ind]):
|
||||||
chapter.append(self.format_html(str(self.content[ind])))
|
chapter.append(self.format_html(str(self.content[ind])))
|
||||||
ind += 1
|
ind += 1
|
||||||
|
if chapter:
|
||||||
res = {chapter_title: ["".join(chapter)]}
|
res = {chapter_title: ["".join(chapter)]}
|
||||||
ch_num += 1
|
ch_num += 1
|
||||||
|
if res:
|
||||||
json_strc.append(res)
|
json_strc.append(res)
|
||||||
|
|
||||||
self.content_dict = {
|
self.content_dict = {
|
||||||
@@ -599,6 +639,13 @@ class Book:
|
|||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
|
def convert_from_html(self, logging_format):
    """
    Function to run the conversion starting from an already existing .html file.

    Configures the file logger (file mode 'w+') and then calls, in order,
    read_html, process_html, convert_to_json and write_json.
    """
    self.configure_file_logger(__name__, logging_format=logging_format, filemode='w+')

    pipeline = (
        self.read_html,
        self.process_html,
        self.convert_to_json,
        self.write_json,
    )
    for step in pipeline:
        step()
|
||||||
|
|
||||||
def conversion(self, logging_format, filemode='w+'):
|
def conversion(self, logging_format, filemode='w+'):
|
||||||
self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
|
self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
|
||||||
self.log('Beginning of conversion from .docx to .json.')
|
self.log('Beginning of conversion from .docx to .json.')
|
||||||
@@ -607,10 +654,20 @@ class Book:
|
|||||||
self.convert_doc_to_html()
|
self.convert_doc_to_html()
|
||||||
self.check_output_directory()
|
self.check_output_directory()
|
||||||
self.read_html()
|
self.read_html()
|
||||||
self.clean_trash()
|
|
||||||
self.process_html()
|
self.process_html()
|
||||||
self.set_generate_status()
|
self.set_generate_status()
|
||||||
self.convert_to_json()
|
self.convert_to_json()
|
||||||
self.write_json()
|
self.write_json()
|
||||||
self.send_json_content()
|
self.send_json_content()
|
||||||
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual run: convert one fixed .html file without any `access` object.
    # The base folder is the parent of the directory that holds this module.
    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    base_dir = pathlib.Path(folder_path)

    file_path = base_dir / 'html/11/11.html'
    out_path = base_dir / 'json/11.json'
    logging_format = '%(asctime)s - %(levelname)s - %(message)s'

    book = Book(file_path=file_path, output_path=out_path)
    book.convert_from_html(logging_format=logging_format)
|
||||||
|
|||||||
Reference in New Issue
Block a user