From d4fb6223e6885331502e452c2958535abfc506d9 Mon Sep 17 00:00:00 2001 From: Jeniamakarchik Date: Fri, 7 Feb 2020 12:03:50 +0300 Subject: [PATCH] Update book.py: add solution for skipping everything before table of contents --- src/book.py | 115 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 29 deletions(-) diff --git a/src/book.py b/src/book.py index 4c278f3..ceba5e6 100644 --- a/src/book.py +++ b/src/book.py @@ -4,6 +4,8 @@ import logging import os import pathlib import re +from copy import copy +from shutil import copyfile from bs4 import BeautifulSoup @@ -28,12 +30,12 @@ class Book: } SUPPORTED_HEADERS = ["h1", "h2", "h3"] - def __init__(self, book_id, access=None): + def __init__(self, book_id=0, access=None, file_path=None, output_path=None): self.book_id = book_id self.access = access + self.file_path = file_path + self.output_path = output_path - self.file_path = None - self.output_path = None self.logger = None self.html_soup = None self.body_tag = None @@ -268,6 +270,19 @@ class Book: for table in tables: table.decompose() + def _change_table_of_contents(self): + tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) + for table in tables: + table.wrap(self.html_soup.new_tag("TOC")) + table.decompose() + + def delete_content_before_toc(self): + toc_tag = self.html_soup.new_tag('TOC') + if toc_tag in self.content: + ind = self.content.index(toc_tag) + 1 + self.content = self.content[ind:] + self.write_html_from_list() + def clean_trash(self): """ Function to remove all styles and tags we don't need. 
@@ -283,7 +298,8 @@ class Book: self._clean_underline_links() self._font_to_span() - self._remove_table_of_contents() + # self._remove_table_of_contents() + self._change_table_of_contents() def _process_paragraph(self): """ @@ -359,8 +375,8 @@ class Book: new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i+1 - new_tag['id'] = f'footnote-{i+1}' + new_tag['data-id'] = i + 1 + new_tag['id'] = f'footnote-{i + 1}' new_tag.string = '*' anc_tag.replace_with(new_tag) @@ -385,21 +401,24 @@ class Book: imgs = self.body_tag.find_all('img') if len(imgs): - # new_path = pathlib.Path(f'json/img_{self.file_path.stem}/') - # new_path.mkdir(exist_ok=True) + if self.access is None: + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{self.file_path.stem}/')) + new_path.mkdir(exist_ok=True) for img in imgs: img_name = img.attrs.get('src') img_path = pathlib.Path(f'{self.file_path.parent}/{img_name}') - link = self.access.send_image(img_path, self.book_id) - img.attrs['src'] = link - - # img_size = os.path.getsize(img_path) - # print(f'{img_name} successfully loaded. Image size: {img_size}.') - # new_img_path = new_path / img_name - # copyfile(img_path, new_img_path) - # img.attrs["src"] = str(new_img_path) + if self.access is not None: + link = self.access.send_image(img_path, self.book_id) + img.attrs['src'] = link + else: + img_size = os.path.getsize(img_path) + print(f'{img_name} successfully loaded. 
Image size: {img_size}.') + new_img_path = new_path / img_name + copyfile(img_path, new_img_path) + img.attrs["src"] = str(new_img_path) self.images = imgs @@ -472,7 +491,10 @@ class Book: tag.replace_with(new_tag) def write_html_from_list(self, file_name='url_test.html'): - with open(file_name, 'w', encoding='utf-8') as f_out: + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + file_path = pathlib.Path(os.path.join(folder_path, file_name)) + + with open(file_path, 'w', encoding='utf-8') as f_out: # f_out.write("".join([tag.prettify() for tag in self.content])) f_out.write(self.body_tag.prettify()) self.logger.info(f'Check test file - url_test.html.') @@ -502,17 +524,14 @@ class Book: self.content = self.body_tag.find_all(recursive=False) - # if self.train_mode: - # self.model.train_model(self.content) - # else: - # self.model.predict_headers(self.content) - - self.write_html_from_list() - self._process_toc_links() self._process_headings() self.content = self.body_tag.find_all(recursive=False) + + # delete text before table of content if exists + self.delete_content_before_toc() + self.logger.info('End of processing .html file.') @staticmethod @@ -563,6 +582,22 @@ class Book: return result, ind return '' + @staticmethod + def _is_empty_p_tag(tag): + if tag.name != 'p': + return False + + temp_tag = copy(tag) + brs = temp_tag.find_all('br') + for br in brs: + br.decompose() + + text = re.sub(r'\s+', '', temp_tag.text) + if text: + return False + + return True + def convert_to_json(self): """ Function which convert list of html nodes to appropriate json structure. 
@@ -572,17 +607,22 @@ class Book: ch_num = 0 while ind < len(self.content): + res = {} + if self.content[ind].name in self.SUPPORTED_HEADERS: res, ind = self.header_to_json(ind) else: chapter_title = f'Untitled chapter {ch_num}' chapter = [] while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS: - chapter.append(self.format_html(str(self.content[ind]))) + if not self._is_empty_p_tag(self.content[ind]): + chapter.append(self.format_html(str(self.content[ind]))) ind += 1 - res = {chapter_title: ["".join(chapter)]} - ch_num += 1 - json_strc.append(res) + if chapter: + res = {chapter_title: ["".join(chapter)]} + ch_num += 1 + if res: + json_strc.append(res) self.content_dict = { "content": json_strc, @@ -599,6 +639,13 @@ class Book: except Exception as exc: raise exc + def convert_from_html(self, logging_format): + self.configure_file_logger(__name__, logging_format=logging_format, filemode='w+') + self.read_html() + self.process_html() + self.convert_to_json() + self.write_json() + def conversion(self, logging_format, filemode='w+'): self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode) self.log('Beginning of conversion from .docx to .json.') @@ -607,10 +654,20 @@ class Book: self.convert_doc_to_html() self.check_output_directory() self.read_html() - self.clean_trash() self.process_html() self.set_generate_status() self.convert_to_json() self.write_json() self.send_json_content() self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.') + + +if __name__ == "__main__": + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + file_path = pathlib.Path(os.path.join(folder_path, 'html/11/11.html')) + out_path = pathlib.Path(os.path.join(folder_path, 'json/11.json')) + + logging_format = '%(asctime)s - %(levelname)s - %(message)s' + + book = Book(file_path=file_path, output_path=out_path) + book.convert_from_html(logging_format=logging_format)