forked from LiveCarta/BookConverter
epub converter: files and classes renaming
This commit is contained in:
@@ -6,8 +6,8 @@ from subprocess import PIPE
|
|||||||
from threading import Event
|
from threading import Event
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from html_preprocessor import HTMLPreprocessor
|
from html_docx_preprocessor import HTMLDocxPreprocessor
|
||||||
from json_postprocessor import JSONConverter
|
from json_postprocessor import DocxHTML2JSONConverter
|
||||||
from src.solver import BookSolver
|
from src.solver import BookSolver
|
||||||
|
|
||||||
|
|
||||||
@@ -117,9 +117,9 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
def convert_from_html(self):
|
def convert_from_html(self):
|
||||||
html_soup = self.read_html()
|
html_soup = self.read_html()
|
||||||
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
|
||||||
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
content, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||||
json_converter = JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
json_converter = DocxHTML2JSONConverter(content, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
self.write_to_json(content_dict)
|
self.write_to_json(content_dict)
|
||||||
self.write_html_from_list(parser.body_tag)
|
self.write_html_from_list(parser.body_tag)
|
||||||
@@ -137,13 +137,13 @@ class DocxBook(BookSolver):
|
|||||||
html_soup = self.read_html()
|
html_soup = self.read_html()
|
||||||
self.logger_object.log('Beginning of processing .html file.')
|
self.logger_object.log('Beginning of processing .html file.')
|
||||||
|
|
||||||
parser = HTMLPreprocessor(html_soup, self.logger_object)
|
parser = HTMLDocxPreprocessor(html_soup, self.logger_object)
|
||||||
bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
bs_tags, footnotes, top_level_headers = parser.process_html(self.access, self.html_path, self.book_id)
|
||||||
|
|
||||||
self.logger_object.log('Beginning of processing json output.')
|
self.logger_object.log('Beginning of processing json output.')
|
||||||
self.status_wrapper.set_generating()
|
self.status_wrapper.set_generating()
|
||||||
|
|
||||||
json_converter = JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
json_converter = DocxHTML2JSONConverter(bs_tags, footnotes, top_level_headers, self.logger_object, self.status_wrapper)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from css_reader import clean_css, add_inline_style_to_html_soup
|
|||||||
from livecarta_config import LawCartaConfig, BookLogger
|
from livecarta_config import LawCartaConfig, BookLogger
|
||||||
|
|
||||||
|
|
||||||
class EpubPostprocessor:
|
class EpubConverter:
|
||||||
def __init__(self, file, access=None, logger=None):
|
def __init__(self, file, access=None, logger=None):
|
||||||
self.file = file
|
self.file = file
|
||||||
self.access = access
|
self.access = access
|
||||||
@@ -411,8 +411,8 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||||
|
|
||||||
json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119682387_pre_code2.epub',
|
json_converter = EpubConverter('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119682387_pre_code2.epub',
|
||||||
logger=logger_object)
|
logger=logger_object)
|
||||||
tmp = json_converter.convert_to_dict()
|
tmp = json_converter.convert_to_dict()
|
||||||
|
|
||||||
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
with codecs.open('tmp.json', 'w', encoding='utf-8') as f:
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
from epub_postprocessor import EpubPostprocessor
|
from epub_converter import EpubConverter
|
||||||
from src.solver import BookSolver
|
from src.solver import BookSolver
|
||||||
|
|
||||||
|
|
||||||
@@ -10,7 +10,7 @@ class EpubBook(BookSolver):
|
|||||||
self.book_type = 'epub'
|
self.book_type = 'epub'
|
||||||
|
|
||||||
def get_converted_book(self):
|
def get_converted_book(self):
|
||||||
json_converter = EpubPostprocessor(self.file_path, access=self.access, logger=self.logger_object)
|
json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
self.status_wrapper.set_generating()
|
self.status_wrapper.set_generating()
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag
|
|||||||
from livecarta_config import LawCartaConfig, BookLogger, BookStatusWrapper
|
from livecarta_config import LawCartaConfig, BookLogger, BookStatusWrapper
|
||||||
|
|
||||||
|
|
||||||
class HTMLPreprocessor:
|
class HTMLDocxPreprocessor:
|
||||||
|
|
||||||
def __init__(self, html_soup, logger_object, status_wrapper=None):
|
def __init__(self, html_soup, logger_object, status_wrapper=None):
|
||||||
self.body_tag = html_soup.body
|
self.body_tag = html_soup.body
|
||||||
@@ -5,7 +5,7 @@ from copy import copy
|
|||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LawCartaConfig
|
||||||
|
|
||||||
|
|
||||||
class JSONConverter:
|
class DocxHTML2JSONConverter:
|
||||||
def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
|
def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
|
||||||
self.content_dict = None
|
self.content_dict = None
|
||||||
self.content = content
|
self.content = content
|
||||||
|
|||||||
Reference in New Issue
Block a user