diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 3ec04e2..dbf3509 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint -from src.style_preprocessor import StylePreprocessor +from src.style_reader import StyleReader from src.epub_converter.html_epub_processor import HtmlEpubProcessor from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.footnotes_processing import preprocess_footnotes -from src.tag_inline_style_processor import modify_html_soup_with_css_styles +from src.inline_style_processor import modify_html_soup_with_css_styles class EpubConverter: def __init__(self, book_path, access=None, logger: BookLogger = None, - style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None): + style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None): self.book_path = book_path self.access = access self.logger: BookLogger = logger @@ -57,13 +57,6 @@ class EpubConverter: self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote - self.logger.log("Image processing.") - for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), - self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): - file_name = x.file_name - content = x.content - self.img_href2img_bytes[file_name] = content - self.logger.log("HTML files reading.") self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() @@ -76,6 +69,13 @@ class EpubConverter: self.logger.log("CSS styles fusion(inline+file).") self.add_css_styles_to_html_soup() + self.logger.log("Image processing.") + for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), + self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): + file_name = x.file_name + content = x.content + self.img_href2img_bytes[file_name] = content + self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: self.footnotes_contents, self.noterefs, self.footnotes =\ diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 33019f2..90c3b95 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -3,8 +3,8 @@ import codecs from src.book_solver import BookSolver from src.util.helpers import BookLogger -from src.html_preprocessor import HtmlPreprocessor -from src.style_preprocessor import StylePreprocessor +from src.html_presets_processor import HtmlPresetsProcessor +from src.style_reader import StyleReader from src.epub_converter.html_epub_processor import HtmlEpubProcessor from src.epub_converter.epub_converter import EpubConverter @@ -30,16 +30,15 @@ class EpubBook(BookSolver): json for LiveCarta platform """ - html_preprocessor = HtmlPreprocessor( + html_preprocessor = HtmlPresetsProcessor( logger=self.logger_object, preset_path="presets/epub_presets.json") - style_preprocessor = StylePreprocessor() + style_preprocessor = StyleReader() html_processor = HtmlEpubProcessor(logger=self.logger_object, html_preprocessor=html_preprocessor) json_converter = EpubConverter( self.book_path, access=self.access, logger=self.logger_object, style_processor=style_preprocessor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() - return content_dict @@ -48,9 +47,9 @@ if __name__ == "__main__": logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) - html_preprocessor = HtmlPreprocessor( + html_preprocessor = HtmlPresetsProcessor( logger=logger_object, preset_path="../../presets/epub_presets.json") - style_preprocessor = StylePreprocessor() + style_preprocessor = StyleReader() html_processor = HtmlEpubProcessor(logger=logger_object, html_preprocessor=html_preprocessor) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index a8ac544..e92ac8b 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -4,7 +4,7 @@ from bs4.element import PageElement from bs4 import BeautifulSoup, Tag, NavigableString, Comment from src.util.helpers import BookLogger -from src.html_preprocessor import _preprocess_html +from src.html_presets_processor import _process_presets class HtmlEpubProcessor: @@ -113,43 +113,6 @@ class HtmlEpubProcessor: tag.extract() return - @staticmethod - def _process_tables(chapter_tag: BeautifulSoup): - """ - Function preprocesses tables and tags(td|th|tr) - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - NoReturn - Chapter Tag with processed tables - - """ - tables = chapter_tag.find_all("table") - for table in tables: - for t_tag in table.find_all(re.compile("td|th|tr")): - width = "" - if t_tag.get("style"): - width_match = re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) - if width_match: - size = width_match.group(1) - width = size + "px" - - t_tag.attrs["width"] = t_tag.get("width") or width - - if t_tag.attrs.get("style"): - t_tag.attrs["style"] = t_tag.attrs["style"].replace( - "border:0;", "") - if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": - del t_tag.attrs["style"] - - if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: - table.attrs["border"] = "1" - @staticmethod def _class_removing(chapter_tag: BeautifulSoup): """ @@ -185,13 +148,13 @@ class HtmlEpubProcessor: ---------- 1. comments removal 2. wrap NavigableString with tag
- 3-6. wrap tags with