Remove processing of tables

2022-09-08 13:11:45 +03:00
parent 539a8df176
commit 001e55a27b
3 changed files with 26 additions and 66 deletions
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString
 from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
-from src.style_preprocessor import StylePreprocessor
+from src.style_reader import StyleReader
 from src.epub_converter.html_epub_processor import HtmlEpubProcessor
 from src.epub_converter.image_processing import update_images_src_links
 from src.epub_converter.footnotes_processing import preprocess_footnotes
-from src.tag_inline_style_processor import modify_html_soup_with_css_styles
+from src.inline_style_processor import modify_html_soup_with_css_styles
 class EpubConverter:
    def __init__(self, book_path, access=None, logger: BookLogger = None,
-                 style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
+                 style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None):
        self.book_path = book_path
        self.access = access
        self.logger: BookLogger = logger
@@ -57,13 +57,6 @@ class EpubConverter:
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote
        self.logger.log("Image processing.")
        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
            content = x.content
            self.img_href2img_bytes[file_name] = content
        self.logger.log("HTML files reading.")
        self.html_href2html_body_soup: Dict[str,
                                            BeautifulSoup] = self.build_href2soup_content()
@@ -76,6 +69,13 @@ class EpubConverter:
        self.logger.log("CSS styles fusion(inline+file).")
        self.add_css_styles_to_html_soup()
        self.logger.log("Image processing.")
        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
            file_name = x.file_name
            content = x.content
            self.img_href2img_bytes[file_name] = content
        self.logger.log("Footnotes processing.")
        for href in self.html_href2html_body_soup:
            self.footnotes_contents, self.noterefs, self.footnotes =\
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -3,8 +3,8 @@ import codecs
 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
-from src.html_preprocessor import HtmlPreprocessor
+from src.html_presets_processor import HtmlPresetsProcessor
-from src.style_preprocessor import StylePreprocessor
+from src.style_reader import StyleReader
 from src.epub_converter.html_epub_processor import HtmlEpubProcessor
 from src.epub_converter.epub_converter import EpubConverter
@@ -30,16 +30,15 @@ class EpubBook(BookSolver):
            json for LiveCarta platform
        """
-        html_preprocessor = HtmlPreprocessor(
+        html_preprocessor = HtmlPresetsProcessor(
            logger=self.logger_object, preset_path="presets/epub_presets.json")
-        style_preprocessor = StylePreprocessor()
+        style_preprocessor = StyleReader()
        html_processor = HtmlEpubProcessor(logger=self.logger_object,
                                           html_preprocessor=html_preprocessor)
        json_converter = EpubConverter(
            self.book_path, access=self.access, logger=self.logger_object,
            style_processor=style_preprocessor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
        return content_dict
@@ -48,9 +47,9 @@ if __name__ == "__main__":
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])
-    html_preprocessor = HtmlPreprocessor(
+    html_preprocessor = HtmlPresetsProcessor(
        logger=logger_object, preset_path="../../presets/epub_presets.json")
-    style_preprocessor = StylePreprocessor()
+    style_preprocessor = StyleReader()
    html_processor = HtmlEpubProcessor(logger=logger_object,
                                       html_preprocessor=html_preprocessor)
--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -4,7 +4,7 @@ from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment
 from src.util.helpers import BookLogger
-from src.html_preprocessor import _preprocess_html
+from src.html_presets_processor import _process_presets
 class HtmlEpubProcessor:
@@ -113,43 +113,6 @@ class HtmlEpubProcessor:
                    tag.extract()
                    return
    @staticmethod
    def _process_tables(chapter_tag: BeautifulSoup):
        """
        Normalise tables and their row/cell tags (tr|td|th) in place.

        For every tr/td/th tag found inside a <table>:
          * a ``width`` attribute is written: the existing non-empty attribute
            value if there is one, otherwise the numeric part of a pt/px
            ``width`` declaration from the inline style with "px" appended,
            otherwise an empty string;
          * the literal text "border:0;" is removed from the inline style and
            the style attribute is dropped when only whitespace is left.
        For every <table>, a missing, empty, "0" or "0px" ``border`` attribute
        is replaced with "1".

        Parameters
        ----------
        chapter_tag: BeautifulSoup
            Tag & contents of the chapter tag

        Returns
        -------
        None
            chapter_tag is modified in place

        """
        # A lookbehind (instead of consuming one non-"-" character) lets the
        # declaration match at the very start of the style string while still
        # skipping min-width / max-width / border-width.
        # NOTE(review): pt values are relabelled as px without unit
        # conversion; that behaviour is kept as it was.
        width_pattern = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")
        blank_pattern = re.compile(r"[\s\xa0]")

        for table in chapter_tag.find_all("table"):
            # Exact tag names: a regex filter is applied with search(), so
            # "td|th|tr" would also hit tags such as <thead> or <strong>.
            for t_tag in table.find_all(["td", "th", "tr"]):
                style = t_tag.get("style")

                width = ""
                if style:
                    width_match = width_pattern.search(style)
                    if width_match:
                        width = width_match.group(1) + "px"
                # The attribute is always written, even when it ends up empty.
                t_tag.attrs["width"] = t_tag.get("width") or width

                if style:
                    style = style.replace("border:0;", "")
                    if blank_pattern.sub("", style) == "":
                        # Nothing meaningful left: drop the attribute entirely.
                        del t_tag.attrs["style"]
                    else:
                        t_tag.attrs["style"] = style

            border = table.attrs.get("border")
            if not border or border in ["0", "0px"]:
                table.attrs["border"] = "1"
    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):
        """
@@ -185,13 +148,13 @@ class HtmlEpubProcessor:
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
-        3-6. wrap tags with <table>
+        3. heading removal
        4. wrap tags with <table>
            replace tags with correspond LiveCarta tags
            replace/remove attrs, values of attrs
            unwrap tags
            insert tags into correspond tags
-        7. heading removal
+        5. class removal
        8. process_tables
        9. class removal
        Returns
        -------
@@ -203,14 +166,12 @@ class HtmlEpubProcessor:
        self._remove_comments(chapter_tag)
        # 2.
        self._wrap_strings_with_p(chapter_tag)
-        # 3-6.
+        # 3.
        _preprocess_html(
            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
        # 7.
        if remove_title_from_chapter:
            self._remove_headings_content(chapter_tag, title_str)
-        # 8.
+        # 4.
-        self._process_tables(chapter_tag)
+        _process_presets(
-        # 9. remove classes that weren't created by converter
+            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
        # 5. remove classes that weren't created by converter
        self._class_removing(chapter_tag)
        return chapter_tag