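Remove processing of tables from HtmlEpubProcessor; update imports to StyleReader / HtmlPresetsProcessor and reorder image and heading steps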

2022-09-08 13:11:45 +03:00
parent 539a8df176
commit 001e55a27b
3 changed files with 26 additions and 66 deletions
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString
 from src.util.helpers import BookLogger
 from src.livecarta_config import LiveCartaConfig
 from src.data_objects import ChapterItem, NavPoint
-from src.style_preprocessor import StylePreprocessor
+from src.style_reader import StyleReader
 from src.epub_converter.html_epub_processor import HtmlEpubProcessor
 from src.epub_converter.image_processing import update_images_src_links
 from src.epub_converter.footnotes_processing import preprocess_footnotes
-from src.tag_inline_style_processor import modify_html_soup_with_css_styles
+from src.inline_style_processor import modify_html_soup_with_css_styles


 class EpubConverter:
    def __init__(self, book_path, access=None, logger: BookLogger = None,
-                 style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
+                 style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None):
        self.book_path = book_path
        self.access = access
        self.logger: BookLogger = logger
@@ -57,13 +57,6 @@ class EpubConverter:
        self.noterefs: List[Tag] = []  # start of the footnote
        self.footnotes: List[Tag] = []  # end of the footnote

-        self.logger.log("Image processing.")
-        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
-                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
-            file_name = x.file_name
-            content = x.content
-            self.img_href2img_bytes[file_name] = content
-
        self.logger.log("HTML files reading.")
        self.html_href2html_body_soup: Dict[str,
                                            BeautifulSoup] = self.build_href2soup_content()
@@ -76,6 +69,13 @@ class EpubConverter:
        self.logger.log("CSS styles fusion(inline+file).")
        self.add_css_styles_to_html_soup()

+        self.logger.log("Image processing.")
+        for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
+                       self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
+            file_name = x.file_name
+            content = x.content
+            self.img_href2img_bytes[file_name] = content
+
        self.logger.log("Footnotes processing.")
        for href in self.html_href2html_body_soup:
            self.footnotes_contents, self.noterefs, self.footnotes =\
--- a/src/epub_converter/epub_solver.py
+++ b/src/epub_converter/epub_solver.py
@@ -3,8 +3,8 @@ import codecs

 from src.book_solver import BookSolver
 from src.util.helpers import BookLogger
-from src.html_preprocessor import HtmlPreprocessor
-from src.style_preprocessor import StylePreprocessor
+from src.html_presets_processor import HtmlPresetsProcessor
+from src.style_reader import StyleReader
 from src.epub_converter.html_epub_processor import HtmlEpubProcessor
 from src.epub_converter.epub_converter import EpubConverter

@@ -30,16 +30,15 @@ class EpubBook(BookSolver):
            json for LiveCarta platform

        """
-        html_preprocessor = HtmlPreprocessor(
+        html_preprocessor = HtmlPresetsProcessor(
            logger=self.logger_object, preset_path="presets/epub_presets.json")
-        style_preprocessor = StylePreprocessor()
+        style_preprocessor = StyleReader()
        html_processor = HtmlEpubProcessor(logger=self.logger_object,
                                           html_preprocessor=html_preprocessor)
        json_converter = EpubConverter(
            self.book_path, access=self.access, logger=self.logger_object,
            style_processor=style_preprocessor, html_processor=html_processor)
        content_dict = json_converter.convert_to_dict()
-
        return content_dict


@@ -48,9 +47,9 @@ if __name__ == "__main__":
    logger_object = BookLogger(
        name="epub", book_id=epub_file_path.split("/")[-1])

-    html_preprocessor = HtmlPreprocessor(
+    html_preprocessor = HtmlPresetsProcessor(
        logger=logger_object, preset_path="../../presets/epub_presets.json")
-    style_preprocessor = StylePreprocessor()
+    style_preprocessor = StyleReader()
    html_processor = HtmlEpubProcessor(logger=logger_object,
                                       html_preprocessor=html_preprocessor)

--- a/src/epub_converter/html_epub_processor.py
+++ b/src/epub_converter/html_epub_processor.py
@@ -4,7 +4,7 @@ from bs4.element import PageElement
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment

 from src.util.helpers import BookLogger
-from src.html_preprocessor import _preprocess_html
+from src.html_presets_processor import _process_presets


 class HtmlEpubProcessor:
@@ -113,43 +113,6 @@ class HtmlEpubProcessor:
                    tag.extract()
                    return

-    @staticmethod
-    def _process_tables(chapter_tag: BeautifulSoup):
-        """
-        Function preprocesses tables and tags(td|th|tr)
-        Parameters
-        ----------
-        chapter_tag: BeautifulSoup
-            Tag & contents of the chapter tag
-
-        Returns
-        -------
-        NoReturn
-            Chapter Tag with processed tables
-
-        """
-        tables = chapter_tag.find_all("table")
-        for table in tables:
-            for t_tag in table.find_all(re.compile("td|th|tr")):
-                width = ""
-                if t_tag.get("style"):
-                    width_match = re.search(
-                        r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
-                    if width_match:
-                        size = width_match.group(1)
-                        width = size + "px"
-
-                t_tag.attrs["width"] = t_tag.get("width") or width
-
-                if t_tag.attrs.get("style"):
-                    t_tag.attrs["style"] = t_tag.attrs["style"].replace(
-                        "border:0;", "")
-                    if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
-                        del t_tag.attrs["style"]
-
-            if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
-                table.attrs["border"] = "1"
-
    @staticmethod
    def _class_removing(chapter_tag: BeautifulSoup):
        """
@@ -185,13 +148,13 @@ class HtmlEpubProcessor:
        ----------
        1. comments removal
        2. wrap NavigableString with tag <p>
-        3-6. wrap tags with <table>
+        3. heading removal
+        4. wrap tags with <table>
            replace tags with correspond LiveCarta tags
+            replace/remove attrs, values of attrs
            unwrap tags
            insert tags into correspond tags
-        7. heading removal
-        8. process_tables
-        9. class removal
+        5. class removal

        Returns
        -------
@@ -203,14 +166,12 @@ class HtmlEpubProcessor:
        self._remove_comments(chapter_tag)
        # 2.
        self._wrap_strings_with_p(chapter_tag)
-        # 3-6.
-        _preprocess_html(
-            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
-        # 7.
+        # 3.
        if remove_title_from_chapter:
            self._remove_headings_content(chapter_tag, title_str)
-        # 8.
-        self._process_tables(chapter_tag)
-        # 9. remove classes that weren't created by converter
+        # 4.
+        _process_presets(
+            html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
+        # 5. remove classes that weren't created by converter
        self._class_removing(chapter_tag)
        return chapter_tag