Put every step in its place

2022-09-09 15:14:14 +03:00
parent b716a2180c
commit 4e0d2067d7
1 changed files with 59 additions and 143 deletions
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -4,11 +4,10 @@ from typing import List, Tuple, Dict, Union
 from bs4 import BeautifulSoup, Tag, NavigableString

 from src.util.helpers import BookLogger
-from src.livecarta_config import LiveCartaConfig
-from src.html_preprocessor import _preprocess_html
+from src.html_presets_processor import _process_presets
 from src.docx_converter.image_processing import process_images
 from src.docx_converter.footnotes_processing import process_footnotes
-from src.tag_inline_style_processor import modify_html_soup_with_css_styles
+from src.inline_style_processor import modify_html_soup_with_css_styles


 class HtmlDocxProcessor:
@@ -18,6 +17,28 @@ class HtmlDocxProcessor:
        self.body_tag = self.html_soup.body
        self.html_preprocessor = html_preprocessor
        self.style_preprocessor = style_preprocessor
+        self.content: List[Tag] = []
+
+    def _font_to_span(self):
+        """Rename every <font> tag found under the body tag to <span>, in place."""
+        for font in self.body_tag.find_all("font"):
+            font.name = "span"
+
+    def _process_hrefs(self):
+        a_tags_with_href = self.body_tag.find_all(
+            "a", {"href": re.compile("^.*http.+")})
+
+        # some editors append U+200C (zero-width non-joiner); strip it from link text and href
+        for tag in a_tags_with_href:
+            tag.string = tag.text.replace("\u200c", "")  # NOTE: setting .string replaces all children
+            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")  # %-encoded U+200C
+
+        a_tags_with_href = self.body_tag.find_all(
+            "a", {"href": re.compile("^(?!#sdfootnote)")})  # hrefs not starting with "#sdfootnote"
+        for tag in a_tags_with_href:
+            tag.string = tag.text.replace("\u200c", "")
+            tag.string = tag.text.replace("\u200b", "")  # U+200B, zero-width space
+            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")

    def _process_toc_links(self):
        """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
@@ -50,95 +71,13 @@ class HtmlDocxProcessor:
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")

-    def _process_quotes(self):
-        """
-            Function to process block quotes.
-            After docx to html conversion block quotes are stored inside table with 1 cell.
-            All text is wrapped in a <i> tag.
-            Such tables will be replaced with <blockquote> tags.
-
-            <table cellpadding=\"7\" cellspacing=\"0\" width=\"614\">
-                <col width=\"600\"/>
-                <tr>
-                    <td width=\"600\">
-                        <p style=\"text-align: justify;\"><i>aaaaa</i></p>
-                        <p style=\"text-align: justify;\"><br/></p>
-                    </td>
-                </tr>
-            </table>
-
-        """
-        tables = self.body_tag.find_all("table")
-        for table in tables:
-            trs = table.find_all("tr")
-            tds = table.find_all("td")
-            if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
-                td = tds[0]
-                is_zero_border = "border: none;" in td.get("style")
-                paragraphs = td.find_all("p")
-                has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
-                has_i_tag_or_br = [x[0] is not None or x[1] is not None
-                                   for x in has_i_tag_or_br]
-
-                if all(has_i_tag_or_br) and is_zero_border:
-                    new_div = BeautifulSoup(
-                        features="lxml").new_tag("blockquote")
-                    for p in paragraphs:
-                        new_div.append(p)
-
-                    table.replaceWith(new_div)
-
-    def _process_tables(self):
-        """Function to process tables. Set "border" attribute."""
-        tables = self.body_tag.find_all("table")
-        for table in tables:
-            tds = table.find_all("td")
-            sizes = []
-            for td in tds:
-                style = td.get("style")
-                if style:
-                    match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
-                    if match:
-                        size = match.group(1)
-                        units = match.group(2)
-                        if units == "pt":
-                            value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
-                                if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
-                                else float(size)
-                            size = value
-                        sizes.append(float(size))
-                width = td.get("width")
-                td.attrs = {}
-                if width:
-                    td.attrs["width"] = width
-            if sizes:
-                border_size = sum(sizes) / len(sizes)
-                table.attrs["border"] = f"{border_size:.2}"
-        self.tables_amount = len(tables)
-
-    def _process_hrefs(self):
-        a_tags_with_href = self.body_tag.find_all(
-            "a", {"href": re.compile("^.*http.+")})
-
-        # remove char=end of file for some editors
-        for tag in a_tags_with_href:
-            tag.string = tag.text.replace("\u200c", "")
-            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
-
-        a_tags_with_href = self.body_tag.find_all(
-            "a", {"href": re.compile("^(?!#sdfootnote)")})
-        for tag in a_tags_with_href:
-            tag.string = tag.text.replace("\u200c", "")
-            tag.string = tag.text.replace("\u200b", "")  # zero-width-space
-            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
-
    def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]:
        """
        Function for gathering info about top-level chapters.

        Assume: _
            - Headers with the smallest outline(or digit in <h>) are top level chapters.
-            [ It is consistent with a recursive algorithm
+            [It is consistent with a recursive algorithm
            for saving content to a resulted json structure,
            which happens in  header_to_json()]

@@ -172,7 +111,8 @@ class HtmlDocxProcessor:
                    "is_introduction": is_introduction})
        return headers_info

-    def _mark_introduction_headers(self):
+    @staticmethod
+    def _mark_introduction_headers(top_level_headers: List[Dict[str, Union[str, bool]]]):
        """
        Function to find out:
        what header shouldn't be numbered and can be treated as introduction chapter
@@ -187,21 +127,21 @@ class HtmlDocxProcessor:

        """
        is_numbered_header = [header["is_numbered"]
-                              for header in self.top_level_headers]
+                              for header in top_level_headers]
        is_title = [header["is_introduction"]
-                    for header in self.top_level_headers]
+                    for header in top_level_headers]

        first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
        second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
        first_header_is_introduction = is_title and is_title[0]

        if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
-            self.top_level_headers[0]["should_be_numbered"] = False
-            for i in range(1, len(self.top_level_headers)):
-                self.top_level_headers[i]["should_be_numbered"] = True
+            top_level_headers[0]["should_be_numbered"] = False
+            for i in range(1, len(top_level_headers)):
+                top_level_headers[i]["should_be_numbered"] = True
        else:
-            for i in range(0, len(self.top_level_headers)):
-                self.top_level_headers[i]["should_be_numbered"] = True
+            for i in range(0, len(top_level_headers)):
+                top_level_headers[i]["should_be_numbered"] = True

    @staticmethod
    def clean_title_from_tabs(tag: NavigableString):
@@ -217,10 +157,8 @@ class HtmlDocxProcessor:
        """
        if type(tag) is NavigableString:
            func(tag)
-        else:
-            children = list(tag.children)
-            if children:
-                self.apply_func_to_last_child(children[0], func)
+        elif list(tag.children):
+            self.apply_func_to_last_child(list(tag.children)[0], func)

    def _process_headings(self):
        """
@@ -233,25 +171,20 @@ class HtmlDocxProcessor:
            processed <h> tags

        """
-        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
-
+        header_tags = self.body_tag.find_all(re.compile("^h[1-5]$"))
        # clean header from attrs and text in header from numbering and \n
        for h_tag in header_tags:
            h_tag.attrs = {}
+            for tag in h_tag.find_all():
+                tag.attrs = {}
            if h_tag.parent.name == "li":
                h_tag.parent.unwrap()
                while h_tag.parent.name == "ol":
                    h_tag.parent.unwrap()

            cleaned_title = re.sub(r"[\s\xa0]", " ", h_tag.text)
-            if cleaned_title == "":
-                h_tag.unwrap()
-            else:
-                assert h_tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
-                    f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
-
+            if cleaned_title != "":
                content = list(h_tag.children)
-
                # do not take into account rubbish empty tags like <a>, but don"t remove them
                content = [item for item in content if
                           (type(item) is not NavigableString and item.text != "")
@@ -270,11 +203,13 @@ class HtmlDocxProcessor:
                    else:
                        self.apply_func_to_last_child(
                            content[i], self.clean_title_from_tabs)
+            else:
+                h_tag.unwrap()
+

    def delete_content_before_toc(self):
        # remove all tag upper the <TOC> only in content !!! body tag is not updated
        toc_tag = self.html_soup.new_tag("TOC")
-        self.content: List[Tag] = self.body_tag.find_all(recursive=False)
        if toc_tag in self.content:
            ind = self.content.index(toc_tag) + 1
            self.content = self.content[ind:]
@@ -297,54 +232,35 @@ class HtmlDocxProcessor:
        modify_html_soup_with_css_styles(self.body_tag)

        self.logger.log("Image processing.")
-        self.images = process_images(access, path_to_html=html_path,
-                                     book_id=book_id, body_tag=self.body_tag)
+        images = process_images(access, path_to_html=html_path,
+                                book_id=book_id, body_tag=self.body_tag)
        self.logger.log(
-            f"{len(self.images)} images have been processed.")
+            f"{len(images)} images have been processed.")

        self.logger.log("Footnotes processing.")
-        self.footnotes: List[str] = process_footnotes(self.body_tag)
+        footnotes: List[str] = process_footnotes(self.body_tag)
        self.logger.log(
-            f"{len(self.footnotes)} footnotes have been processed.")
-
-        self.logger.log(f"Processing TOC and headers.")
-        self._process_toc_links()
-
-        self.logger.log(f"Preprocess Html using presets.")
-        _preprocess_html(html_preprocessor=self.html_preprocessor,
-                         html_soup=self.html_soup)
-
-        # CSS after html processing cause of <fonts> that aren't supported by html
-        self.logger.log("CSS inline style preprocessing.")
-        self.style_preprocessor.process_inline_styles_in_html_soup(
-            self.body_tag)
-
-        self.logger.log("CSS inline style processing.")
-        modify_html_soup_with_css_styles(self.body_tag)
-
-        # process main elements of the .html doc
-        self.logger.log(f"Processing main elements of html.")
-
-        self.logger.log("Block quotes processing.")
-        self._process_quotes()
-
-        self.logger.log("Tables processing.")
-        self._process_tables()
-        self.logger.log(
-            f"{self.tables_amount} tables have been processed.")
+            f"{len(footnotes)} footnotes have been processed.")

        self.logger.log("Hrefs processing.")
        self._process_hrefs()

-        self.top_level_headers: List[Dict[str, Union[str, bool]]]\
+        self.logger.log(f"TOC processing.")
+        self._process_toc_links()
+
+        top_level_headers: List[Dict[str, Union[str, bool]]]\
            = self._get_top_level_headers()
-        self._mark_introduction_headers()
+        self._mark_introduction_headers(top_level_headers)

        self._process_headings()

+        self.logger.log(f".html using presets processing.")
+        _process_presets(html_preprocessor=self.html_preprocessor,
+                         html_soup=self.html_soup)
+
+        self.content = self.body_tag.find_all(recursive=False)
        # delete text before table of content if exists
        self.delete_content_before_toc()

        self.logger.log("End of processing .html file.")
-
-        return self.content, self.footnotes, self.top_level_headers
+        return self.content, footnotes, top_level_headers