Put every step in its place

2022-09-09 15:14:14 +03:00
parent b716a2180c
commit 4e0d2067d7
1 changed files with 59 additions and 143 deletions
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -4,11 +4,10 @@ from typing import List, Tuple, Dict, Union
 from bs4 import BeautifulSoup, Tag, NavigableString
 from src.util.helpers import BookLogger
-from src.livecarta_config import LiveCartaConfig
+from src.html_presets_processor import _process_presets
 from src.html_preprocessor import _preprocess_html
 from src.docx_converter.image_processing import process_images
 from src.docx_converter.footnotes_processing import process_footnotes
-from src.tag_inline_style_processor import modify_html_soup_with_css_styles
+from src.inline_style_processor import modify_html_soup_with_css_styles
 class HtmlDocxProcessor:
@@ -18,6 +17,28 @@ class HtmlDocxProcessor:
        self.body_tag = self.html_soup.body
        self.html_preprocessor = html_preprocessor
        self.style_preprocessor = style_preprocessor
        self.content: List[Tag] = []
    def _font_to_span(self):
        for font in self.body_tag.find_all("font"):
            font.name = "span"
    def _process_hrefs(self):
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^.*http.+")})
        # remove char=end of file for some editors
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^(?!#sdfootnote)")})
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag.string = tag.text.replace("\u200b", "")  # zero-width-space
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
    def _process_toc_links(self):
        """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
@@ -50,88 +71,6 @@ class HtmlDocxProcessor:
                                f"Check the structure of the file."
                                f"Tag name: {tag.name}")
    def _process_quotes(self):
        """
            Function to process block quotes.
            After docx to html conversion block quotes are stored inside table with 1 cell.
            All text is wrapped in a <i> tag.
            Such tables will be replaced with <blockquote> tags.
            <table cellpadding=\"7\" cellspacing=\"0\" width=\"614\">
                <col width=\"600\"/>
                <tr>
                    <td width=\"600\">
                        <p style=\"text-align: justify;\"><i>aaaaa</i></p>
                        <p style=\"text-align: justify;\"><br/></p>
                    </td>
                </tr>
            </table>
        """
        tables = self.body_tag.find_all("table")
        for table in tables:
            trs = table.find_all("tr")
            tds = table.find_all("td")
            if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
                td = tds[0]
                is_zero_border = "border: none;" in td.get("style")
                paragraphs = td.find_all("p")
                has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
                has_i_tag_or_br = [x[0] is not None or x[1] is not None
                                   for x in has_i_tag_or_br]
                if all(has_i_tag_or_br) and is_zero_border:
                    new_div = BeautifulSoup(
                        features="lxml").new_tag("blockquote")
                    for p in paragraphs:
                        new_div.append(p)
                    table.replaceWith(new_div)
    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
        tables = self.body_tag.find_all("table")
        for table in tables:
            tds = table.find_all("td")
            sizes = []
            for td in tds:
                style = td.get("style")
                if style:
                    match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
                    if match:
                        size = match.group(1)
                        units = match.group(2)
                        if units == "pt":
                            value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
                                if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
                                else float(size)
                            size = value
                        sizes.append(float(size))
                width = td.get("width")
                td.attrs = {}
                if width:
                    td.attrs["width"] = width
            if sizes:
                border_size = sum(sizes) / len(sizes)
                table.attrs["border"] = f"{border_size:.2}"
        self.tables_amount = len(tables)
    def _process_hrefs(self):
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^.*http.+")})
        # remove char=end of file for some editors
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
        a_tags_with_href = self.body_tag.find_all(
            "a", {"href": re.compile("^(?!#sdfootnote)")})
        for tag in a_tags_with_href:
            tag.string = tag.text.replace("\u200c", "")
            tag.string = tag.text.replace("\u200b", "")  # zero-width-space
            tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
    def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]:
        """
        Function for gathering info about top-level chapters.
@@ -172,7 +111,8 @@ class HtmlDocxProcessor:
                    "is_introduction": is_introduction})
        return headers_info
-    def _mark_introduction_headers(self):
+    @staticmethod
    def _mark_introduction_headers(top_level_headers: List[Dict[str, Union[str, bool]]]):
        """
        Function to find out:
        what header shouldn't be numbered and can be treated as introduction chapter
@@ -187,21 +127,21 @@ class HtmlDocxProcessor:
        """
        is_numbered_header = [header["is_numbered"]
-                              for header in self.top_level_headers]
+                              for header in top_level_headers]
        is_title = [header["is_introduction"]
-                    for header in self.top_level_headers]
+                    for header in top_level_headers]
        first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
        second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
        first_header_is_introduction = is_title and is_title[0]
        if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
-            self.top_level_headers[0]["should_be_numbered"] = False
+            top_level_headers[0]["should_be_numbered"] = False
-            for i in range(1, len(self.top_level_headers)):
+            for i in range(1, len(top_level_headers)):
-                self.top_level_headers[i]["should_be_numbered"] = True
+                top_level_headers[i]["should_be_numbered"] = True
        else:
-            for i in range(0, len(self.top_level_headers)):
+            for i in range(0, len(top_level_headers)):
-                self.top_level_headers[i]["should_be_numbered"] = True
+                top_level_headers[i]["should_be_numbered"] = True
    @staticmethod
    def clean_title_from_tabs(tag: NavigableString):
@@ -217,10 +157,8 @@ class HtmlDocxProcessor:
        """
        if type(tag) is NavigableString:
            func(tag)
-        else:
+        elif list(tag.children):
-            children = list(tag.children)
+            self.apply_func_to_last_child(list(tag.children)[0], func)
            if children:
                self.apply_func_to_last_child(children[0], func)
    def _process_headings(self):
        """
@@ -233,25 +171,20 @@ class HtmlDocxProcessor:
            processed <h> tags
        """
-        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
+        header_tags = self.body_tag.find_all(re.compile("^h[1-5]$"))
        # clean header from attrs and text in header from numbering and \n
        for h_tag in header_tags:
            h_tag.attrs = {}
            for tag in h_tag.find_all():
                tag.attrs = {}
            if h_tag.parent.name == "li":
                h_tag.parent.unwrap()
                while h_tag.parent.name == "ol":
                    h_tag.parent.unwrap()
            cleaned_title = re.sub(r"[\s\xa0]", " ", h_tag.text)
-            if cleaned_title == "":
+            if cleaned_title != "":
                h_tag.unwrap()
            else:
                assert h_tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
                    f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
                content = list(h_tag.children)
                # do not take into account rubbish empty tags like <a>, but don"t remove them
                content = [item for item in content if
                           (type(item) is not NavigableString and item.text != "")
@@ -270,11 +203,13 @@ class HtmlDocxProcessor:
                    else:
                        self.apply_func_to_last_child(
                            content[i], self.clean_title_from_tabs)
            else:
                h_tag.unwrap()
    def delete_content_before_toc(self):
        """
        Function to fill self.content with the direct children of the body
        and drop everything up to and including the <TOC> marker.

        If no matching <TOC> tag is among the direct children, self.content
        keeps all of them.
        """
        # Only self.content is trimmed: the tags above <TOC> are NOT removed from body_tag.
        toc_tag = self.html_soup.new_tag("TOC")
        self.content: List[Tag] = self.body_tag.find_all(recursive=False)
        # NOTE(review): the membership test compares against a freshly created, empty
        # <TOC> tag, so it presumably relies on bs4 tag equality matching an existing
        # empty <TOC> element — confirm.
        if toc_tag in self.content:
            # keep only what follows the first matching <TOC> tag
            ind = self.content.index(toc_tag) + 1
            self.content = self.content[ind:]
@@ -297,54 +232,35 @@ class HtmlDocxProcessor:
        modify_html_soup_with_css_styles(self.body_tag)
        self.logger.log("Image processing.")
-        self.images = process_images(access, path_to_html=html_path,
+        images = process_images(access, path_to_html=html_path,
                                book_id=book_id, body_tag=self.body_tag)
        self.logger.log(
-            f"{len(self.images)} images have been processed.")
+            f"{len(images)} images have been processed.")
        self.logger.log("Footnotes processing.")
-        self.footnotes: List[str] = process_footnotes(self.body_tag)
+        footnotes: List[str] = process_footnotes(self.body_tag)
        self.logger.log(
-            f"{len(self.footnotes)} footnotes have been processed.")
+            f"{len(footnotes)} footnotes have been processed.")
        self.logger.log(f"Processing TOC and headers.")
        self._process_toc_links()
        self.logger.log(f"Preprocess Html using presets.")
        _preprocess_html(html_preprocessor=self.html_preprocessor,
                         html_soup=self.html_soup)
        # CSS after html processing cause of <fonts> that aren't supported by html
        self.logger.log("CSS inline style preprocessing.")
        self.style_preprocessor.process_inline_styles_in_html_soup(
            self.body_tag)
        self.logger.log("CSS inline style processing.")
        modify_html_soup_with_css_styles(self.body_tag)
        # process main elements of the .html doc
        self.logger.log(f"Processing main elements of html.")
        self.logger.log("Block quotes processing.")
        self._process_quotes()
        self.logger.log("Tables processing.")
        self._process_tables()
        self.logger.log(
            f"{self.tables_amount} tables have been processed.")
        self.logger.log("Hrefs processing.")
        self._process_hrefs()
-        self.top_level_headers: List[Dict[str, Union[str, bool]]]\
+        self.logger.log(f"TOC processing.")
        self._process_toc_links()
        top_level_headers: List[Dict[str, Union[str, bool]]]\
            = self._get_top_level_headers()
-        self._mark_introduction_headers()
+        self._mark_introduction_headers(top_level_headers)
        self._process_headings()
        self.logger.log(f".html using presets processing.")
        _process_presets(html_preprocessor=self.html_preprocessor,
                         html_soup=self.html_soup)
        self.content = self.body_tag.find_all(recursive=False)
        # delete text before table of content if exists
        self.delete_content_before_toc()
        self.logger.log("End of processing .html file.")
-
+        return self.content, footnotes, top_level_headers
        return self.content, self.footnotes, self.top_level_headers