Formatting

2022-06-01 16:23:53 +03:00
parent 5039417a0f
commit c0ef0b6d6e
13 changed files with 318 additions and 185 deletions
--- a/src/docx_converter/html_docx_preprocessor.py
+++ b/src/docx_converter/html_docx_preprocessor.py
@@ -21,13 +21,22 @@ class HTMLDocxPreprocessor:
        self.top_level_headers = None
        self.content = list()

-    def _clean_tag(self, tag, attr_name, attr_value):
+    def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
        """
        Function to clean tags by its name and attribute value.
+        Parameters
+        ----------
+        tag: str
+            tag name to clean
+        attr_name: str
+            attribute name
+        attr_value: str or compiled regex pattern
+            attribute value
+
+        Returns
+        -------
+        clean tag

-        :param tag: Tag name to clean.
-        :param attr_name: Attribute name.
-        :param attr_value: Attribute value.
        """
        tags = self.body_tag.find_all(tag, {attr_name: attr_value})
        for tag in tags:
@@ -56,12 +65,19 @@ class HTMLDocxPreprocessor:
            return value

    @classmethod
-    def convert_font_pt_to_px(cls, style):
+    def convert_font_pt_to_px(cls, style: str) -> str:
        """
-        Method converts point in the font-size to pixels.
+        Function converts point in the font-size to pixels.
+        Parameters
+        ----------
+        style: str
+            str with style to process
+
+        Returns
+        -------
+        : str
+            str with converted style

-        :param style: Str with style to process.
-        :return: Str with converted style.
        """
        size = re.search(r"font-size: (\d{1,3})pt", style)

@@ -77,7 +93,10 @@ class HTMLDocxPreprocessor:
        return re.sub(size + "pt", str(new_size) + "px", style)

    def _font_to_span(self):
-        """Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
+        """
+        Function to convert <font> tag to <span>.
+        If font style is default, then remove this tag.
+        """
        fonts = self.body_tag.find_all("font")
        for font in fonts:
            face = font.get("face")
@@ -105,7 +124,8 @@ class HTMLDocxPreprocessor:
            if len(font.attrs) == 0:
                font.unwrap()

-        assert len(self.body_tag.find_all("font")) == 0  # on this step there should be no more <font> tags
+        # on this step there should be no more <font> tags
+        assert len(self.body_tag.find_all("font")) == 0

    def delete_content_before_toc(self):
        # remove all tag upper the <TOC> only in content !!! body tag is not updated
@@ -116,11 +136,15 @@ class HTMLDocxPreprocessor:

    def clean_trash(self):
        """Function to remove all styles and tags we don't need."""
-        self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
-        self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))  # todo: check for another languages
-        self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
+        self._clean_tag('span', 'style', re.compile(
+            r'^background: #[0-9a-fA-F]{6}$'))
+        # todo: check for another languages
+        self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
+        self._clean_tag('span', 'style', re.compile(
+            '^letter-spacing: -?[\d\.]+pt$'))

-        self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
+        self._clean_tag('font', 'face', re.compile(
+            r'^Times New Roman[\w, ]+$'))

        self._clean_tag("a", "name", "_GoBack")
        self._clean_underline_links()
@@ -128,7 +152,8 @@ class HTMLDocxPreprocessor:
        self._font_to_span()

        # replace toc with empty <TOC> tag
-        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
+        tables = self.body_tag.find_all(
+            "div", id=re.compile(r'^Table of Contents\d+'))
        for table in tables:
            table.wrap(self.html_soup.new_tag("TOC"))
            table.decompose()
@@ -138,7 +163,7 @@ class HTMLDocxPreprocessor:
        paragraphs = self.body_tag.find_all('p')

        for p in paragraphs:
-            # libra converts some \n into <p> with 2 </br>
+            # libre converts some \n into <p> with 2 </br>
            # there we remove 1 unnecessary <br>
            brs = p.find_all('br')
            text = p.text
@@ -156,9 +181,11 @@ class HTMLDocxPreprocessor:
            if style:
                indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
                margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
-                margin_right = re.search(r'margin-right: ([\d\.]{1,4})in', style)
+                margin_right = re.search(
+                    r'margin-right: ([\d\.]{1,4})in', style)
                margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
-                margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style)
+                margin_bottom = re.search(
+                    r'margin-bottom: ([\d\.]{1,4})in', style)
            else:
                indent = None
                margin_left = None
@@ -195,6 +222,7 @@ class HTMLDocxPreprocessor:

    def _process_tables(self):
        """Function to process tables. Set "border" attribute."""
+
        tables = self.body_tag.find_all("table")
        for table in tables:
            tds = table.find_all("td")
@@ -258,21 +286,24 @@ class HTMLDocxPreprocessor:
                                   for x in has_i_tag_or_br]

                if all(has_i_tag_or_br) and is_zero_border:
-                    new_div = BeautifulSoup(features='lxml').new_tag('blockquote')
+                    new_div = BeautifulSoup(
+                        features='lxml').new_tag('blockquote')
                    for p in paragraphs:
                        new_div.append(p)

                    table.replaceWith(new_div)

    def _process_hrefs(self):
-        a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^.*http.+')})
+        a_tags_with_href = self.body_tag.find_all(
+            'a', {'href': re.compile('^.*http.+')})

        # remove char=end of file for some editors
        for tag in a_tags_with_href:
            tag.string = tag.text.replace('\u200c', '')
            tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')

-        a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')})
+        a_tags_with_href = self.body_tag.find_all(
+            'a', {'href': re.compile('^(?!#sdfootnote)')})
        for tag in a_tags_with_href:
            tag.string = tag.text.replace('\u200c', '')
            tag.string = tag.text.replace('\u200b', '')  # zero-width-space
@@ -286,23 +317,25 @@ class HTMLDocxPreprocessor:
    def _process_footnotes(self):
        """Function returns list of footnotes and delete them from html_soup."""
        footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
-        footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
+        footnote_content = self.body_tag.find_all(
+            'div', id=re.compile(r'^sdfootnote\d+$'))
        footnote_amt = len(footnote_anchors)

        assert footnote_amt == len(footnote_content), \
-            'Something went wrong with footnotes after libra conversion'
+            'Something went wrong with footnotes after libre conversion'

        footnotes = []

        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
-            true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0]
+            true_a_tag = cont_tag.find_all(
+                'a', class_=re.compile(r'^sdfootnote.+$'))[0]

            if true_a_tag.attrs.get('href') is None:
                cont_tag.a.decompose()
                continue

            assert anc_tag['name'] == true_a_tag['href'][1:], \
-                'Something went wrong with footnotes after libra conversion'
+                'Something went wrong with footnotes after libre conversion'

            new_tag = BeautifulSoup(features='lxml').new_tag('sup')
            new_tag['class'] = 'footnote-element'
@@ -355,8 +388,10 @@ class HTMLDocxPreprocessor:

        if len(img_tags):
            if access is None:
-                folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-                new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
+                folder_path = os.path.dirname(
+                    os.path.dirname(os.path.abspath(__file__)))
+                new_path = pathlib.Path(os.path.join(
+                    folder_path, f'json/img_{book_id}/'))
                new_path.mkdir(exist_ok=True)

            for img in img_tags:
@@ -370,10 +405,12 @@ class HTMLDocxPreprocessor:
                if access is not None:
                    link = access.send_image(img_path, doc_id=book_id)
                    img.attrs['src'] = link
-                    self.logger_object.log(f'{img_name} successfully uploaded.')
+                    self.logger_object.log(
+                        f'{img_name} successfully uploaded.')
                else:
                    img_size = os.path.getsize(img_path)
-                    self.logger_object.log(f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
+                    self.logger_object.log(
+                        f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
                    new_img_path = new_path / img_name
                    copyfile(img_path, new_img_path)
                    img.attrs["src"] = str(new_img_path)
@@ -408,7 +445,8 @@ class HTMLDocxPreprocessor:

    def _process_toc_links(self):
        """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
-        toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
+        toc_links = self.body_tag.find_all(
+            "a", {'name': re.compile(r'^_Toc\d+')})
        headers = [link.parent for link in toc_links]
        outline_level = "1"  # All the unknown outlines will be predicted as <h1>
        for tag in headers:
@@ -418,7 +456,8 @@ class HTMLDocxPreprocessor:
            elif tag.name == "p":
                exist_in_toc = self._check_parent_link_exist_in_toc(tag)
                if tag in self.body_tag.find_all("p") and exist_in_toc:
-                    new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
+                    new_tag = BeautifulSoup(
+                        features="lxml").new_tag("h" + outline_level)
                    text = tag.text
                    tag.replaceWith(new_tag)
                    new_tag.string = text
@@ -440,14 +479,16 @@ class HTMLDocxPreprocessor:
    @staticmethod
    def clean_tag_from_tabs(tag: NavigableString):
        cleaned = re.sub(r'(\s+)+', ' ', tag)
-        this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
+        this = BeautifulSoup.new_string(BeautifulSoup(
+            features="lxml"), cleaned, NavigableString)
        tag.replace_with(this)
        # print('input: ', repr(tag))
        # print('test: ', repr(cleaned))

    def clean_tag_from_numbering(self, tag):
        cleaned = self.clean_title_from_numbering(tag)
-        this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
+        this = BeautifulSoup.new_string(BeautifulSoup(
+            features="lxml"), cleaned, NavigableString)
        tag.replace_with(this)
        # print('input: ', repr(tag))
        # print('test: ', repr(cleaned))
@@ -484,7 +525,8 @@ class HTMLDocxPreprocessor:
        """
        headers_info = []
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
-        headers_outline = [int(re.sub(r"^h", "", tag.name)) for tag in header_tags]
+        headers_outline = [int(re.sub(r"^h", "", tag.name))
+                           for tag in header_tags]
        if headers_outline:
            top_level_outline = min(headers_outline)
            top_level_headers = [tag for tag in header_tags
@@ -518,13 +560,17 @@ class HTMLDocxPreprocessor:

        Assume  header(s) to be introduction if:
            1. one header not numbered, before 1 numbered header
-            2. it is first header from the top level list and it equals to 'introduction'
+            2. it is first header from the top level list and it equals to 'introduction'
+        Returns
+        -------
+        None
+            mark each top-level header with flag should_be_numbered = true/false

-        Result :
-        Mark each top-level header with flag should_be_numbered = true/false
        """
-        is_numbered_header = [header['is_numbered'] for header in self.top_level_headers]
-        is_title = [header['is_introduction'] for header in self.top_level_headers]
+        is_numbered_header = [header['is_numbered']
+                              for header in self.top_level_headers]
+        is_title = [header['is_introduction']
+                    for header in self.top_level_headers]

        first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
        second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
@@ -539,7 +585,19 @@ class HTMLDocxPreprocessor:
                self.top_level_headers[i]['should_be_numbered'] = True

    def _process_headings(self):
-        """Function to process tags <h>."""
+        """
+        Function to process tags <h>.
+        Steps
+        ----------
+        1. remove <b>, <span>
+        2. clean text in header from numbering and newlines
+
+        Returns
+        -------
+        None
+            processed <h> tags
+
+        """
        header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))

        # 1. remove <b>, <span>
@@ -581,36 +639,52 @@ class HTMLDocxPreprocessor:
                for i, item in enumerate(content):
                    if type(content[i]) is NavigableString:
                        cleaned = re.sub(r'(\s+)+', ' ', content[i])
-                        this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
+                        this = BeautifulSoup.new_string(BeautifulSoup(
+                            features="lxml"), cleaned, NavigableString)
                        content[i].replace_with(this)
                        content[i] = this
                    else:
-                        self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs)
+                        self.apply_func_to_last_child(
+                            content[i], self.clean_tag_from_tabs)

                content[0] = '' if content[0] == ' ' else content[0]
                content = [item for item in content if item != '']

                if type(content[0]) is NavigableString:
                    cleaned = self.clean_title_from_numbering(content[0])
-                    this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
+                    this = BeautifulSoup.new_string(BeautifulSoup(
+                        features="lxml"), cleaned, NavigableString)
                    content[0].replace_with(this)
                    content[0] = this
                else:
-                    self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering)
+                    self.apply_func_to_last_child(
+                        content[0], self.clean_tag_from_numbering)

    def _process_lists(self):
        """
-        Function to process tags <li>.
-        Unwrap <p> tags.
+        Function
+        - process tags <li>.
+        - unwrap <p> tags.
+        Parameters
+        ----------
+        body_tag: Tag, soup object
+
+        Returns
+        -------
+        None
+            unwrap <p> tags inside <li>
+
        """
+
        li_tags = self.body_tag.find_all("li")

-        for il_tag in li_tags:
-            il_tag.attrs.update(il_tag.p.attrs)
-            il_tag.p.unwrap()
+        for li_tag in li_tags:
+            li_tag.attrs.update(li_tag.p.attrs)
+            li_tag.p.unwrap()

-    def process_html(self, access, html_path, book_id):
+    def process_html(self, access=None, html_path='', book_id='local'):
        """Process html code to satisfy LiveCarta formatting."""
+        self.logger_object.log('Beginning of processing .html file.')
        try:
            self.logger_object.log(f'Processing TOC and headers.')
            self._process_toc_links()
@@ -628,18 +702,22 @@ class HTMLDocxPreprocessor:

            self.logger_object.log('Tables processing.')
            self._process_tables()
-            self.logger_object.log(f'{self.tables_amount} tables have been processed.')
+            self.logger_object.log(
+                f'{self.tables_amount} tables have been processed.')

            self.logger_object.log('Hrefs processing.')
            self._process_hrefs()

            self.logger_object.log('Footnotes processing.')
            self._process_footnotes()
-            self.logger_object.log(f'{len(self.footnotes)} footnotes have been processed.')
+            self.logger_object.log(
+                f'{len(self.footnotes)} footnotes have been processed.')

            self.logger_object.log('Image processing.')
-            self._process_images(access=access, html_path=html_path, book_id=book_id)
-            self.logger_object.log(f'{len(self.images)} images have been processed.')
+            self._process_images(
+                access=access, html_path=html_path, book_id=book_id)
+            self.logger_object.log(
+                f'{len(self.images)} images have been processed.')

            self._process_footer()
            self._process_div()
@@ -658,7 +736,8 @@ class HTMLDocxPreprocessor:
            self.delete_content_before_toc()

        except Exception as exc:
-            self.logger_object.log('Error has occurred while processing html.', logging.ERROR)
+            self.logger_object.log(
+                'Error has occurred while processing html.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            if self.status_wrapper:
                self.status_wrapper.set_error()