From 9f067bb93d0b2b4c3217a465954d4c360da2f4f8 Mon Sep 17 00:00:00 2001
From: Kiryl <kiryl.miatselitsa@teqniksoft.com>
Date: Fri, 24 Jun 2022 17:10:43 +0300
Subject: [PATCH] Make structure of docx c as epub s

---
 src/docx_converter/footnotes_processing.py   |  73 +++++++++
 src/docx_converter/html_docx_preprocessor.py | 148 +++----------------
 src/docx_converter/image_processing.py       |  39 +++++
 3 files changed, 134 insertions(+), 126 deletions(-)
 create mode 100644 src/docx_converter/footnotes_processing.py
 create mode 100644 src/docx_converter/image_processing.py

diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py
new file mode 100644
index 0000000..84861d7
--- /dev/null
+++ b/src/docx_converter/footnotes_processing.py
@@ -0,0 +1,73 @@
+import re
+from bs4 import BeautifulSoup, NavigableString, Tag
+
+
+def _clean_footnote_content(content):
+    """Return the footnote markup without leading/trailing whitespace."""
+    return content.strip()
+
+
+def process_footnotes(body_tag):
+    """Collect footnotes as html strings and delete them from body_tag.
+
+    Each <a class="sdfootnoteanc"> anchor becomes a <sup> marker and each
+    <div id="sdfootnoteN"> is removed once its inner markup is collected.
+    Raises AssertionError when anchors and footnote bodies do not pair up.
+    """
+    mismatch_msg = 'Something went wrong with footnotes after libre conversion'
+    footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
+    footnote_content = body_tag.find_all(
+        'div', id=re.compile(r'^sdfootnote\d+$'))
+    # explicit raise: a bare assert would be stripped under `python -O`
+    if len(footnote_anchors) != len(footnote_content):
+        raise AssertionError(mismatch_msg)
+
+    footnotes = []
+    pairs = zip(footnote_anchors, footnote_content)
+    for number, (anc_tag, cont_tag) in enumerate(pairs, start=1):
+        true_a_tag = cont_tag.find_all(
+            'a', class_=re.compile(r'^sdfootnote.+$'))[0]
+
+        # no href to compare with the anchor name: drop the <a>, skip the pair
+        if true_a_tag.attrs.get('href') is None:
+            cont_tag.a.decompose()
+            continue
+
+        # href[1:] drops the first character before comparing with the name
+        if anc_tag['name'] != true_a_tag['href'][1:]:
+            raise AssertionError(mismatch_msg)
+
+        new_tag = BeautifulSoup(features='lxml').new_tag('sup')
+        new_tag['class'] = 'footnote-element'
+        new_tag['data-id'] = number
+        new_tag['id'] = f'footnote-{number}'
+        new_tag.string = '*'
+        anc_tag.replace_with(new_tag)
+
+        # extra digits in footnotes from documents downloaded from livecarta
+        first_p = cont_tag.find('p')
+        if first_p is not None:
+            sup = first_p.find('sup')
+            if sup and sup.text == true_a_tag.text:
+                sup.decompose()
+
+        for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
+            tag_a.decompose()
+
+        # remove font-size
+        for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
+            style = re.sub(r"font-size: \d+px", "", span.get('style'))
+            if style == '':
+                del span.attrs['style']
+            else:
+                span.attrs['style'] = style
+
+        # isinstance also skips subclasses (e.g. Comment) lacking decode_contents
+        parts = [str(child) if child.name == 'blockquote'
+                 else child.decode_contents()
+                 for child in cont_tag.children
+                 if not isinstance(child, NavigableString)]
+        footnotes.append(_clean_footnote_content(''.join(parts)))
+        cont_tag.decompose()
+
+    return footnotes
diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py
index 80d96a3..425fa10 100644
--- a/src/docx_converter/html_docx_preprocessor.py
+++ b/src/docx_converter/html_docx_preprocessor.py
@@ -1,14 +1,13 @@
-import os
 import re
 import logging
-import pathlib
 from typing import List
-from shutil import copyfile
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 
 from src.livecarta_config import LiveCartaConfig
 from src.util.helpers import BookLogger, BookStatusWrapper
+from src.docx_converter.footnotes_processing import process_footnotes
+from src.docx_converter.image_processing import process_images
 
 
 class HTMLDocxPreprocessor:
@@ -22,6 +21,7 @@ class HTMLDocxPreprocessor:
         self.content = list()
 
     def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
+        # todo regex
         """
         Function to clean tags by its name and attribute value.
         Parameters
@@ -44,6 +44,7 @@ class HTMLDocxPreprocessor:
                 tag.unwrap()
 
     def _clean_underline_links(self):
+        # todo regex
         """Function cleans meaningless <u> tags before links."""
         underlines = self.body_tag.find_all("u")
         for u in underlines:
@@ -99,12 +100,10 @@ class HTMLDocxPreprocessor:
         """
         fonts = self.body_tag.find_all("font")
         for font in fonts:
-            face = font.get("face")
-            style = font.get("style")
-            color = font.get("color")
+            face, style, color =\
+                font.get("face"), font.get("style"), font.get("color")
 
-            font.attrs = {}
-            font.name = "span"
+            font.attrs, font.name = {}, "span"
             if style:
                 style = self.convert_font_pt_to_px(style)
                 if style != "":
@@ -127,14 +126,8 @@ class HTMLDocxPreprocessor:
         # on this step there should be no more <font> tags
         assert len(self.body_tag.find_all("font")) == 0
 
-    def delete_content_before_toc(self):
-        # remove all tag upper the <TOC> only in content !!! body tag is not updated
-        toc_tag = self.html_soup.new_tag('TOC')
-        if toc_tag in self.content:
-            ind = self.content.index(toc_tag) + 1
-            self.content = self.content[ind:]
-
     def clean_trash(self):
+        # todo make it regex dict
         """Function to remove all styles and tags we don't need."""
         self._clean_tag('span', 'style', re.compile(
             r'^background: #[\da-fA-F]{6}$'))
@@ -308,115 +301,8 @@ class HTMLDocxPreprocessor:
             tag.string = tag.text.replace('\u200b', '')  # zero-width-space
             tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
 
-    @staticmethod
-    def _clean_footnote_content(content):
-        content = content.strip()
-        return content.strip()
-
-    def _process_footnotes(self):
-        """Function returns list of footnotes and delete them from html_soup."""
-        footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
-        footnote_content = self.body_tag.find_all(
-            'div', id=re.compile(r'^sdfootnote\d+$'))
-        footnote_amt = len(footnote_anchors)
-
-        assert footnote_amt == len(footnote_content), \
-            'Something went wrong with footnotes after libre conversion'
-
-        footnotes = []
-
-        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
-            true_a_tag = cont_tag.find_all(
-                'a', class_=re.compile(r'^sdfootnote.+$'))[0]
-
-            if true_a_tag.attrs.get('href') is None:
-                cont_tag.a.decompose()
-                continue
-
-            assert anc_tag['name'] == true_a_tag['href'][1:], \
-                'Something went wrong with footnotes after libre conversion'
-
-            new_tag = BeautifulSoup(features='lxml').new_tag('sup')
-            new_tag['class'] = 'footnote-element'
-            new_tag['data-id'] = i + 1
-            new_tag['id'] = f'footnote-{i + 1}'
-            new_tag.string = '*'
-            anc_tag.replace_with(new_tag)
-
-            # extra digits in footnotes from documents downloaded from livecarta
-            a_text = true_a_tag.text
-            if len(cont_tag.find_all('p')):
-                sup = cont_tag.find_all('p')[0].find('sup')
-                if sup and sup.text == a_text:
-                    sup.decompose()
-
-            for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
-                tag_a.decompose()
-
-            # remove font-size
-            for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
-                style = span.get('style')
-                style = re.sub(r"font-size: \d+px", "", style)
-                if style == '':
-                    del span.attrs['style']
-                else:
-                    span.attrs['style'] = style
-
-            unicode_string = ''
-            for child in cont_tag.children:
-                if type(child) is NavigableString:
-                    continue
-                if child.name == 'blockquote':
-                    unicode_string += str(child)
-                else:
-                    unicode_string += child.decode_contents()
-
-            content = self._clean_footnote_content(unicode_string)
-            cont_tag.decompose()
-
-            footnotes.append(content)
-
-        self.footnotes = footnotes
-
-    def _process_images(self, access, html_path, book_id):
-        """
-        Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
-        For now images are moved to one folder.
-        """
-        img_tags = self.body_tag.find_all('img')
-
-        if len(img_tags):
-            if access is None:
-                folder_path = os.path.dirname(
-                    os.path.dirname(os.path.abspath(__file__)))
-                new_path = pathlib.Path(os.path.join(
-                    folder_path, f'json/img_{book_id}/'))
-                new_path.mkdir(exist_ok=True)
-
-            for img in img_tags:
-                img_name = img.attrs.get('src')
-                # quick fix for bad links
-                if (len(img_name) >= 3) and img_name[:3] == '../':
-                    img_name = img_name[3:]
-
-                img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
-
-                if access is not None:
-                    link = access.send_image(img_path, doc_id=book_id)
-                    img.attrs['src'] = link
-                    self.logger_object.log(
-                        f'{img_name} successfully uploaded.')
-                else:
-                    img_size = os.path.getsize(img_path)
-                    self.logger_object.log(
-                        f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
-                    new_img_path = new_path / img_name
-                    copyfile(img_path, new_img_path)
-                    img.attrs["src"] = str(new_img_path)
-
-        self.images = img_tags
-
     def _process_footer(self):
+        # todo regex
         """
         Function to process <div title="footer"> tags.
         All the tags will be deleted from file.
@@ -426,6 +312,7 @@ class HTMLDocxPreprocessor:
             div.decompose()
 
     def _process_div(self):
+        # todo regex
         """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
         divs = self.body_tag.find_all("div")
 
@@ -505,6 +392,7 @@ class HTMLDocxPreprocessor:
                 self.apply_func_to_last_child(children[0], func)
 
     def _preprocessing_headings(self):
+        # todo regex
         """Function to convert all lower level headings to p tags"""
         pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
         header_tags = self.body_tag.find_all(re.compile(pattern))
@@ -584,6 +472,7 @@ class HTMLDocxPreprocessor:
                 self.top_level_headers[i]['should_be_numbered'] = True
 
     def _process_headings(self):
+        # todo regex
         """
         Function to process tags <h>.
         Steps
@@ -660,6 +549,7 @@ class HTMLDocxPreprocessor:
                         content[0], self.clean_tag_from_numbering)
 
     def _process_lists(self):
+        # todo regex
         """
         Function
         - process tags <li>.
@@ -678,6 +568,13 @@ class HTMLDocxPreprocessor:
             li_tag.attrs.update(li_tag.p.attrs)
             li_tag.p.unwrap()
 
+    def delete_content_before_toc(self):
+        """Cut self.content down to what follows <TOC>; body_tag is not updated."""
+        marker = self.html_soup.new_tag('TOC')
+        if marker not in self.content:
+            return
+        self.content = self.content[self.content.index(marker) + 1:]
+
     def process_html(self, access=None, html_path='', book_id='local'):
         """Process html code to satisfy LiveCarta formatting."""
         self.logger_object.log('Beginning of processing .html file.')
@@ -705,13 +602,12 @@ class HTMLDocxPreprocessor:
             self._process_hrefs()
 
             self.logger_object.log('Footnotes processing.')
-            self._process_footnotes()
+            self.footnotes = process_footnotes(self.body_tag)
             self.logger_object.log(
                 f'{len(self.footnotes)} footnotes have been processed.')
 
             self.logger_object.log('Image processing.')
-            self._process_images(
-                access=access, html_path=html_path, book_id=book_id)
+            self.images = process_images(self.body_tag, access=access, html_path=html_path, book_id=book_id)
             self.logger_object.log(
                 f'{len(self.images)} images have been processed.')
 
diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py
new file mode 100644
index 0000000..923a274
--- /dev/null
+++ b/src/docx_converter/image_processing.py
@@ -0,0 +1,39 @@
+import os
+import logging
+import pathlib
+from shutil import copyfile
+
+
+def process_images(body_tag, access, html_path, book_id):
+    """
+    Point every <img> of body_tag at its stored copy and return the tags.
+    With `access`, src becomes the link returned by access.send_image();
+    otherwise the file is copied to ../json/img_<book_id>/ (relative to this
+    module's directory). <img> tags with a missing or empty src are skipped.
+    """
+    img_tags = body_tag.find_all('img')
+
+    if img_tags and access is None:
+        base_dir = pathlib.Path(os.path.abspath(__file__)).parent.parent
+        new_path = base_dir / f'json/img_{book_id}/'
+        # parents=True: do not fail when the json/ folder is missing
+        new_path.mkdir(parents=True, exist_ok=True)
+
+    for img in img_tags:
+        img_name = img.attrs.get('src')
+        if not img_name:
+            continue
+        # quick fix for bad links
+        if img_name.startswith('../'):
+            img_name = img_name[3:]
+        # Path() wrapper: html_path may arrive as a plain str
+        img_path = pathlib.Path(html_path).parent / img_name
+
+        if access is not None:
+            img.attrs['src'] = access.send_image(img_path, doc_id=book_id)
+        else:
+            new_img_path = new_path / img_name
+            copyfile(img_path, new_img_path)
+            img.attrs["src"] = str(new_img_path)
+
+    return img_tags
\ No newline at end of file