converter: just renaming file and functions

2021-04-14 14:15:34 +03:00
parent 81ccbf5af3
commit a353f0346f
2 changed files with 8 additions and 8 deletions
--- a/src/json_postprocessor.py
+++ b/src/json_postprocessor.py
@@ -0,0 +1,147 @@
+import logging
+import re
+from copy import copy
+
+from config import LawCartaConfig
+
+
+class JSONConverter:
+    """
+    Converts a flat list of html nodes into a nested chapter structure.
+
+    NOTE(review): nodes look like BeautifulSoup tags (name/find_all/text are used) - confirm.
+    """
+
+    def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
+        self.content_dict = None  # filled in by convert_to_dict()
+        self.content = content
+        self.footnotes = footnotes
+        self.top_level_headers = top_level_headers
+        self.logger_object = logger_object
+        self.book_api_status = book_api_status
+
+    @staticmethod
+    def format_html(html_text):
+        """
+        Function to replace newline and tab symbols in html code with spaces.
+
+        :param html_text: Text to process.
+        :return: Cleaned text.
+        """
+        return re.sub(r'[\n\t]', ' ', html_text)
+
+    # TODO: rethink the function structure without indexes.
+    def header_to_livecarta_chapter_item(self, ind) -> (dict, int):
+        """
+        Function process header and collects all content for it.
+
+        Nodes after the header are gathered until a header of the same or a higher
+        level (or the end of content) is met; deeper headers are processed
+        recursively and stored in 'sub_items'.
+
+        :param ind: Index of header in content list.
+        :return: Tuple of chapter dict ('title', 'contents', 'sub_items') and index of
+            the first unprocessed node; '' if the node at ind is not a supported header.
+        """
+        header = self.content[ind]
+        if header.name not in LawCartaConfig.SUPPORTED_HEADERS:
+            return ''  # legacy result; both callers in this class pass header indexes only
+
+        title = str(header).replace(f'<{header.name}>', '').replace(f'</{header.name}>', '')
+        title = re.sub(r'^\n', '', title)
+        curr_outline = int(re.sub(r"^h", "", header.name))  # extract outline from tag
+        result = {'title': title, 'contents': [], 'sub_items': []}
+        ch_content = []
+        ind += 1
+
+        while ind < len(self.content):
+            node = self.content[ind]
+            if node.name in LawCartaConfig.SUPPORTED_HEADERS:
+                # current h_i <= h_initial, end of recursion
+                if int(re.sub(r"^h", "", node.name)) <= curr_outline:
+                    break
+                # recursion step: deeper header becomes a sub item
+                header_dict, ind = self.header_to_livecarta_chapter_item(ind)
+                if ch_content:
+                    result['contents'].append("".join(ch_content))
+                    ch_content = []
+                result['sub_items'].append(header_dict)
+            else:
+                # not a header: add new paragraph to the current chapter
+                ch_content.append(self.format_html(str(node)))
+                ind += 1
+
+        if ch_content:
+            result['contents'].append("".join(ch_content))
+        return result, ind
+
+    @staticmethod
+    def _is_empty_p_tag(tag):
+        """
+        Function checks whether the tag is a <p> without text.
+
+        :param tag: Node to check.
+        :return: True if tag is <p> whose text is empty after removing whitespace.
+        """
+        if tag.name != 'p':
+            return False
+
+        temp_tag = copy(tag)  # <br> tags are removed from a copy, not from the node itself
+        for br in temp_tag.find_all('br'):
+            br.decompose()
+        return not re.sub(r'\s+', '', temp_tag.text)
+
+    def convert_to_dict(self):
+        """
+        Function which convert list of html nodes to appropriate json structure.
+
+        Nodes outside of any header are joined into an 'Untitled chapter N' item
+        (empty <p> tags are skipped). The result is also stored in content_dict.
+
+        :return: Dict with 'content' (list of chapters) and 'footnotes'.
+        :raises Exception: Any error from building the structure is logged and re-raised.
+        """
+        json_strc = []
+        ind = 0
+        ch_num = 0
+
+        try:
+            while ind < len(self.content):
+                res = {}
+                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
+                    res, ind = self.header_to_livecarta_chapter_item(ind)
+                else:
+                    chapter = []
+                    while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
+                        if not self._is_empty_p_tag(self.content[ind]):
+                            chapter.append(self.format_html(str(self.content[ind])))
+                        ind += 1
+                    if chapter:
+                        res = {
+                            'title': f'Untitled chapter {ch_num}',
+                            'contents': ["".join(chapter)],
+                            'sub_items': []
+                        }
+                        ch_num += 1
+
+                if res:
+                    json_strc.append(res)
+                    self.logger_object.log(f'Chapter {len(json_strc)} has been added to structure.')
+        except Exception:
+            self.logger_object.log('Error has occurred while making json structure.', logging.ERROR)
+            self.logger_object.log_error_to_main_log()
+            if self.book_api_status:
+                self.book_api_status.set_error_status()
+            raise  # bare raise keeps the original traceback
+
+        # Add is_introduction field to json structure.
+        # Chapters can be missing (e.g. deleted with content before toc), so guard json_strc[0].
+        if self.top_level_headers and json_strc:
+            json_strc[0]['is_introduction'] = not self.top_level_headers[0]['should_be_numbered']
+
+        self.content_dict = {
+            "content": json_strc,
+            "footnotes": self.footnotes
+        }
+
+        return self.content_dict