add color style processing

Delete the TOC at the end of the whole HTML parsing; fix searching for <a> tags in footnotes.
2020-09-23 15:20:53 +03:00
parent f392b6930d
commit bbfd489327
3 changed files with 28 additions and 35 deletions
--- a/src/book.py
+++ b/src/book.py
@@ -32,7 +32,7 @@ class Book:
                                        main_logger=main_logger)
        self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)
-        assert BookConfig.SUPPORTED_LEVELS == len(BookConfig.SUPPORTED_HEADERS), \
+        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."
    def save_docx(self, content):
--- a/src/html_preprocessor.py
+++ b/src/html_preprocessor.py
@@ -5,7 +5,7 @@ import re
 from shutil import copyfile
 from bs4 import BeautifulSoup, NavigableString
-from config import BookConfig, BookLogger, BookApiWrapper
+from config import LawCartaConfig, BookLogger, BookApiWrapper
 class HTMLPreprocessor:
@@ -49,8 +49,8 @@ class HTMLPreprocessor:
    @classmethod
    def convert_pt_to_px(cls, value):
        value = int(value)
-        if value == BookConfig.WORD_DEFAULT_FONT_SIZE:
+        if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
-            return BookConfig.LAWCARTA_DEFAULT_FONT_SIZE
+            return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
        else:
            return value
@@ -70,7 +70,7 @@ class HTMLPreprocessor:
        size = size.group(1)
        new_size = cls.convert_pt_to_px(size)
-        if new_size == BookConfig.LAWCARTA_DEFAULT_FONT_SIZE:
+        if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
            return ""
        return re.sub(size + "pt", str(new_size) + "px", style)
@@ -83,41 +83,39 @@ class HTMLPreprocessor:
        for font in fonts:
            face = font.get("face")
            style = font.get("style")
            color = font.get("color")
            font.attrs = {}
            font.name = "span"
            if style:
                style = self.convert_font_pt_to_px(style)
                if style != "":
                    if color and color != '#000000':
                        style += f'; color: {color};'
                    font.attrs["style"] = style
            elif color and color != '#000000':
                font.attrs["style"] = f'color: {color};'
            if face is not None:
                face = re.sub(r",[\w,\- ]*$", "", face)
-                if face != BookConfig.DEFAULT_FONT_NAME and BookConfig.font_correspondence_table.get(face):
+                if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
-                    font.attrs["face"] = BookConfig.font_correspondence_table[face]
+                    font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
                else:
-                    font.attrs["face"] = BookConfig.DEFAULT_FONT_NAME
+                    font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
            if len(font.attrs) == 0:
                font.unwrap()
        assert len(self.body_tag.find_all("font")) == 0  # on this step there should be no more <font> tags
-    def _remove_table_of_contents(self):
+    def delete_content_before_toc(self):
-        """
+        # replace toc with empty <TOC> tag
        Function to remove table of content from file.
        """
        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
        for table in tables:
            table.decompose()
    def _change_table_of_contents(self):
        self._change_table_of_contents()
        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
        for table in tables:
            table.wrap(self.html_soup.new_tag("TOC"))
            table.decompose()
-    def delete_content_before_toc(self):
+        # remove all tag upper the <TOC>
        toc_tag = self.html_soup.new_tag('TOC')
        if toc_tag in self.content:
            ind = self.content.index(toc_tag) + 1
@@ -131,14 +129,12 @@ class HTMLPreprocessor:
        self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))  # todo: check for another languages
        self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
        self._clean_tag('font', 'color', re.compile(r'^#[0-9a-fA-F]{6}$'))
        self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
        self._clean_tag("a", "name", "_GoBack")
        self._clean_underline_links()
        self._font_to_span()
        # self._remove_table_of_contents()
    def _process_paragraph(self):
        """
@@ -178,7 +174,7 @@ class HTMLPreprocessor:
            p.attrs = {}
            style = ''
-            if align is not None and align != BookConfig.DEFAULT_ALIGN_STYLE:
+            if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
                style += f'text-align: {align};'
            if indent is not None:
@@ -280,10 +276,6 @@ class HTMLPreprocessor:
            tag.string = tag.text.replace('\u200c', '')
            tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
        # %E2%80%8C
        for tag in a_tags_with_href:
            print(tag)
    @staticmethod
    def _clean_footnote_content(content):
        content = content.strip()
@@ -303,7 +295,8 @@ class HTMLPreprocessor:
        footnotes = []
        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
-            true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'})
+            true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0]
            if true_a_tag.attrs.get('href') is None:
                cont_tag.a.decompose()
                continue
@@ -439,7 +432,7 @@ class HTMLPreprocessor:
        """
        Function to convert all lower level headings to p tags
        """
-        pattern = f'^h[{BookConfig.SUPPORTED_LEVELS + 1}-9]$'
+        pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
        header_tags = self.body_tag.find_all(re.compile(pattern))
        for tag in header_tags:
            tag.name = 'p'
@@ -527,8 +520,8 @@ class HTMLPreprocessor:
            if title == "":
                tag.unwrap()
            else:
-                assert tag.name in BookConfig.SUPPORTED_HEADERS, \
+                assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
-                    f'Preprocessing went wrong, there is still h{BookConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
+                    f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
                # if tag.name in ["h4", "h5", "h6"]:
                #     tag.name = "h3" # All the lower level headings will be transformed to h3 headings
--- a/src/json_converter.py
+++ b/src/json_converter.py
@@ -4,7 +4,7 @@ import codecs
 import json
 from copy import copy
-from config import BookConfig
+from src.config import LawCartaConfig
 class JSONConverter:
@@ -34,7 +34,7 @@ class JSONConverter:
        :param ind: Index of header in content list.
        """
-        if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
+        if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
            title = self.content[ind].text
            curr_outline = int(re.sub(r"^h", "", self.content[ind].name))  # extract outline from tag
            result = {
@@ -47,7 +47,7 @@ class JSONConverter:
            while ind < len(self.content):
                # 1. next tag is a header
-                if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
+                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
                    outline = int(re.sub(r"^h", "", self.content[ind].name))
                    # - recursion step until h_i > h_initial
                    if outline > curr_outline:
@@ -100,13 +100,13 @@ class JSONConverter:
            while ind < len(self.content):
                res = {}
-                if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
+                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
                    res, ind = self.header_to_json(ind)
                else:
                    chapter_title = f'Untitled chapter {ch_num}'
                    chapter = []
-                    while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS:
+                    while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
                        if not self._is_empty_p_tag(self.content[ind]):
                            chapter.append(self.format_html(str(self.content[ind])))
                        ind += 1