From ebb5f0802e71cd9e8894b1e56bf65b170b41dda2 Mon Sep 17 00:00:00 2001
From: Kiryl <you@example.com>
Date: Tue, 28 Sep 2021 13:37:37 +0300
Subject: [PATCH] Add 1-to-many HTML-to-CSS mapping + rename LawCartaConfig to LiveCartaConfig + fix bug 4635

---
 src/book_solver.py               |   4 +-
 src/css_reader.py                | 108 +++++++++++++++++--------------
 src/data_objects.py              |   6 +-
 src/epub_converter.py            |  21 +++---
 src/html_docx_preprocessor.py    |  28 ++++----
 src/html_epub_preprocessor.py    |   4 +-
 src/libra_html2json_converter.py |  10 +--
 src/livecarta_config.py          |   2 +-
 8 files changed, 101 insertions(+), 82 deletions(-)

diff --git a/src/book_solver.py b/src/book_solver.py
index 4c3d8f2..f4294d1 100644
--- a/src/book_solver.py
+++ b/src/book_solver.py
@@ -13,7 +13,7 @@ import os
 import pathlib
 from abc import abstractmethod, ABCMeta
 
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 from util.helpers import BookLogger, BookStatusWrapper
 
 
@@ -32,7 +32,7 @@ class BookSolver:
                                         main_logger=main_logger)
         self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
 
-        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
+        assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
             "Length of headers doesn't match allowed levels."
 
     def save_book_file(self, content):
diff --git a/src/css_reader.py b/src/css_reader.py
index 8d5c2aa..92f8814 100644
--- a/src/css_reader.py
+++ b/src/css_reader.py
@@ -9,7 +9,7 @@ from premailer import transform
 from itertools import takewhile
 from logging import CRITICAL
 
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 from util.color_reader import str2hex
 
 cssutils.log.setLevel(CRITICAL)
@@ -30,7 +30,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
 
 def convert_font_size(value):
     if 'pt' in value:
-        if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
+        if int(value.replace('pt', '')) == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
             return ''
         else:
             return value.replace('pt', 'px')
@@ -57,22 +57,27 @@ def convert_font_size(value):
         return ''
 
 def convert_indents(value):
-    if '-' not in value[0]:
         # 30px = 3.2% = 1.25em = 23pt
-        positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)')
-        has_style_attrs = re.search(positive_text_indent_regexp, value)
-        if has_style_attrs:
-            if has_style_attrs.group(1):
-                value = value.replace(has_style_attrs.group(1),
-                                    str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) +
-                                    '%')
-            # elif has_style_attrs.group(2):
-            #     value = value.replace(has_style_attrs.group(2),
-            #                         str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) +
-            #                         '%')
-        return value
-    else:
-        return ''
+    positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
+    has_style_attrs = re.search(positive_text_indent_regexp, value)
+    if has_style_attrs:
+        if has_style_attrs.group(1):
+            value = value.replace(has_style_attrs.group(1),
+                                str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
+                                'px')
+
+        elif has_style_attrs.group(2):
+            value = value.replace(has_style_attrs.group(2),
+                                str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
+                                'px')
+
+        elif has_style_attrs.group(4):
+            value = value.replace(has_style_attrs.group(4), '30px')
+
+        elif has_style_attrs.group(5):
+            value = value.replace(has_style_attrs.group(5),
+                                  str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(5))))))) + 'px')
+    return value
 """ 
 LIVECARTA_STYLE_ATTRS = { css property: value }
 
@@ -83,11 +88,11 @@ If property has not empty list, it means that only certain property-value combin
 LIVECARTA_STYLE_ATTRS = {
     'text-indent': [],
     'font-variant': ['small-caps'],
-    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
+    'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
     'align': [],  # ???
     'font': [],  # ???
-    'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
-                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
+    'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
+                    if x != LiveCartaConfig.DEFAULT_FONT_NAME],
     'font-size': [],
     'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
     'font-style': ['italic'],  # <i>
@@ -129,11 +134,11 @@ def get_text_color(x):
 
 
 LIVECARTA_STYLE_ATTRS_MAPPING = {
-    #'text-indent': convert_indents,
+    'text-indent': convert_indents,
     'font-variant': lambda x: x,
     'text-align': lambda x: x,
     'font': lambda x: '',
-    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x) or LawCartaConfig.font_correspondence_table.get(x.capitalize()),
+    'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
     'font-size': convert_font_size,
     'color': get_text_color,
     'background-color': get_bg_color,
@@ -145,7 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
     'border-bottom-width': lambda x: x if x != '0' else '',
     'list-style-type': lambda x: x if x in list_types else 'disc',
     'list-style-image': lambda x: 'disc',
-    'margin-left': lambda x: x
+    'margin-left': convert_indents
 }
 
 """
@@ -245,31 +250,46 @@ class TagStyleConverter:
     @staticmethod
     def convert_indentions_to_px(style):
         margin_left_regexp = re.compile(
-            r'(margin-left:( *-*\w+%*);*)')
+            r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)')
         text_indent_regexp = re.compile(
             r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
 
         has_margin_left = re.search(margin_left_regexp, style)
         has_text_indent = re.search(text_indent_regexp, style)
         # consider that 5% = 30px
-        if has_margin_left and has_text_indent:
-            num_ml = abs(int("".join(
-                filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
-            if has_text_indent.group(1):
-                num_ti = abs(int("".join(
-                    filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
-                style = style.replace(has_text_indent.group(1), 'text-indent: ' +
-                                      str(abs(num_ml - num_ti)) + 'px; ')
-                style = style.replace(has_margin_left.group(1), '')
-                return style
+        if has_margin_left:
+            hml_group = 0
+            num_ml = 0
+            if has_margin_left.group(1):
+                hml_group = has_margin_left.group(1)
+                num_ml = abs(int("".join(
+                    filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
 
-            elif has_text_indent.group(3):
-                num_ti = abs(int("".join(
-                    filter(str.isdigit, str(has_text_indent.group(4))))) * 6)
-                style = style.replace(has_text_indent.group(3), 'text-indent: ' +
-                                      str(abs(num_ml - num_ti)) + 'px; ')
-                style = style.replace(has_margin_left.group(1), '')
-                return style
+            elif has_margin_left.group(3):
+                hml_group = has_margin_left.group(3)
+                num_ml = abs(int("".join(
+                    filter(str.isdigit, str(has_margin_left.group(4))))))
+
+            if has_text_indent:
+                if has_text_indent.group(1):
+                    num_ti = abs(int("".join(
+                        filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
+                    style = style.replace(has_text_indent.group(1), 'text-indent: ' +
+                                          str(abs(num_ml - num_ti)) + 'px; ')
+                    style = style.replace(hml_group, '')
+                    return style
+
+                elif has_text_indent.group(3):
+                    num_ti = abs(int("".join(
+                        filter(str.isdigit, str(has_text_indent.group(4))))))
+                    style = style.replace(has_text_indent.group(3), 'text-indent: ' +
+                                          str(abs(num_ml - num_ti)) + 'px; ')
+                    style = style.replace(hml_group, '')
+                    return style
+
+            style = style.replace(hml_group, 'text-indent: ' +
+                                  str(abs(num_ml)) + 'px; ')
+            return style
 
         elif has_text_indent:
             if has_text_indent.group(1):
@@ -282,12 +302,6 @@ class TagStyleConverter:
                                       str("".join(
                                           filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
                 return style
-        elif has_margin_left:
-            num_ml = abs(int("".join(
-                filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
-            style = style.replace(has_margin_left.group(1), 'text-indent: ' +
-                              str(abs(num_ml)) + 'px; ')
-            return style
         return style
 
     def preprocess_style(self):
diff --git a/src/data_objects.py b/src/data_objects.py
index ebb62d5..fd0f2e5 100644
--- a/src/data_objects.py
+++ b/src/data_objects.py
@@ -2,7 +2,7 @@ import re
 from typing import Union
 
 from ebooklib.epub import Section, Link
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 
 """
 These are data structures which form mapping from NCX to python data structures.
@@ -64,14 +64,14 @@ class ChapterItem:
             for i in self.sub_items:
                 sub_dicts.append(i.to_dict(lvl + 1))
 
-        if lvl > LawCartaConfig.SUPPORTED_LEVELS:
+        if lvl > LiveCartaConfig.SUPPORTED_LEVELS:
             return {
                 "title": self.title,
                 "contents": [self.content] + [x['contents'] for x in sub_dicts],
                 "sub_items": []
             }
 
-        if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
+        if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
             return {
                 "title": self.title,
                 "contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
diff --git a/src/epub_converter.py b/src/epub_converter.py
index ead91d2..b86b13a 100644
--- a/src/epub_converter.py
+++ b/src/epub_converter.py
@@ -18,7 +18,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chap
     update_src_links_in_images, preprocess_footnotes
 
 from css_reader import clean_css, add_inline_style_to_html_soup
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 from util.helpers import BookLogger
 
 
@@ -107,6 +107,9 @@ class EpubConverter:
         return nodes
 
     def _read_css(self, css_href, html_path):
+        '''
+        Resolve css_href against the folder of html_path and read the stylesheet at the resulting path.
+        '''
         path_to_css_from_html = css_href
         html_folder = dirname(html_path)
         path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
@@ -117,8 +120,8 @@ class EpubConverter:
 
     def build_css_content(self):
         css_href2content, html_href2css_href = {}, {}
-        # html_href2css_href 1-to-1, todo: 1-to-many
-
+        html_href2css_href = defaultdict(list)
+        # html_href2css_href is 1-to-many: each HTML path maps to a list of CSS hrefs
         for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
             html_text = item.content
             html_path = item.file_name
@@ -127,13 +130,13 @@ class EpubConverter:
                 if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
                     continue
                 css_href = tag.attrs.get('href')
-                html_href2css_href[html_path] = css_href
+                html_href2css_href[html_path].append(css_href)
                 if css_href not in css_href2content:
                     css_href2content[css_href] = clean_css(self._read_css(css_href, html_path))
 
             for i, tag in enumerate(soup.find_all('style')):
                 css_content = tag.string
-                html_href2css_href[html_path] = f'href{i}'
+                html_href2css_href[html_path].append(f'href{i}')
                 css_href2content[f'href{i}'] = clean_css(css_content)
 
         return css_href2content, html_href2css_href
@@ -141,7 +144,9 @@ class EpubConverter:
     def add_css_styles2soup(self):
         for href in self.href2soup_html:
             if self.html_href2css_href.get(href):
-                css: str = self.css_href2content[self.html_href2css_href[href]]
+                css =''
+                for key in self.html_href2css_href[href]:
+                    css += self.css_href2content[key]
                 content: BeautifulSoup = self.href2soup_html[href]
                 content = add_inline_style_to_html_soup(content, css)
                 self.href2soup_html[href] = content
@@ -399,7 +404,7 @@ class EpubConverter:
                                                                    access=self.access,
                                                                    path2aws_path=self.old_image_path2_aws_path)
 
-        is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
+        is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
         title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
                                                                              remove_title_from_chapter=is_chapter)
 
@@ -442,7 +447,7 @@ if __name__ == "__main__":
 
     logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
 
-    json_converter = EpubConverter('../epub/9781634256063.epub',
+    json_converter = EpubConverter('../epub/index_with_html.epub',
                                    logger=logger_object)
     tmp = json_converter.convert_to_dict()
 
diff --git a/src/html_docx_preprocessor.py b/src/html_docx_preprocessor.py
index 989085c..c1acb5c 100644
--- a/src/html_docx_preprocessor.py
+++ b/src/html_docx_preprocessor.py
@@ -7,7 +7,7 @@ from typing import List
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 from util.helpers import BookLogger, BookStatusWrapper
 
 
@@ -52,8 +52,8 @@ class HTMLDocxPreprocessor:
     @classmethod
     def convert_pt_to_px(cls, value):
         value = float(value)
-        if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
-            return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
+        if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
+            return LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
         else:
             return value
 
@@ -73,7 +73,7 @@ class HTMLDocxPreprocessor:
         size = size.group(1)
         new_size = cls.convert_pt_to_px(size)
 
-        if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
+        if new_size == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
             return ""
 
         return re.sub(size + "pt", str(new_size) + "px", style)
@@ -93,18 +93,18 @@ class HTMLDocxPreprocessor:
             if style:
                 style = self.convert_font_pt_to_px(style)
                 if style != "":
-                    if color and color in LawCartaConfig.COLORS_MAP:
+                    if color and color in LiveCartaConfig.COLORS_MAP:
                         style += f'; color: {color};'
                     font.attrs["style"] = style
-            elif color and color in LawCartaConfig.COLORS_MAP:
+            elif color and color in LiveCartaConfig.COLORS_MAP:
                 font.attrs["style"] = f'color: {color};'
 
             if face is not None:
                 face = re.sub(r",[\w,\- ]*$", "", face)
-                if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
-                    font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
+                if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face):
+                    font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face]
                 else:
-                    font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
+                    font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
 
             if len(font.attrs) == 0:
                 font.unwrap()
@@ -182,12 +182,12 @@ class HTMLDocxPreprocessor:
             p.attrs = {}
             style = ''
 
-            if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
+            if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
                 style += f'text-align: {align};'
 
             if indent is not None or indent_should_be_added:
                 # indent = indent.group(1)
-                style += f'text-indent: {LawCartaConfig.INDENT};'
+                style += f'text-indent: {LiveCartaConfig.INDENT};'
 
             if style:
                 p.attrs['style'] = style
@@ -488,7 +488,7 @@ class HTMLDocxPreprocessor:
         """
         Function to convert all lower level headings to p tags
         """
-        pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
+        pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
         header_tags = self.body_tag.find_all(re.compile(pattern))
         for tag in header_tags:
             tag.name = 'p'
@@ -592,8 +592,8 @@ class HTMLDocxPreprocessor:
             if title == "":
                 tag.unwrap()
             else:
-                assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
-                    f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
+                assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
+                    f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
 
                 content = list(tag.children)
 
diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py
index 8689189..3065171 100644
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -6,7 +6,7 @@ from typing import List, Tuple
 from bs4 import BeautifulSoup, NavigableString, Tag, Comment
 
 from access import Access
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 
 
 def save_image_locally(img_file_path, img_content, book_id):
@@ -148,7 +148,7 @@ def _heading_tag2p_tag(body_tag):
     """
     Function to convert all lower level headings to p tags
     """
-    pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
+    pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
     header_tags = body_tag.find_all(re.compile(pattern))
     for tag in header_tags:
         tag.name = 'p'
diff --git a/src/libra_html2json_converter.py b/src/libra_html2json_converter.py
index 5c47d3e..9a39b93 100644
--- a/src/libra_html2json_converter.py
+++ b/src/libra_html2json_converter.py
@@ -2,7 +2,7 @@ import logging
 import re
 from copy import copy
 
-from livecarta_config import LawCartaConfig
+from livecarta_config import LiveCartaConfig
 
 
 class LibraHTML2JSONConverter:
@@ -32,7 +32,7 @@ class LibraHTML2JSONConverter:
 
         :param ind: Index of header in content list.
         """
-        if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
+        if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
             title = str(self.content[ind])
             title = title.replace(f'<{self.content[ind].name}>', '')
             title = title.replace(f'</{self.content[ind].name}>', '')
@@ -49,7 +49,7 @@ class LibraHTML2JSONConverter:
 
             while ind < len(self.content):
                 # 1. next tag is a header
-                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
+                if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
                     outline = int(re.sub(r"^h", "", self.content[ind].name))
                     # - recursion step until h_i > h_initial
                     if outline > curr_outline:
@@ -102,13 +102,13 @@ class LibraHTML2JSONConverter:
             while ind < len(self.content):
                 res = {}
 
-                if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
+                if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
                     res, ind = self.header_to_livecarta_chapter_item(ind)
 
                 else:
                     chapter_title = f'Untitled chapter {ch_num}'
                     chapter = []
-                    while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
+                    while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
                         if not self._is_empty_p_tag(self.content[ind]):
                             chapter.append(self.format_html(str(self.content[ind])))
                         ind += 1
diff --git a/src/livecarta_config.py b/src/livecarta_config.py
index 3820ce4..65a5426 100644
--- a/src/livecarta_config.py
+++ b/src/livecarta_config.py
@@ -1,5 +1,5 @@
 
-class LawCartaConfig:
+class LiveCartaConfig:
     SUPPORTED_LEVELS = 5
     SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
     HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}