From ebb5f0802e71cd9e8894b1e56bf65b170b41dda2 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 28 Sep 2021 13:37:37 +0300 Subject: [PATCH] Add 1-many css + Fix bug 4635 --- src/book_solver.py | 4 +- src/css_reader.py | 108 +++++++++++++++++-------------- src/data_objects.py | 6 +- src/epub_converter.py | 21 +++--- src/html_docx_preprocessor.py | 28 ++++---- src/html_epub_preprocessor.py | 4 +- src/libra_html2json_converter.py | 10 +-- src/livecarta_config.py | 2 +- 8 files changed, 101 insertions(+), 82 deletions(-) diff --git a/src/book_solver.py b/src/book_solver.py index 4c3d8f2..f4294d1 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -13,7 +13,7 @@ import os import pathlib from abc import abstractmethod, ABCMeta -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig from util.helpers import BookLogger, BookStatusWrapper @@ -32,7 +32,7 @@ class BookSolver: main_logger=main_logger) self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id) - assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \ + assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." def save_book_file(self, content): diff --git a/src/css_reader.py b/src/css_reader.py index 8d5c2aa..92f8814 100644 --- a/src/css_reader.py +++ b/src/css_reader.py @@ -9,7 +9,7 @@ from premailer import transform from itertools import takewhile from logging import CRITICAL -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig from util.color_reader import str2hex cssutils.log.setLevel(CRITICAL) @@ -30,7 +30,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal', def convert_font_size(value): if 'pt' in value: - if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE: + if int(value.replace('pt', '')) == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE: return '' else: return value.replace('pt', 'px') @@ -57,22 +57,27 @@ def convert_font_size(value): return '' def convert_indents(value): - if '-' not in value[0]: # 30px = 3.2% = 1.25em = 23pt - positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)') - has_style_attrs = re.search(positive_text_indent_regexp, value) - if has_style_attrs: - if has_style_attrs.group(1): - value = value.replace(has_style_attrs.group(1), - str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) + - '%') - # elif has_style_attrs.group(2): - # value = value.replace(has_style_attrs.group(2), - # str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) + - # '%') - return value - else: - return '' + positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)') + has_style_attrs = re.search(positive_text_indent_regexp, value) + if has_style_attrs: + if has_style_attrs.group(1): + value = value.replace(has_style_attrs.group(1), + str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) + + 'px') + + elif has_style_attrs.group(2): + value = value.replace(has_style_attrs.group(2), + str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) + + 'px') + + elif has_style_attrs.group(4): + value = value.replace(has_style_attrs.group(4), '30px') + + elif has_style_attrs.group(5): + value = value.replace(has_style_attrs.group(5), + str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(5))))))) + 'px') + return value """ LIVECARTA_STYLE_ATTRS = { css property: value } @@ -83,11 +88,11 @@ If property has not empty list, it means that only certain property-value combin LIVECARTA_STYLE_ATTRS = { 'text-indent': [], 'font-variant': ['small-caps'], - 'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE], + 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], 'align': [], # ??? 'font': [], # ??? - 'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys() - if x != LawCartaConfig.DEFAULT_FONT_NAME], + 'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys() + if x != LiveCartaConfig.DEFAULT_FONT_NAME], 'font-size': [], 'font-weight': ['bold', '600', '700', '800', '900'], # 'font-style': ['italic'], # @@ -129,11 +134,11 @@ def get_text_color(x): LIVECARTA_STYLE_ATTRS_MAPPING = { - #'text-indent': convert_indents, + 'text-indent': convert_indents, 'font-variant': lambda x: x, 'text-align': lambda x: x, 'font': lambda x: '', - 'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x) or LawCartaConfig.font_correspondence_table.get(x.capitalize()), + 'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()), 'font-size': convert_font_size, 'color': get_text_color, 'background-color': get_bg_color, @@ -145,7 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { 'border-bottom-width': lambda x: x if x != '0' else '', 'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-image': lambda x: 'disc', - 'margin-left': lambda x: x + 'margin-left': convert_indents } """ @@ -245,31 +250,46 @@ class TagStyleConverter: @staticmethod def convert_indentions_to_px(style): margin_left_regexp = re.compile( - r'(margin-left:( *-*\w+%*);*)') + r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)') text_indent_regexp = re.compile( r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)') has_margin_left = re.search(margin_left_regexp, style) has_text_indent = re.search(text_indent_regexp, style) # consider that 5% = 30px - if has_margin_left and has_text_indent: - num_ml = abs(int("".join( - filter(str.isdigit, str(has_margin_left.group(2))))) * 6) - if has_text_indent.group(1): - num_ti = abs(int("".join( - filter(str.isdigit, str(has_text_indent.group(2))))) * 6) - style = style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(num_ml - num_ti)) + 'px; ') - style = style.replace(has_margin_left.group(1), '') - return style + if has_margin_left: + hml_group = 0 + num_ml = 0 + if has_margin_left.group(1): + hml_group = has_margin_left.group(1) + num_ml = abs(int("".join( + filter(str.isdigit, str(has_margin_left.group(2))))) * 6) - elif has_text_indent.group(3): - num_ti = abs(int("".join( - filter(str.isdigit, str(has_text_indent.group(4))))) * 6) - style = style.replace(has_text_indent.group(3), 'text-indent: ' + - str(abs(num_ml - num_ti)) + 'px; ') - style = style.replace(has_margin_left.group(1), '') - return style + elif has_margin_left.group(3): + hml_group = has_margin_left.group(3) + num_ml = abs(int("".join( + filter(str.isdigit, str(has_margin_left.group(4)))))) + + if has_text_indent: + if has_text_indent.group(1): + num_ti = abs(int("".join( + filter(str.isdigit, str(has_text_indent.group(2))))) * 6) + style = style.replace(has_text_indent.group(1), 'text-indent: ' + + str(abs(num_ml - num_ti)) + 'px; ') + style = style.replace(hml_group, '') + return style + + elif has_text_indent.group(3): + num_ti = abs(int("".join( + filter(str.isdigit, str(has_text_indent.group(4)))))) + style = style.replace(has_text_indent.group(3), 'text-indent: ' + + str(abs(num_ml - num_ti)) + 'px; ') + style = style.replace(hml_group, '') + return style + + style = style.replace(hml_group, 'text-indent: ' + + str(abs(num_ml)) + 'px; ') + return style elif has_text_indent: if has_text_indent.group(1): @@ -282,12 +302,6 @@ class TagStyleConverter: str("".join( filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ') return style - elif has_margin_left: - num_ml = abs(int("".join( - filter(str.isdigit, str(has_margin_left.group(2))))) * 6) - style = style.replace(has_margin_left.group(1), 'text-indent: ' + - str(abs(num_ml)) + 'px; ') - return style return style def preprocess_style(self): diff --git a/src/data_objects.py b/src/data_objects.py index ebb62d5..fd0f2e5 100644 --- a/src/data_objects.py +++ b/src/data_objects.py @@ -2,7 +2,7 @@ import re from typing import Union from ebooklib.epub import Section, Link -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig """ These are data structures which form mapping from NCX to python data structures. @@ -64,14 +64,14 @@ class ChapterItem: for i in self.sub_items: sub_dicts.append(i.to_dict(lvl + 1)) - if lvl > LawCartaConfig.SUPPORTED_LEVELS: + if lvl > LiveCartaConfig.SUPPORTED_LEVELS: return { "title": self.title, "contents": [self.content] + [x['contents'] for x in sub_dicts], "sub_items": [] } - if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts: + if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts: return { "title": self.title, "contents": [self.content] + flatten([x['contents'] for x in sub_dicts]), diff --git a/src/epub_converter.py b/src/epub_converter.py index ead91d2..b86b13a 100644 --- a/src/epub_converter.py +++ b/src/epub_converter.py @@ -18,7 +18,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chap update_src_links_in_images, preprocess_footnotes from css_reader import clean_css, add_inline_style_to_html_soup -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig from util.helpers import BookLogger @@ -107,6 +107,9 @@ class EpubConverter: return nodes def _read_css(self, css_href, html_path): + ''' + + ''' path_to_css_from_html = css_href html_folder = dirname(html_path) path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/') @@ -117,8 +120,8 @@ class EpubConverter: def build_css_content(self): css_href2content, html_href2css_href = {}, {} - # html_href2css_href 1-to-1, todo: 1-to-many - + html_href2css_href = defaultdict(list) + # html_href2css_href 1-to-many for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_text = item.content html_path = item.file_name @@ -127,13 +130,13 @@ class EpubConverter: if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): continue css_href = tag.attrs.get('href') - html_href2css_href[html_path] = css_href + html_href2css_href[html_path].append(css_href) if css_href not in css_href2content: css_href2content[css_href] = clean_css(self._read_css(css_href, html_path)) for i, tag in enumerate(soup.find_all('style')): css_content = tag.string - html_href2css_href[html_path] = f'href{i}' + html_href2css_href[html_path].append(f'href{i}') css_href2content[f'href{i}'] = clean_css(css_content) return css_href2content, html_href2css_href @@ -141,7 +144,9 @@ class EpubConverter: def add_css_styles2soup(self): for href in self.href2soup_html: if self.html_href2css_href.get(href): - css: str = self.css_href2content[self.html_href2css_href[href]] + css ='' + for key in self.html_href2css_href[href]: + css += self.css_href2content[key] content: BeautifulSoup = self.href2soup_html[href] content = add_inline_style_to_html_soup(content, css) self.href2soup_html[href] = content @@ -399,7 +404,7 @@ class EpubConverter: access=self.access, path2aws_path=self.old_image_path2_aws_path) - is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS + is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed, content_preprocessed = prepare_title_and_content(title, content, remove_title_from_chapter=is_chapter) @@ -442,7 +447,7 @@ if __name__ == "__main__": logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubConverter('../epub/9781634256063.epub', + json_converter = EpubConverter('../epub/index_with_html.epub', logger=logger_object) tmp = json_converter.convert_to_dict() diff --git a/src/html_docx_preprocessor.py b/src/html_docx_preprocessor.py index 989085c..c1acb5c 100644 --- a/src/html_docx_preprocessor.py +++ b/src/html_docx_preprocessor.py @@ -7,7 +7,7 @@ from typing import List from bs4 import BeautifulSoup, NavigableString, Tag -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig from util.helpers import BookLogger, BookStatusWrapper @@ -52,8 +52,8 @@ class HTMLDocxPreprocessor: @classmethod def convert_pt_to_px(cls, value): value = float(value) - if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE: - return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE + if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE: + return LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE else: return value @@ -73,7 +73,7 @@ class HTMLDocxPreprocessor: size = size.group(1) new_size = cls.convert_pt_to_px(size) - if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE: + if new_size == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE: return "" return re.sub(size + "pt", str(new_size) + "px", style) @@ -93,18 +93,18 @@ class HTMLDocxPreprocessor: if style: style = self.convert_font_pt_to_px(style) if style != "": - if color and color in LawCartaConfig.COLORS_MAP: + if color and color in LiveCartaConfig.COLORS_MAP: style += f'; color: {color};' font.attrs["style"] = style - elif color and color in LawCartaConfig.COLORS_MAP: + elif color and color in LiveCartaConfig.COLORS_MAP: font.attrs["style"] = f'color: {color};' if face is not None: face = re.sub(r",[\w,\- ]*$", "", face) - if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face): - font.attrs["face"] = LawCartaConfig.font_correspondence_table[face] + if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face): + font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face] else: - font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME + font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME if len(font.attrs) == 0: font.unwrap() @@ -182,12 +182,12 @@ class HTMLDocxPreprocessor: p.attrs = {} style = '' - if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE: + if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: style += f'text-align: {align};' if indent is not None or indent_should_be_added: # indent = indent.group(1) - style += f'text-indent: {LawCartaConfig.INDENT};' + style += f'text-indent: {LiveCartaConfig.INDENT};' if style: p.attrs['style'] = style @@ -488,7 +488,7 @@ class HTMLDocxPreprocessor: """ Function to convert all lower level headings to p tags """ - pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$' + pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = self.body_tag.find_all(re.compile(pattern)) for tag in header_tags: tag.name = 'p' @@ -592,8 +592,8 @@ class HTMLDocxPreprocessor: if title == "": tag.unwrap() else: - assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \ - f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' + assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ + f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' content = list(tag.children) diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 8689189..3065171 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -6,7 +6,7 @@ from typing import List, Tuple from bs4 import BeautifulSoup, NavigableString, Tag, Comment from access import Access -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig def save_image_locally(img_file_path, img_content, book_id): @@ -148,7 +148,7 @@ def _heading_tag2p_tag(body_tag): """ Function to convert all lower level headings to p tags """ - pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$' + pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = body_tag.find_all(re.compile(pattern)) for tag in header_tags: tag.name = 'p' diff --git a/src/libra_html2json_converter.py b/src/libra_html2json_converter.py index 5c47d3e..9a39b93 100644 --- a/src/libra_html2json_converter.py +++ b/src/libra_html2json_converter.py @@ -2,7 +2,7 @@ import logging import re from copy import copy -from livecarta_config import LawCartaConfig +from livecarta_config import LiveCartaConfig class LibraHTML2JSONConverter: @@ -32,7 +32,7 @@ class LibraHTML2JSONConverter: :param ind: Index of header in content list. """ - if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: title = str(self.content[ind]) title = title.replace(f'<{self.content[ind].name}>', '') title = title.replace(f'', '') @@ -49,7 +49,7 @@ class LibraHTML2JSONConverter: while ind < len(self.content): # 1. next tag is a header - if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: outline = int(re.sub(r"^h", "", self.content[ind].name)) # - recursion step until h_i > h_initial if outline > curr_outline: @@ -102,13 +102,13 @@ class LibraHTML2JSONConverter: while ind < len(self.content): res = {} - if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: res, ind = self.header_to_livecarta_chapter_item(ind) else: chapter_title = f'Untitled chapter {ch_num}' chapter = [] - while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS: + while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: if not self._is_empty_p_tag(self.content[ind]): chapter.append(self.format_html(str(self.content[ind]))) ind += 1 diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 3820ce4..65a5426 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -1,5 +1,5 @@ -class LawCartaConfig: +class LiveCartaConfig: SUPPORTED_LEVELS = 5 SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"} HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}