Add one-to-many HTML-to-CSS mapping + fix bug 4635

This commit is contained in:
Kiryl
2021-09-28 13:37:37 +03:00
parent 955a64380c
commit ebb5f0802e
8 changed files with 101 additions and 82 deletions

View File

@@ -13,7 +13,7 @@ import os
import pathlib
from abc import abstractmethod, ABCMeta
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
from util.helpers import BookLogger, BookStatusWrapper
@@ -32,7 +32,7 @@ class BookSolver:
main_logger=main_logger)
self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels."
def save_book_file(self, content):

View File

@@ -9,7 +9,7 @@ from premailer import transform
from itertools import takewhile
from logging import CRITICAL
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
from util.color_reader import str2hex
cssutils.log.setLevel(CRITICAL)
@@ -30,7 +30,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
def convert_font_size(value):
if 'pt' in value:
if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
if int(value.replace('pt', '')) == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
return ''
else:
return value.replace('pt', 'px')
@@ -57,22 +57,27 @@ def convert_font_size(value):
return ''
def convert_indents(value):
if '-' not in value[0]:
# 30px = 3.2% = 1.25em = 23pt
positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)')
has_style_attrs = re.search(positive_text_indent_regexp, value)
if has_style_attrs:
if has_style_attrs.group(1):
value = value.replace(has_style_attrs.group(1),
str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) +
'%')
# elif has_style_attrs.group(2):
# value = value.replace(has_style_attrs.group(2),
# str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) +
# '%')
return value
else:
return ''
positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
has_style_attrs = re.search(positive_text_indent_regexp, value)
if has_style_attrs:
if has_style_attrs.group(1):
value = value.replace(has_style_attrs.group(1),
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
'px')
elif has_style_attrs.group(2):
value = value.replace(has_style_attrs.group(2),
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
'px')
elif has_style_attrs.group(4):
value = value.replace(has_style_attrs.group(4), '30px')
elif has_style_attrs.group(5):
value = value.replace(has_style_attrs.group(5),
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(5))))))) + 'px')
return value
"""
LIVECARTA_STYLE_ATTRS = { css property: value }
@@ -83,11 +88,11 @@ If property has not empty list, it means that only certain property-value combin
LIVECARTA_STYLE_ATTRS = {
'text-indent': [],
'font-variant': ['small-caps'],
'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
'align': [], # ???
'font': [], # ???
'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
if x != LawCartaConfig.DEFAULT_FONT_NAME],
'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
'font-size': [],
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
'font-style': ['italic'], # <i>
@@ -129,11 +134,11 @@ def get_text_color(x):
LIVECARTA_STYLE_ATTRS_MAPPING = {
#'text-indent': convert_indents,
'text-indent': convert_indents,
'font-variant': lambda x: x,
'text-align': lambda x: x,
'font': lambda x: '',
'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x) or LawCartaConfig.font_correspondence_table.get(x.capitalize()),
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
'font-size': convert_font_size,
'color': get_text_color,
'background-color': get_bg_color,
@@ -145,7 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
'border-bottom-width': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in list_types else 'disc',
'list-style-image': lambda x: 'disc',
'margin-left': lambda x: x
'margin-left': convert_indents
}
"""
@@ -245,31 +250,46 @@ class TagStyleConverter:
@staticmethod
def convert_indentions_to_px(style):
margin_left_regexp = re.compile(
r'(margin-left:( *-*\w+%*);*)')
r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)')
text_indent_regexp = re.compile(
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
has_margin_left = re.search(margin_left_regexp, style)
has_text_indent = re.search(text_indent_regexp, style)
# assume 1% corresponds to 6px (so 5% = 30px)
if has_margin_left and has_text_indent:
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
if has_text_indent.group(1):
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(has_margin_left.group(1), '')
return style
if has_margin_left:
hml_group = 0
num_ml = 0
if has_margin_left.group(1):
hml_group = has_margin_left.group(1)
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
elif has_text_indent.group(3):
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(4))))) * 6)
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(has_margin_left.group(1), '')
return style
elif has_margin_left.group(3):
hml_group = has_margin_left.group(3)
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(4))))))
if has_text_indent:
if has_text_indent.group(1):
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(hml_group, '')
return style
elif has_text_indent.group(3):
num_ti = abs(int("".join(
filter(str.isdigit, str(has_text_indent.group(4))))))
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
str(abs(num_ml - num_ti)) + 'px; ')
style = style.replace(hml_group, '')
return style
style = style.replace(hml_group, 'text-indent: ' +
str(abs(num_ml)) + 'px; ')
return style
elif has_text_indent:
if has_text_indent.group(1):
@@ -282,12 +302,6 @@ class TagStyleConverter:
str("".join(
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
return style
elif has_margin_left:
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
style = style.replace(has_margin_left.group(1), 'text-indent: ' +
str(abs(num_ml)) + 'px; ')
return style
return style
def preprocess_style(self):

View File

@@ -2,7 +2,7 @@ import re
from typing import Union
from ebooklib.epub import Section, Link
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
"""
These are data structures which form mapping from NCX to python data structures.
@@ -64,14 +64,14 @@ class ChapterItem:
for i in self.sub_items:
sub_dicts.append(i.to_dict(lvl + 1))
if lvl > LawCartaConfig.SUPPORTED_LEVELS:
if lvl > LiveCartaConfig.SUPPORTED_LEVELS:
return {
"title": self.title,
"contents": [self.content] + [x['contents'] for x in sub_dicts],
"sub_items": []
}
if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
return {
"title": self.title,
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),

View File

@@ -18,7 +18,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chap
update_src_links_in_images, preprocess_footnotes
from css_reader import clean_css, add_inline_style_to_html_soup
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
from util.helpers import BookLogger
@@ -107,6 +107,9 @@ class EpubConverter:
return nodes
def _read_css(self, css_href, html_path):
'''
'''
path_to_css_from_html = css_href
html_folder = dirname(html_path)
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
@@ -117,8 +120,8 @@ class EpubConverter:
def build_css_content(self):
css_href2content, html_href2css_href = {}, {}
# html_href2css_href 1-to-1, todo: 1-to-many
html_href2css_href = defaultdict(list)
# html_href2css_href is 1-to-many: each HTML path maps to a list of CSS hrefs
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_text = item.content
html_path = item.file_name
@@ -127,13 +130,13 @@ class EpubConverter:
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
continue
css_href = tag.attrs.get('href')
html_href2css_href[html_path] = css_href
html_href2css_href[html_path].append(css_href)
if css_href not in css_href2content:
css_href2content[css_href] = clean_css(self._read_css(css_href, html_path))
for i, tag in enumerate(soup.find_all('style')):
css_content = tag.string
html_href2css_href[html_path] = f'href{i}'
html_href2css_href[html_path].append(f'href{i}')
css_href2content[f'href{i}'] = clean_css(css_content)
return css_href2content, html_href2css_href
@@ -141,7 +144,9 @@ class EpubConverter:
def add_css_styles2soup(self):
for href in self.href2soup_html:
if self.html_href2css_href.get(href):
css: str = self.css_href2content[self.html_href2css_href[href]]
css =''
for key in self.html_href2css_href[href]:
css += self.css_href2content[key]
content: BeautifulSoup = self.href2soup_html[href]
content = add_inline_style_to_html_soup(content, css)
self.href2soup_html[href] = content
@@ -399,7 +404,7 @@ class EpubConverter:
access=self.access,
path2aws_path=self.old_image_path2_aws_path)
is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
remove_title_from_chapter=is_chapter)
@@ -442,7 +447,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../epub/9781634256063.epub',
json_converter = EpubConverter('../epub/index_with_html.epub',
logger=logger_object)
tmp = json_converter.convert_to_dict()

View File

@@ -7,7 +7,7 @@ from typing import List
from bs4 import BeautifulSoup, NavigableString, Tag
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
from util.helpers import BookLogger, BookStatusWrapper
@@ -52,8 +52,8 @@ class HTMLDocxPreprocessor:
@classmethod
def convert_pt_to_px(cls, value):
value = float(value)
if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
return LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
else:
return value
@@ -73,7 +73,7 @@ class HTMLDocxPreprocessor:
size = size.group(1)
new_size = cls.convert_pt_to_px(size)
if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
if new_size == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
return ""
return re.sub(size + "pt", str(new_size) + "px", style)
@@ -93,18 +93,18 @@ class HTMLDocxPreprocessor:
if style:
style = self.convert_font_pt_to_px(style)
if style != "":
if color and color in LawCartaConfig.COLORS_MAP:
if color and color in LiveCartaConfig.COLORS_MAP:
style += f'; color: {color};'
font.attrs["style"] = style
elif color and color in LawCartaConfig.COLORS_MAP:
elif color and color in LiveCartaConfig.COLORS_MAP:
font.attrs["style"] = f'color: {color};'
if face is not None:
face = re.sub(r",[\w,\- ]*$", "", face)
if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face):
font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face]
else:
font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
if len(font.attrs) == 0:
font.unwrap()
@@ -182,12 +182,12 @@ class HTMLDocxPreprocessor:
p.attrs = {}
style = ''
if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};'
if indent is not None or indent_should_be_added:
# indent = indent.group(1)
style += f'text-indent: {LawCartaConfig.INDENT};'
style += f'text-indent: {LiveCartaConfig.INDENT};'
if style:
p.attrs['style'] = style
@@ -488,7 +488,7 @@ class HTMLDocxPreprocessor:
"""
Function to convert all lower level headings to p tags
"""
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'
@@ -592,8 +592,8 @@ class HTMLDocxPreprocessor:
if title == "":
tag.unwrap()
else:
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
content = list(tag.children)

View File

@@ -6,7 +6,7 @@ from typing import List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from access import Access
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
@@ -148,7 +148,7 @@ def _heading_tag2p_tag(body_tag):
"""
Function to convert all lower level headings to p tags
"""
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'

View File

@@ -2,7 +2,7 @@ import logging
import re
from copy import copy
from livecarta_config import LawCartaConfig
from livecarta_config import LiveCartaConfig
class LibraHTML2JSONConverter:
@@ -32,7 +32,7 @@ class LibraHTML2JSONConverter:
:param ind: Index of header in content list.
"""
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
title = str(self.content[ind])
title = title.replace(f'<{self.content[ind].name}>', '')
title = title.replace(f'</{self.content[ind].name}>', '')
@@ -49,7 +49,7 @@ class LibraHTML2JSONConverter:
while ind < len(self.content):
# 1. next tag is a header
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
outline = int(re.sub(r"^h", "", self.content[ind].name))
# - recursion step until h_i > h_initial
if outline > curr_outline:
@@ -102,13 +102,13 @@ class LibraHTML2JSONConverter:
while ind < len(self.content):
res = {}
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
res, ind = self.header_to_livecarta_chapter_item(ind)
else:
chapter_title = f'Untitled chapter {ch_num}'
chapter = []
while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]):
chapter.append(self.format_html(str(self.content[ind])))
ind += 1

View File

@@ -1,5 +1,5 @@
class LawCartaConfig:
class LiveCartaConfig:
SUPPORTED_LEVELS = 5
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}