forked from LiveCarta/BookConverter
Add 1-many css + Fix bug 4635
This commit is contained in:
@@ -13,7 +13,7 @@ import os
|
||||
import pathlib
|
||||
from abc import abstractmethod, ABCMeta
|
||||
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
from util.helpers import BookLogger, BookStatusWrapper
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ class BookSolver:
|
||||
main_logger=main_logger)
|
||||
self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
|
||||
|
||||
assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
|
||||
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
|
||||
"Length of headers doesn't match allowed levels."
|
||||
|
||||
def save_book_file(self, content):
|
||||
|
||||
@@ -9,7 +9,7 @@ from premailer import transform
|
||||
from itertools import takewhile
|
||||
from logging import CRITICAL
|
||||
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
from util.color_reader import str2hex
|
||||
|
||||
cssutils.log.setLevel(CRITICAL)
|
||||
@@ -30,7 +30,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
|
||||
|
||||
def convert_font_size(value):
|
||||
if 'pt' in value:
|
||||
if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||
if int(value.replace('pt', '')) == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||
return ''
|
||||
else:
|
||||
return value.replace('pt', 'px')
|
||||
@@ -57,22 +57,27 @@ def convert_font_size(value):
|
||||
return ''
|
||||
|
||||
def convert_indents(value):
|
||||
if '-' not in value[0]:
|
||||
# 30px = 3.2% = 1.25em = 23pt
|
||||
positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)')
|
||||
has_style_attrs = re.search(positive_text_indent_regexp, value)
|
||||
if has_style_attrs:
|
||||
if has_style_attrs.group(1):
|
||||
value = value.replace(has_style_attrs.group(1),
|
||||
str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) +
|
||||
'%')
|
||||
# elif has_style_attrs.group(2):
|
||||
# value = value.replace(has_style_attrs.group(2),
|
||||
# str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) +
|
||||
# '%')
|
||||
return value
|
||||
else:
|
||||
return ''
|
||||
positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
|
||||
has_style_attrs = re.search(positive_text_indent_regexp, value)
|
||||
if has_style_attrs:
|
||||
if has_style_attrs.group(1):
|
||||
value = value.replace(has_style_attrs.group(1),
|
||||
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
|
||||
'px')
|
||||
|
||||
elif has_style_attrs.group(2):
|
||||
value = value.replace(has_style_attrs.group(2),
|
||||
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
|
||||
'px')
|
||||
|
||||
elif has_style_attrs.group(4):
|
||||
value = value.replace(has_style_attrs.group(4), '30px')
|
||||
|
||||
elif has_style_attrs.group(5):
|
||||
value = value.replace(has_style_attrs.group(5),
|
||||
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(5))))))) + 'px')
|
||||
return value
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||
|
||||
@@ -83,11 +88,11 @@ If property has not empty list, it means that only certain property-value combin
|
||||
LIVECARTA_STYLE_ATTRS = {
|
||||
'text-indent': [],
|
||||
'font-variant': ['small-caps'],
|
||||
'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
|
||||
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
|
||||
'align': [], # ???
|
||||
'font': [], # ???
|
||||
'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
|
||||
if x != LawCartaConfig.DEFAULT_FONT_NAME],
|
||||
'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
|
||||
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
|
||||
'font-size': [],
|
||||
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
|
||||
'font-style': ['italic'], # <i>
|
||||
@@ -129,11 +134,11 @@ def get_text_color(x):
|
||||
|
||||
|
||||
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
#'text-indent': convert_indents,
|
||||
'text-indent': convert_indents,
|
||||
'font-variant': lambda x: x,
|
||||
'text-align': lambda x: x,
|
||||
'font': lambda x: '',
|
||||
'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x) or LawCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||
'font-size': convert_font_size,
|
||||
'color': get_text_color,
|
||||
'background-color': get_bg_color,
|
||||
@@ -145,7 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
'border-bottom-width': lambda x: x if x != '0' else '',
|
||||
'list-style-type': lambda x: x if x in list_types else 'disc',
|
||||
'list-style-image': lambda x: 'disc',
|
||||
'margin-left': lambda x: x
|
||||
'margin-left': convert_indents
|
||||
}
|
||||
|
||||
"""
|
||||
@@ -245,31 +250,46 @@ class TagStyleConverter:
|
||||
@staticmethod
|
||||
def convert_indentions_to_px(style):
|
||||
margin_left_regexp = re.compile(
|
||||
r'(margin-left:( *-*\w+%*);*)')
|
||||
r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)')
|
||||
text_indent_regexp = re.compile(
|
||||
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
|
||||
|
||||
has_margin_left = re.search(margin_left_regexp, style)
|
||||
has_text_indent = re.search(text_indent_regexp, style)
|
||||
# consider that 5% = 30px
|
||||
if has_margin_left and has_text_indent:
|
||||
num_ml = abs(int("".join(
|
||||
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
||||
if has_text_indent.group(1):
|
||||
num_ti = abs(int("".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
|
||||
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||
str(abs(num_ml - num_ti)) + 'px; ')
|
||||
style = style.replace(has_margin_left.group(1), '')
|
||||
return style
|
||||
if has_margin_left:
|
||||
hml_group = 0
|
||||
num_ml = 0
|
||||
if has_margin_left.group(1):
|
||||
hml_group = has_margin_left.group(1)
|
||||
num_ml = abs(int("".join(
|
||||
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
||||
|
||||
elif has_text_indent.group(3):
|
||||
num_ti = abs(int("".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(4))))) * 6)
|
||||
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
|
||||
str(abs(num_ml - num_ti)) + 'px; ')
|
||||
style = style.replace(has_margin_left.group(1), '')
|
||||
return style
|
||||
elif has_margin_left.group(3):
|
||||
hml_group = has_margin_left.group(3)
|
||||
num_ml = abs(int("".join(
|
||||
filter(str.isdigit, str(has_margin_left.group(4))))))
|
||||
|
||||
if has_text_indent:
|
||||
if has_text_indent.group(1):
|
||||
num_ti = abs(int("".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
|
||||
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||
str(abs(num_ml - num_ti)) + 'px; ')
|
||||
style = style.replace(hml_group, '')
|
||||
return style
|
||||
|
||||
elif has_text_indent.group(3):
|
||||
num_ti = abs(int("".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(4))))))
|
||||
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
|
||||
str(abs(num_ml - num_ti)) + 'px; ')
|
||||
style = style.replace(hml_group, '')
|
||||
return style
|
||||
|
||||
style = style.replace(hml_group, 'text-indent: ' +
|
||||
str(abs(num_ml)) + 'px; ')
|
||||
return style
|
||||
|
||||
elif has_text_indent:
|
||||
if has_text_indent.group(1):
|
||||
@@ -282,12 +302,6 @@ class TagStyleConverter:
|
||||
str("".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
|
||||
return style
|
||||
elif has_margin_left:
|
||||
num_ml = abs(int("".join(
|
||||
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
||||
style = style.replace(has_margin_left.group(1), 'text-indent: ' +
|
||||
str(abs(num_ml)) + 'px; ')
|
||||
return style
|
||||
return style
|
||||
|
||||
def preprocess_style(self):
|
||||
|
||||
@@ -2,7 +2,7 @@ import re
|
||||
from typing import Union
|
||||
|
||||
from ebooklib.epub import Section, Link
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
|
||||
"""
|
||||
These are data structures which form mapping from NCX to python data structures.
|
||||
@@ -64,14 +64,14 @@ class ChapterItem:
|
||||
for i in self.sub_items:
|
||||
sub_dicts.append(i.to_dict(lvl + 1))
|
||||
|
||||
if lvl > LawCartaConfig.SUPPORTED_LEVELS:
|
||||
if lvl > LiveCartaConfig.SUPPORTED_LEVELS:
|
||||
return {
|
||||
"title": self.title,
|
||||
"contents": [self.content] + [x['contents'] for x in sub_dicts],
|
||||
"sub_items": []
|
||||
}
|
||||
|
||||
if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
|
||||
if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
|
||||
return {
|
||||
"title": self.title,
|
||||
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
|
||||
|
||||
@@ -18,7 +18,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chap
|
||||
update_src_links_in_images, preprocess_footnotes
|
||||
|
||||
from css_reader import clean_css, add_inline_style_to_html_soup
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
from util.helpers import BookLogger
|
||||
|
||||
|
||||
@@ -107,6 +107,9 @@ class EpubConverter:
|
||||
return nodes
|
||||
|
||||
def _read_css(self, css_href, html_path):
|
||||
'''
|
||||
|
||||
'''
|
||||
path_to_css_from_html = css_href
|
||||
html_folder = dirname(html_path)
|
||||
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
||||
@@ -117,8 +120,8 @@ class EpubConverter:
|
||||
|
||||
def build_css_content(self):
|
||||
css_href2content, html_href2css_href = {}, {}
|
||||
# html_href2css_href 1-to-1, todo: 1-to-many
|
||||
|
||||
html_href2css_href = defaultdict(list)
|
||||
# html_href2css_href 1-to-many
|
||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
html_text = item.content
|
||||
html_path = item.file_name
|
||||
@@ -127,13 +130,13 @@ class EpubConverter:
|
||||
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
||||
continue
|
||||
css_href = tag.attrs.get('href')
|
||||
html_href2css_href[html_path] = css_href
|
||||
html_href2css_href[html_path].append(css_href)
|
||||
if css_href not in css_href2content:
|
||||
css_href2content[css_href] = clean_css(self._read_css(css_href, html_path))
|
||||
|
||||
for i, tag in enumerate(soup.find_all('style')):
|
||||
css_content = tag.string
|
||||
html_href2css_href[html_path] = f'href{i}'
|
||||
html_href2css_href[html_path].append(f'href{i}')
|
||||
css_href2content[f'href{i}'] = clean_css(css_content)
|
||||
|
||||
return css_href2content, html_href2css_href
|
||||
@@ -141,7 +144,9 @@ class EpubConverter:
|
||||
def add_css_styles2soup(self):
|
||||
for href in self.href2soup_html:
|
||||
if self.html_href2css_href.get(href):
|
||||
css: str = self.css_href2content[self.html_href2css_href[href]]
|
||||
css =''
|
||||
for key in self.html_href2css_href[href]:
|
||||
css += self.css_href2content[key]
|
||||
content: BeautifulSoup = self.href2soup_html[href]
|
||||
content = add_inline_style_to_html_soup(content, css)
|
||||
self.href2soup_html[href] = content
|
||||
@@ -399,7 +404,7 @@ class EpubConverter:
|
||||
access=self.access,
|
||||
path2aws_path=self.old_image_path2_aws_path)
|
||||
|
||||
is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
|
||||
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
|
||||
remove_title_from_chapter=is_chapter)
|
||||
|
||||
@@ -442,7 +447,7 @@ if __name__ == "__main__":
|
||||
|
||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||
|
||||
json_converter = EpubConverter('../epub/9781634256063.epub',
|
||||
json_converter = EpubConverter('../epub/index_with_html.epub',
|
||||
logger=logger_object)
|
||||
tmp = json_converter.convert_to_dict()
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import List
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
from util.helpers import BookLogger, BookStatusWrapper
|
||||
|
||||
|
||||
@@ -52,8 +52,8 @@ class HTMLDocxPreprocessor:
|
||||
@classmethod
|
||||
def convert_pt_to_px(cls, value):
|
||||
value = float(value)
|
||||
if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
|
||||
return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
|
||||
if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
|
||||
return LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
|
||||
else:
|
||||
return value
|
||||
|
||||
@@ -73,7 +73,7 @@ class HTMLDocxPreprocessor:
|
||||
size = size.group(1)
|
||||
new_size = cls.convert_pt_to_px(size)
|
||||
|
||||
if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||
if new_size == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||
return ""
|
||||
|
||||
return re.sub(size + "pt", str(new_size) + "px", style)
|
||||
@@ -93,18 +93,18 @@ class HTMLDocxPreprocessor:
|
||||
if style:
|
||||
style = self.convert_font_pt_to_px(style)
|
||||
if style != "":
|
||||
if color and color in LawCartaConfig.COLORS_MAP:
|
||||
if color and color in LiveCartaConfig.COLORS_MAP:
|
||||
style += f'; color: {color};'
|
||||
font.attrs["style"] = style
|
||||
elif color and color in LawCartaConfig.COLORS_MAP:
|
||||
elif color and color in LiveCartaConfig.COLORS_MAP:
|
||||
font.attrs["style"] = f'color: {color};'
|
||||
|
||||
if face is not None:
|
||||
face = re.sub(r",[\w,\- ]*$", "", face)
|
||||
if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
|
||||
font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
|
||||
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face):
|
||||
font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face]
|
||||
else:
|
||||
font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
|
||||
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
|
||||
|
||||
if len(font.attrs) == 0:
|
||||
font.unwrap()
|
||||
@@ -182,12 +182,12 @@ class HTMLDocxPreprocessor:
|
||||
p.attrs = {}
|
||||
style = ''
|
||||
|
||||
if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
|
||||
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
|
||||
style += f'text-align: {align};'
|
||||
|
||||
if indent is not None or indent_should_be_added:
|
||||
# indent = indent.group(1)
|
||||
style += f'text-indent: {LawCartaConfig.INDENT};'
|
||||
style += f'text-indent: {LiveCartaConfig.INDENT};'
|
||||
|
||||
if style:
|
||||
p.attrs['style'] = style
|
||||
@@ -488,7 +488,7 @@ class HTMLDocxPreprocessor:
|
||||
"""
|
||||
Function to convert all lower level headings to p tags
|
||||
"""
|
||||
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||
for tag in header_tags:
|
||||
tag.name = 'p'
|
||||
@@ -592,8 +592,8 @@ class HTMLDocxPreprocessor:
|
||||
if title == "":
|
||||
tag.unwrap()
|
||||
else:
|
||||
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
||||
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
|
||||
f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||
|
||||
content = list(tag.children)
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import List, Tuple
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
|
||||
from access import Access
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
def save_image_locally(img_file_path, img_content, book_id):
|
||||
@@ -148,7 +148,7 @@ def _heading_tag2p_tag(body_tag):
|
||||
"""
|
||||
Function to convert all lower level headings to p tags
|
||||
"""
|
||||
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
header_tags = body_tag.find_all(re.compile(pattern))
|
||||
for tag in header_tags:
|
||||
tag.name = 'p'
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
import re
|
||||
from copy import copy
|
||||
|
||||
from livecarta_config import LawCartaConfig
|
||||
from livecarta_config import LiveCartaConfig
|
||||
|
||||
|
||||
class LibraHTML2JSONConverter:
|
||||
@@ -32,7 +32,7 @@ class LibraHTML2JSONConverter:
|
||||
|
||||
:param ind: Index of header in content list.
|
||||
"""
|
||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
title = str(self.content[ind])
|
||||
title = title.replace(f'<{self.content[ind].name}>', '')
|
||||
title = title.replace(f'</{self.content[ind].name}>', '')
|
||||
@@ -49,7 +49,7 @@ class LibraHTML2JSONConverter:
|
||||
|
||||
while ind < len(self.content):
|
||||
# 1. next tag is a header
|
||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||
# - recursion step until h_i > h_initial
|
||||
if outline > curr_outline:
|
||||
@@ -102,13 +102,13 @@ class LibraHTML2JSONConverter:
|
||||
while ind < len(self.content):
|
||||
res = {}
|
||||
|
||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
res, ind = self.header_to_livecarta_chapter_item(ind)
|
||||
|
||||
else:
|
||||
chapter_title = f'Untitled chapter {ch_num}'
|
||||
chapter = []
|
||||
while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
|
||||
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||
if not self._is_empty_p_tag(self.content[ind]):
|
||||
chapter.append(self.format_html(str(self.content[ind])))
|
||||
ind += 1
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
class LawCartaConfig:
|
||||
class LiveCartaConfig:
|
||||
SUPPORTED_LEVELS = 5
|
||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
|
||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||
|
||||
Reference in New Issue
Block a user