forked from LiveCarta/BookConverter
Add 1-many css + Fix bug 4635
This commit is contained in:
@@ -13,7 +13,7 @@ import os
|
|||||||
import pathlib
|
import pathlib
|
||||||
from abc import abstractmethod, ABCMeta
|
from abc import abstractmethod, ABCMeta
|
||||||
|
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
from util.helpers import BookLogger, BookStatusWrapper
|
from util.helpers import BookLogger, BookStatusWrapper
|
||||||
|
|
||||||
|
|
||||||
@@ -32,7 +32,7 @@ class BookSolver:
|
|||||||
main_logger=main_logger)
|
main_logger=main_logger)
|
||||||
self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
|
self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)
|
||||||
|
|
||||||
assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
|
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
|
||||||
"Length of headers doesn't match allowed levels."
|
"Length of headers doesn't match allowed levels."
|
||||||
|
|
||||||
def save_book_file(self, content):
|
def save_book_file(self, content):
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from premailer import transform
|
|||||||
from itertools import takewhile
|
from itertools import takewhile
|
||||||
from logging import CRITICAL
|
from logging import CRITICAL
|
||||||
|
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
from util.color_reader import str2hex
|
from util.color_reader import str2hex
|
||||||
|
|
||||||
cssutils.log.setLevel(CRITICAL)
|
cssutils.log.setLevel(CRITICAL)
|
||||||
@@ -30,7 +30,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
|
|||||||
|
|
||||||
def convert_font_size(value):
|
def convert_font_size(value):
|
||||||
if 'pt' in value:
|
if 'pt' in value:
|
||||||
if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
if int(value.replace('pt', '')) == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||||
return ''
|
return ''
|
||||||
else:
|
else:
|
||||||
return value.replace('pt', 'px')
|
return value.replace('pt', 'px')
|
||||||
@@ -57,22 +57,27 @@ def convert_font_size(value):
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
def convert_indents(value):
|
def convert_indents(value):
|
||||||
if '-' not in value[0]:
|
|
||||||
# 30px = 3.2% = 1.25em = 23pt
|
# 30px = 3.2% = 1.25em = 23pt
|
||||||
positive_text_indent_regexp = re.compile(r'(\w+%)|(\w*.*\w+em)')
|
positive_text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(\w+px)|(-*\w+pt)')
|
||||||
has_style_attrs = re.search(positive_text_indent_regexp, value)
|
has_style_attrs = re.search(positive_text_indent_regexp, value)
|
||||||
if has_style_attrs:
|
if has_style_attrs:
|
||||||
if has_style_attrs.group(1):
|
if has_style_attrs.group(1):
|
||||||
value = value.replace(has_style_attrs.group(1),
|
value = value.replace(has_style_attrs.group(1),
|
||||||
str(int("".join(filter(str.isdigit, str(has_style_attrs.group(1)))))) +
|
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(1))))) * 6)) +
|
||||||
'%')
|
'px')
|
||||||
# elif has_style_attrs.group(2):
|
|
||||||
# value = value.replace(has_style_attrs.group(2),
|
elif has_style_attrs.group(2):
|
||||||
# str(int("".join(filter(str.isdigit, str(has_style_attrs.group(2))))) * 5) +
|
value = value.replace(has_style_attrs.group(2),
|
||||||
# '%')
|
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(3))))) * 30)) +
|
||||||
return value
|
'px')
|
||||||
else:
|
|
||||||
return ''
|
elif has_style_attrs.group(4):
|
||||||
|
value = value.replace(has_style_attrs.group(4), '30px')
|
||||||
|
|
||||||
|
elif has_style_attrs.group(5):
|
||||||
|
value = value.replace(has_style_attrs.group(5),
|
||||||
|
str(abs(int("".join(filter(str.isdigit, str(has_style_attrs.group(5))))))) + 'px')
|
||||||
|
return value
|
||||||
"""
|
"""
|
||||||
LIVECARTA_STYLE_ATTRS = { css property: value }
|
LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||||
|
|
||||||
@@ -83,11 +88,11 @@ If property has not empty list, it means that only certain property-value combin
|
|||||||
LIVECARTA_STYLE_ATTRS = {
|
LIVECARTA_STYLE_ATTRS = {
|
||||||
'text-indent': [],
|
'text-indent': [],
|
||||||
'font-variant': ['small-caps'],
|
'font-variant': ['small-caps'],
|
||||||
'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
|
'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE],
|
||||||
'align': [], # ???
|
'align': [], # ???
|
||||||
'font': [], # ???
|
'font': [], # ???
|
||||||
'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
|
'font-family': [x for x in LiveCartaConfig.font_correspondence_table.keys()
|
||||||
if x != LawCartaConfig.DEFAULT_FONT_NAME],
|
if x != LiveCartaConfig.DEFAULT_FONT_NAME],
|
||||||
'font-size': [],
|
'font-size': [],
|
||||||
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
|
'font-weight': ['bold', '600', '700', '800', '900'], # <strong>
|
||||||
'font-style': ['italic'], # <i>
|
'font-style': ['italic'], # <i>
|
||||||
@@ -129,11 +134,11 @@ def get_text_color(x):
|
|||||||
|
|
||||||
|
|
||||||
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||||
#'text-indent': convert_indents,
|
'text-indent': convert_indents,
|
||||||
'font-variant': lambda x: x,
|
'font-variant': lambda x: x,
|
||||||
'text-align': lambda x: x,
|
'text-align': lambda x: x,
|
||||||
'font': lambda x: '',
|
'font': lambda x: '',
|
||||||
'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x) or LawCartaConfig.font_correspondence_table.get(x.capitalize()),
|
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||||
'font-size': convert_font_size,
|
'font-size': convert_font_size,
|
||||||
'color': get_text_color,
|
'color': get_text_color,
|
||||||
'background-color': get_bg_color,
|
'background-color': get_bg_color,
|
||||||
@@ -145,7 +150,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
|||||||
'border-bottom-width': lambda x: x if x != '0' else '',
|
'border-bottom-width': lambda x: x if x != '0' else '',
|
||||||
'list-style-type': lambda x: x if x in list_types else 'disc',
|
'list-style-type': lambda x: x if x in list_types else 'disc',
|
||||||
'list-style-image': lambda x: 'disc',
|
'list-style-image': lambda x: 'disc',
|
||||||
'margin-left': lambda x: x
|
'margin-left': convert_indents
|
||||||
}
|
}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -245,31 +250,46 @@ class TagStyleConverter:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def convert_indentions_to_px(style):
|
def convert_indentions_to_px(style):
|
||||||
margin_left_regexp = re.compile(
|
margin_left_regexp = re.compile(
|
||||||
r'(margin-left:( *-*\w+%*);*)')
|
r'(margin-left:( *-*\w+%);*)|(margin-left:( *-*\w+);*)')
|
||||||
text_indent_regexp = re.compile(
|
text_indent_regexp = re.compile(
|
||||||
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
|
r'(text-indent:( *-*\w+%);*)|(text-indent:( *-*\w+);*)')
|
||||||
|
|
||||||
has_margin_left = re.search(margin_left_regexp, style)
|
has_margin_left = re.search(margin_left_regexp, style)
|
||||||
has_text_indent = re.search(text_indent_regexp, style)
|
has_text_indent = re.search(text_indent_regexp, style)
|
||||||
# consider that 5% = 30px
|
# consider that 5% = 30px
|
||||||
if has_margin_left and has_text_indent:
|
if has_margin_left:
|
||||||
num_ml = abs(int("".join(
|
hml_group = 0
|
||||||
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
num_ml = 0
|
||||||
if has_text_indent.group(1):
|
if has_margin_left.group(1):
|
||||||
num_ti = abs(int("".join(
|
hml_group = has_margin_left.group(1)
|
||||||
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
|
num_ml = abs(int("".join(
|
||||||
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
|
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
||||||
str(abs(num_ml - num_ti)) + 'px; ')
|
|
||||||
style = style.replace(has_margin_left.group(1), '')
|
|
||||||
return style
|
|
||||||
|
|
||||||
elif has_text_indent.group(3):
|
elif has_margin_left.group(3):
|
||||||
num_ti = abs(int("".join(
|
hml_group = has_margin_left.group(3)
|
||||||
filter(str.isdigit, str(has_text_indent.group(4))))) * 6)
|
num_ml = abs(int("".join(
|
||||||
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
|
filter(str.isdigit, str(has_margin_left.group(4))))))
|
||||||
str(abs(num_ml - num_ti)) + 'px; ')
|
|
||||||
style = style.replace(has_margin_left.group(1), '')
|
if has_text_indent:
|
||||||
return style
|
if has_text_indent.group(1):
|
||||||
|
num_ti = abs(int("".join(
|
||||||
|
filter(str.isdigit, str(has_text_indent.group(2))))) * 6)
|
||||||
|
style = style.replace(has_text_indent.group(1), 'text-indent: ' +
|
||||||
|
str(abs(num_ml - num_ti)) + 'px; ')
|
||||||
|
style = style.replace(hml_group, '')
|
||||||
|
return style
|
||||||
|
|
||||||
|
elif has_text_indent.group(3):
|
||||||
|
num_ti = abs(int("".join(
|
||||||
|
filter(str.isdigit, str(has_text_indent.group(4))))))
|
||||||
|
style = style.replace(has_text_indent.group(3), 'text-indent: ' +
|
||||||
|
str(abs(num_ml - num_ti)) + 'px; ')
|
||||||
|
style = style.replace(hml_group, '')
|
||||||
|
return style
|
||||||
|
|
||||||
|
style = style.replace(hml_group, 'text-indent: ' +
|
||||||
|
str(abs(num_ml)) + 'px; ')
|
||||||
|
return style
|
||||||
|
|
||||||
elif has_text_indent:
|
elif has_text_indent:
|
||||||
if has_text_indent.group(1):
|
if has_text_indent.group(1):
|
||||||
@@ -282,12 +302,6 @@ class TagStyleConverter:
|
|||||||
str("".join(
|
str("".join(
|
||||||
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
|
filter(str.isdigit, str(has_text_indent.group(4))))) + 'px; ')
|
||||||
return style
|
return style
|
||||||
elif has_margin_left:
|
|
||||||
num_ml = abs(int("".join(
|
|
||||||
filter(str.isdigit, str(has_margin_left.group(2))))) * 6)
|
|
||||||
style = style.replace(has_margin_left.group(1), 'text-indent: ' +
|
|
||||||
str(abs(num_ml)) + 'px; ')
|
|
||||||
return style
|
|
||||||
return style
|
return style
|
||||||
|
|
||||||
def preprocess_style(self):
|
def preprocess_style(self):
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import re
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ebooklib.epub import Section, Link
|
from ebooklib.epub import Section, Link
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
"""
|
"""
|
||||||
These are data structures which form mapping from NCX to python data structures.
|
These are data structures which form mapping from NCX to python data structures.
|
||||||
@@ -64,14 +64,14 @@ class ChapterItem:
|
|||||||
for i in self.sub_items:
|
for i in self.sub_items:
|
||||||
sub_dicts.append(i.to_dict(lvl + 1))
|
sub_dicts.append(i.to_dict(lvl + 1))
|
||||||
|
|
||||||
if lvl > LawCartaConfig.SUPPORTED_LEVELS:
|
if lvl > LiveCartaConfig.SUPPORTED_LEVELS:
|
||||||
return {
|
return {
|
||||||
"title": self.title,
|
"title": self.title,
|
||||||
"contents": [self.content] + [x['contents'] for x in sub_dicts],
|
"contents": [self.content] + [x['contents'] for x in sub_dicts],
|
||||||
"sub_items": []
|
"sub_items": []
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lvl == LawCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
|
if (lvl == LiveCartaConfig.SUPPORTED_LEVELS) and sub_dicts:
|
||||||
return {
|
return {
|
||||||
"title": self.title,
|
"title": self.title,
|
||||||
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
|
"contents": [self.content] + flatten([x['contents'] for x in sub_dicts]),
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chap
|
|||||||
update_src_links_in_images, preprocess_footnotes
|
update_src_links_in_images, preprocess_footnotes
|
||||||
|
|
||||||
from css_reader import clean_css, add_inline_style_to_html_soup
|
from css_reader import clean_css, add_inline_style_to_html_soup
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
from util.helpers import BookLogger
|
from util.helpers import BookLogger
|
||||||
|
|
||||||
|
|
||||||
@@ -107,6 +107,9 @@ class EpubConverter:
|
|||||||
return nodes
|
return nodes
|
||||||
|
|
||||||
def _read_css(self, css_href, html_path):
|
def _read_css(self, css_href, html_path):
|
||||||
|
'''
|
||||||
|
|
||||||
|
'''
|
||||||
path_to_css_from_html = css_href
|
path_to_css_from_html = css_href
|
||||||
html_folder = dirname(html_path)
|
html_folder = dirname(html_path)
|
||||||
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
|
||||||
@@ -117,8 +120,8 @@ class EpubConverter:
|
|||||||
|
|
||||||
def build_css_content(self):
|
def build_css_content(self):
|
||||||
css_href2content, html_href2css_href = {}, {}
|
css_href2content, html_href2css_href = {}, {}
|
||||||
# html_href2css_href 1-to-1, todo: 1-to-many
|
html_href2css_href = defaultdict(list)
|
||||||
|
# html_href2css_href 1-to-many
|
||||||
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||||
html_text = item.content
|
html_text = item.content
|
||||||
html_path = item.file_name
|
html_path = item.file_name
|
||||||
@@ -127,13 +130,13 @@ class EpubConverter:
|
|||||||
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
|
||||||
continue
|
continue
|
||||||
css_href = tag.attrs.get('href')
|
css_href = tag.attrs.get('href')
|
||||||
html_href2css_href[html_path] = css_href
|
html_href2css_href[html_path].append(css_href)
|
||||||
if css_href not in css_href2content:
|
if css_href not in css_href2content:
|
||||||
css_href2content[css_href] = clean_css(self._read_css(css_href, html_path))
|
css_href2content[css_href] = clean_css(self._read_css(css_href, html_path))
|
||||||
|
|
||||||
for i, tag in enumerate(soup.find_all('style')):
|
for i, tag in enumerate(soup.find_all('style')):
|
||||||
css_content = tag.string
|
css_content = tag.string
|
||||||
html_href2css_href[html_path] = f'href{i}'
|
html_href2css_href[html_path].append(f'href{i}')
|
||||||
css_href2content[f'href{i}'] = clean_css(css_content)
|
css_href2content[f'href{i}'] = clean_css(css_content)
|
||||||
|
|
||||||
return css_href2content, html_href2css_href
|
return css_href2content, html_href2css_href
|
||||||
@@ -141,7 +144,9 @@ class EpubConverter:
|
|||||||
def add_css_styles2soup(self):
|
def add_css_styles2soup(self):
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
if self.html_href2css_href.get(href):
|
if self.html_href2css_href.get(href):
|
||||||
css: str = self.css_href2content[self.html_href2css_href[href]]
|
css =''
|
||||||
|
for key in self.html_href2css_href[href]:
|
||||||
|
css += self.css_href2content[key]
|
||||||
content: BeautifulSoup = self.href2soup_html[href]
|
content: BeautifulSoup = self.href2soup_html[href]
|
||||||
content = add_inline_style_to_html_soup(content, css)
|
content = add_inline_style_to_html_soup(content, css)
|
||||||
self.href2soup_html[href] = content
|
self.href2soup_html[href] = content
|
||||||
@@ -399,7 +404,7 @@ class EpubConverter:
|
|||||||
access=self.access,
|
access=self.access,
|
||||||
path2aws_path=self.old_image_path2_aws_path)
|
path2aws_path=self.old_image_path2_aws_path)
|
||||||
|
|
||||||
is_chapter = lvl <= LawCartaConfig.SUPPORTED_LEVELS
|
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
|
||||||
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
|
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
|
||||||
remove_title_from_chapter=is_chapter)
|
remove_title_from_chapter=is_chapter)
|
||||||
|
|
||||||
@@ -442,7 +447,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
|
||||||
|
|
||||||
json_converter = EpubConverter('../epub/9781634256063.epub',
|
json_converter = EpubConverter('../epub/index_with_html.epub',
|
||||||
logger=logger_object)
|
logger=logger_object)
|
||||||
tmp = json_converter.convert_to_dict()
|
tmp = json_converter.convert_to_dict()
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from typing import List
|
|||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
from util.helpers import BookLogger, BookStatusWrapper
|
from util.helpers import BookLogger, BookStatusWrapper
|
||||||
|
|
||||||
|
|
||||||
@@ -52,8 +52,8 @@ class HTMLDocxPreprocessor:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def convert_pt_to_px(cls, value):
|
def convert_pt_to_px(cls, value):
|
||||||
value = float(value)
|
value = float(value)
|
||||||
if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
|
if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
|
||||||
return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
|
return LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
|
||||||
else:
|
else:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
@@ -73,7 +73,7 @@ class HTMLDocxPreprocessor:
|
|||||||
size = size.group(1)
|
size = size.group(1)
|
||||||
new_size = cls.convert_pt_to_px(size)
|
new_size = cls.convert_pt_to_px(size)
|
||||||
|
|
||||||
if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
if new_size == LiveCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
return re.sub(size + "pt", str(new_size) + "px", style)
|
return re.sub(size + "pt", str(new_size) + "px", style)
|
||||||
@@ -93,18 +93,18 @@ class HTMLDocxPreprocessor:
|
|||||||
if style:
|
if style:
|
||||||
style = self.convert_font_pt_to_px(style)
|
style = self.convert_font_pt_to_px(style)
|
||||||
if style != "":
|
if style != "":
|
||||||
if color and color in LawCartaConfig.COLORS_MAP:
|
if color and color in LiveCartaConfig.COLORS_MAP:
|
||||||
style += f'; color: {color};'
|
style += f'; color: {color};'
|
||||||
font.attrs["style"] = style
|
font.attrs["style"] = style
|
||||||
elif color and color in LawCartaConfig.COLORS_MAP:
|
elif color and color in LiveCartaConfig.COLORS_MAP:
|
||||||
font.attrs["style"] = f'color: {color};'
|
font.attrs["style"] = f'color: {color};'
|
||||||
|
|
||||||
if face is not None:
|
if face is not None:
|
||||||
face = re.sub(r",[\w,\- ]*$", "", face)
|
face = re.sub(r",[\w,\- ]*$", "", face)
|
||||||
if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
|
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face):
|
||||||
font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
|
font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face]
|
||||||
else:
|
else:
|
||||||
font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
|
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
|
||||||
|
|
||||||
if len(font.attrs) == 0:
|
if len(font.attrs) == 0:
|
||||||
font.unwrap()
|
font.unwrap()
|
||||||
@@ -182,12 +182,12 @@ class HTMLDocxPreprocessor:
|
|||||||
p.attrs = {}
|
p.attrs = {}
|
||||||
style = ''
|
style = ''
|
||||||
|
|
||||||
if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
|
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
|
||||||
style += f'text-align: {align};'
|
style += f'text-align: {align};'
|
||||||
|
|
||||||
if indent is not None or indent_should_be_added:
|
if indent is not None or indent_should_be_added:
|
||||||
# indent = indent.group(1)
|
# indent = indent.group(1)
|
||||||
style += f'text-indent: {LawCartaConfig.INDENT};'
|
style += f'text-indent: {LiveCartaConfig.INDENT};'
|
||||||
|
|
||||||
if style:
|
if style:
|
||||||
p.attrs['style'] = style
|
p.attrs['style'] = style
|
||||||
@@ -488,7 +488,7 @@ class HTMLDocxPreprocessor:
|
|||||||
"""
|
"""
|
||||||
Function to convert all lower level headings to p tags
|
Function to convert all lower level headings to p tags
|
||||||
"""
|
"""
|
||||||
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||||
header_tags = self.body_tag.find_all(re.compile(pattern))
|
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
tag.name = 'p'
|
tag.name = 'p'
|
||||||
@@ -592,8 +592,8 @@ class HTMLDocxPreprocessor:
|
|||||||
if title == "":
|
if title == "":
|
||||||
tag.unwrap()
|
tag.unwrap()
|
||||||
else:
|
else:
|
||||||
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
|
||||||
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||||
|
|
||||||
content = list(tag.children)
|
content = list(tag.children)
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from typing import List, Tuple
|
|||||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||||
|
|
||||||
from access import Access
|
from access import Access
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
|
|
||||||
def save_image_locally(img_file_path, img_content, book_id):
|
def save_image_locally(img_file_path, img_content, book_id):
|
||||||
@@ -148,7 +148,7 @@ def _heading_tag2p_tag(body_tag):
|
|||||||
"""
|
"""
|
||||||
Function to convert all lower level headings to p tags
|
Function to convert all lower level headings to p tags
|
||||||
"""
|
"""
|
||||||
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||||
header_tags = body_tag.find_all(re.compile(pattern))
|
header_tags = body_tag.find_all(re.compile(pattern))
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
tag.name = 'p'
|
tag.name = 'p'
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import logging
|
|||||||
import re
|
import re
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
|
|
||||||
class LibraHTML2JSONConverter:
|
class LibraHTML2JSONConverter:
|
||||||
@@ -32,7 +32,7 @@ class LibraHTML2JSONConverter:
|
|||||||
|
|
||||||
:param ind: Index of header in content list.
|
:param ind: Index of header in content list.
|
||||||
"""
|
"""
|
||||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||||
title = str(self.content[ind])
|
title = str(self.content[ind])
|
||||||
title = title.replace(f'<{self.content[ind].name}>', '')
|
title = title.replace(f'<{self.content[ind].name}>', '')
|
||||||
title = title.replace(f'</{self.content[ind].name}>', '')
|
title = title.replace(f'</{self.content[ind].name}>', '')
|
||||||
@@ -49,7 +49,7 @@ class LibraHTML2JSONConverter:
|
|||||||
|
|
||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
# 1. next tag is a header
|
# 1. next tag is a header
|
||||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||||
# - recursion step until h_i > h_initial
|
# - recursion step until h_i > h_initial
|
||||||
if outline > curr_outline:
|
if outline > curr_outline:
|
||||||
@@ -102,13 +102,13 @@ class LibraHTML2JSONConverter:
|
|||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
res = {}
|
res = {}
|
||||||
|
|
||||||
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||||
res, ind = self.header_to_livecarta_chapter_item(ind)
|
res, ind = self.header_to_livecarta_chapter_item(ind)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
chapter_title = f'Untitled chapter {ch_num}'
|
chapter_title = f'Untitled chapter {ch_num}'
|
||||||
chapter = []
|
chapter = []
|
||||||
while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
|
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
|
||||||
if not self._is_empty_p_tag(self.content[ind]):
|
if not self._is_empty_p_tag(self.content[ind]):
|
||||||
chapter.append(self.format_html(str(self.content[ind])))
|
chapter.append(self.format_html(str(self.content[ind])))
|
||||||
ind += 1
|
ind += 1
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
class LawCartaConfig:
|
class LiveCartaConfig:
|
||||||
SUPPORTED_LEVELS = 5
|
SUPPORTED_LEVELS = 5
|
||||||
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
|
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
|
||||||
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
|
||||||
|
|||||||
Reference in New Issue
Block a user