diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index dc83b62..52a6744 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section from src.data_objects import ChapterItem, NavPoint from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ - preprocess_image, preprocess_footnotes - + update_src_links_in_images, preprocess_footnotes # epub3 examples: # https://github.com/IDPF/epub3-samples @@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ # поиск toc в epublib: # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap # если его там нет, пробуют искать nav tag в manifest -> EpubNav. +from src.util.css_reader import clean_css, add_inline_style_to_html_soup + class EpubPostprocessor: def __init__(self, file, access=None): self.file = file self.access = access self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib + # read images self.href2img_bytes = {} for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): file_name = x.file_name @@ -42,21 +44,26 @@ class EpubPostprocessor: # read html self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() + # read css + self.html_href2css_href = {} + self.css_href2content = {} + self.build_css_content() + # add css + self.add_css_styles2soup() + # read footnotes self.footnotes = [] for href in self.href2soup_html: - self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html, - noteref_attr_name='data-type')) - # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap - # если его там нет, пробуют искать nav tag в manifest -> EpubNav. 
это у epub3 (не тестировалось todo) + self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html)) + # read toc self.href2ids = defaultdict(list) self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.build_adjacency_list_from_toc(self.ebooklib_book.toc) - self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed - self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} - + # build simple toc from spine if needed if not self.is_toc_valid(): self.build_adjacency_list_from_spine() - + # read anchored blocks, split html into separate block + self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed + self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} self.build_anchor2soup() # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list @@ -68,12 +75,37 @@ class EpubPostprocessor: # todo: check if other chapters exist nodes = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - html_text = item.get_body_content() - soup = BeautifulSoup(html_text, features='lxml') + html_body_text = item.get_body_content() + soup = BeautifulSoup(html_body_text, features='lxml') nodes[item.file_name] = soup return nodes + def build_css_content(self): + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + html_text = item.content + soup = BeautifulSoup(html_text, features='lxml') + for tag in soup.find_all('link', attrs={"type": "text/css"}): + css_href = tag.attrs.get('href') + self.html_href2css_href[item.file_name] = css_href + if css_href not in self.css_href2content: + print(css_href) + css_content: str = self.ebooklib_book.get_item_with_href(css_href).get_content().decode() + self.css_href2content[css_href] = clean_css(css_content) + + for i, tag in enumerate(soup.find_all('style')): + css_content = tag.string + self.html_href2css_href[item.file_name] = f'href{i}' + 
self.css_href2content[f'href{i}'] = clean_css(css_content) + + def add_css_styles2soup(self): + for href in self.href2soup_html: + if self.html_href2css_href.get(href): + css: str = self.css_href2content[self.html_href2css_href[href]] + content = self.href2soup_html[href] + content = add_inline_style_to_html_soup(content, css) + self.href2soup_html[href] = content + def build_manifest_id2href(self): links = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): @@ -193,7 +225,7 @@ class EpubPostprocessor: else: content: BeautifulSoup = self.href2soup_html[node.href] - preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access) + update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access) title_preprocessed, content_preprocessed = prepare_title_and_content(title, content) sub_nodes = [] diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index ef372c3..d7ab675 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id return link -def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None): +def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None): img_tags = body_tag.find_all('img') for img in img_tags: @@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag): 'figure', 'footer', 'iframe', 'span', 'p' ] - divs = body_tag.find_all("div") - for div in divs: + for div in body_tag.find_all("div"): if div.contents: is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] if all(is_not_struct_tag): @@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag): continue div.unwrap() - secs = body_tag.find_all("section") - for s in secs: + for s in body_tag.find_all("section"): s.unwrap() - articles = body_tag.find_all("article") - for s in 
articles: + for s in body_tag.find_all("article"): s.unwrap() - articles = body_tag.find_all("main") - for s in articles: + for s in body_tag.find_all("aside"): + s.name = 'blockquote' + + for s in body_tag.find_all("main"): s.unwrap() - articles = body_tag.find_all("body") - for s in articles: + for s in body_tag.find_all("body"): s.unwrap() - articles = body_tag.find_all("html") - for s in articles: + for s in body_tag.find_all("html"): s.unwrap() - spans = body_tag.find_all("span") # not all cases, if span has
s and NavigableString, it won't unwrap
- for s in spans:
- if not s.string and s.contents:
+ for s in body_tag.find_all("span"):
+ if s.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
if all(is_not_struct_tag):
continue
s.unwrap()
+ _preprocessing_headings(body_tag)
+
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
@@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
_process_lists(content_tag)
_preprocessing_headings(content_tag)
- content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
+ # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
title_str = clean_title_from_numbering(title_str)
- return title_str, content_str
+ return title_str, str(content_tag)
diff --git a/src/util/css_reader.py b/src/util/css_reader.py
new file mode 100644
index 0000000..80a60b9
--- /dev/null
+++ b/src/util/css_reader.py
@@ -0,0 +1,168 @@
+import re
+
+from itertools import takewhile
+
+import cssutils
+from bs4 import BeautifulSoup
+from ebooklib import epub
+from premailer import transform
+
+from src.config import LawCartaConfig
+
+
+def convert_font_property(property):
+    """Drop the `font` shorthand property entirely.
+
+    Returning '' makes clean_css() blank the declaration: the shorthand
+    cannot be split into the supported longhand attributes here.
+    NOTE(review): the parameter shadows the `property` builtin — rename
+    once callers (the LIVECARTA_STYLE_ATTRS_MAPPING table) are updated.
+    """
+    return ''
+
+
+# Parallel lookup tables used by convert_font_size(): sizes_pr[i] is a
+# relative font size (em, or % divided by 100) and sizes_px[i] is the pixel
+# size it maps to.  The leading -1 sentinel guarantees takewhile() yields at
+# least one element for any positive input.  The two lists MUST stay the
+# same length — entries are matched by index.
+sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
+            1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
+            2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
+
+sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
+            '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
+            '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
+            '48px', '49px', '50px', '64px', '72px']
+
+
+def convert_font_size(value):
+ if 'pt' in value:
+ if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
+ return ''
+ else:
+ return value.replace('pt', 'px')
+
+ if value == '100%':
+ return ''
+ try:
+ if '%' in value:
+ value = float(value.replace('%', ''))
+ value = value / 100.0
+ elif 'em' in value:
+ value = float(value.replace('em', ''))
+ else:
+ return ''
+
+ if value > 5:
+ return ''
+
+ possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
+ last_possible_size_index = sizes_pr.index(possible_sizes[-1])
+ return sizes_px[last_possible_size_index]
+
+ except ValueError:
+ return ''
+
+
+# CSS properties LiveCarta keeps.  Key -> list of allowed values; an EMPTY
+# list means "keep any value".  clean_css() blanks every property not listed
+# here and every listed property whose value is outside its allowed list.
+LIVECARTA_STYLE_ATTRS = {
+    'text-indent': [],
+    'font-variant': ['small-caps'],
+    # the default alignment is dropped; only non-default alignments survive
+    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
+    'align': [],  # TODO confirm: not a standard CSS property — legacy HTML attribute?
+    'font': [],  # shorthand; stripped via convert_font_property in the mapping table
+    'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
+                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
+    'font-size': [],
+    'font-weight': ['bold', '600', '700', '800', '900'],
+    'font-style': ['italic'],
+    'text-decoration': ['underline', 'line-through'],
+    'text-decoration-line': ['underline', 'line-through'],
+    'vertical-align': ['super'],
+    'color': [],
+    'background-color': [],
+}
+LIVECARTA_STYLE_ATTRS_MAPPING = {
+ 'text-indent': lambda x: LawCartaConfig.INDENT,
+ 'font-variant': lambda x: x,
+ 'text-align': lambda x: x,
+ 'font': convert_font_property,
+ 'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()),
+ 'font-size': convert_font_size,
+}
+
+# Properties whose values should eventually be rendered as real tags
+# (<b>/<i>/<u>/<s>/<sup>) rather than inline styles.
+# NOTE(review): not referenced anywhere in this module — confirm it is
+# consumed elsewhere before relying on (or deleting) it.
+LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
+    'font-weight': ['bold', '600', '700', '800', '900'],
+    'font-style': ['italic'],
+    'text-decoration': ['underline', 'line-through'],
+    'text-decoration-line': ['underline', 'line-through'],
+    'vertical-align': ['super'],
+}
+'''
+FONT ->
+font-size:14pt; pt->px
+
+LATER:
+vertical-align: sub; o
+text-transform: uppercase;
+text-decoration-color: red;
+
+em, in, pt -> px
+'''
+
+
+def clean_css(css):
+ sheet = cssutils.parseString(css, validate=False)
+ for rule in sheet:
+ if rule.type == rule.STYLE_RULE:
+ for property_ in rule.style:
+
+ if property_.name not in LIVECARTA_STYLE_ATTRS:
+ rule.style[property_.name] = ''
+ # not remove based on property value
+ elif LIVECARTA_STYLE_ATTRS.get(property_.name):
+ tmp = property_.value.replace('\"', '')
+ if tmp in LIVECARTA_STYLE_ATTRS[property_.name]:
+ if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING:
+ func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name]
+ tmp = property_.value.replace('\"', '')
+ rule.style[property_.name] = func(tmp)
+ print(property_.name, rule.style[property_.name], )
+ else:
+ rule.style[property_.name] = ''
+ else:
+ if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING:
+ func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name]
+ tmp = property_.value.replace('\"', '')
+ rule.style[property_.name] = func(tmp)
+ print(property_.name, rule.style[property_.name], )
+
+ css_text = sheet._getCssText().decode()
+ return css_text
+
+
+def style_property2livecarta_convention(style_str):
+    """Normalise an inline style string to LiveCarta conventions.
+
+    Currently the identity function — kept as an extension hook called by
+    add_inline_style_to_html_soup().
+    """
+    return style_str
+
+
+def add_inline_style_to_html_soup(soup1, css_text):
+ livecarta_p_ids = []
+ h_regex = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
+ for i, x in enumerate(soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)'))):
+ x.attrs['livecarta_id'] = i
+ livecarta_p_ids.append(i)
+
+ html_with_inline_style = transform(str(soup1), css_text=css_text, remove_classes=False, external_styles=False,
+ disable_validation=True)
+ soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
+
+ for i in livecarta_p_ids:
+ tag = soup1.find(attrs={'livecarta_id': i})
+ tag_with_style = soup2.find(attrs={'livecarta_id': i})
+ if tag_with_style.attrs.get('style'):
+ style = tag_with_style.attrs.get('style') + ';'
+ tag.attrs['style'] = style_property2livecarta_convention(style)
+ del tag.attrs['livecarta_id']
+
+ return soup1
+
+
+if __name__ == '__main__':
+    # Ad-hoc manual test: clean the stylesheet of a sample EPUB and inline
+    # it into one chapter.  NOTE(review): hard-coded developer-local path —
+    # replace with a CLI argument before this module is shared.
+    file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
+    ebooklib_book = epub.read_epub(file)
+    css_ = ebooklib_book.get_item_with_href('css/epub.css')
+    css_ = css_.get_content().decode()
+    css_cleaned = clean_css(css_)
+    html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode()
+    html_soup = BeautifulSoup(html_, features='lxml')
+
+    print(add_inline_style_to_html_soup(html_soup, css_cleaned))