diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index dc83b62..52a6744 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section from src.data_objects import ChapterItem, NavPoint from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \ - preprocess_image, preprocess_footnotes - + update_src_links_in_images, preprocess_footnotes # epub3 examples: # https://github.com/IDPF/epub3-samples @@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ # поиск toc в epublib: # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap # если его там нет, пробуют искать nav tag в manifest -> EpubNav. +from src.util.css_reader import clean_css, add_inline_style_to_html_soup + class EpubPostprocessor: def __init__(self, file, access=None): self.file = file self.access = access self.ebooklib_book = epub.read_epub(file) # todo: log error from ebooklib + # read images self.href2img_bytes = {} for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE): file_name = x.file_name @@ -42,21 +44,26 @@ class EpubPostprocessor: # read html self.id_anchor_exist_in_nav_points = False self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content() + # read css + self.html_href2css_href = {} + self.css_href2content = {} + self.build_css_content() + # add css + self.add_css_styles2soup() + # read footnotes self.footnotes = [] for href in self.href2soup_html: - self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html, - noteref_attr_name='data-type')) - # если в content.opf есть в spine toc атрибут -> можно найти ncx файл -> из него достать navMap - # если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo) + self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html)) + # read toc self.href2ids = defaultdict(list) self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # k = -1 if root, v = None if leaf self.build_adjacency_list_from_toc(self.ebooklib_book.toc) - self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed - self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} - + # build simple toc from spine if needed if not self.is_toc_valid(): self.build_adjacency_list_from_spine() - + # read anchored blocks, split html into separate block + self.mark_and_line_href2soup_html() # used only after parsed toc, ids from toc needed + self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {} self.build_anchor2soup() # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list @@ -68,12 +75,37 @@ class EpubPostprocessor: # todo: check if other chapters exist nodes = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - html_text = item.get_body_content() - soup = BeautifulSoup(html_text, features='lxml') + html_body_text = item.get_body_content() + soup = BeautifulSoup(html_body_text, features='lxml') nodes[item.file_name] = soup return nodes + def build_css_content(self): + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + html_text = item.content + soup = BeautifulSoup(html_text, features='lxml') + for tag in soup.find_all('link', attrs={"type": "text/css"}): + css_href = tag.attrs.get('href') + self.html_href2css_href[item.file_name] = css_href + if css_href not in self.css_href2content: + print(css_href) + css_content: str = self.ebooklib_book.get_item_with_href(css_href).get_content().decode() + self.css_href2content[css_href] = clean_css(css_content) + + for i, tag in enumerate(soup.find_all('style')): + css_content = tag.string + self.html_href2css_href[item.file_name] = f'href{i}' + self.css_href2content[f'href{i}'] = clean_css(css_content) + + def add_css_styles2soup(self): + for href in self.href2soup_html: + if self.html_href2css_href.get(href): + css: str = self.css_href2content[self.html_href2css_href[href]] + content = self.href2soup_html[href] + content = add_inline_style_to_html_soup(content, css) + self.href2soup_html[href] = content + def build_manifest_id2href(self): links = dict() for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): @@ -193,7 +225,7 @@ class EpubPostprocessor: else: content: BeautifulSoup = self.href2soup_html[node.href] - preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access) + update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access) title_preprocessed, content_preprocessed = prepare_title_and_content(title, content) sub_nodes = [] diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index ef372c3..d7ab675 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id return link -def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None): +def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None): img_tags = body_tag.find_all('img') for img in img_tags: @@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag): 'figure', 'footer', 'iframe', 'span', 'p' ] - divs = body_tag.find_all("div") - for div in divs: + for div in body_tag.find_all("div"): if div.contents: is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents] if all(is_not_struct_tag): @@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag): continue div.unwrap() - secs = body_tag.find_all("section") - for s in secs: + for s in body_tag.find_all("section"): s.unwrap() - articles = body_tag.find_all("article") - for s in articles: + for s in body_tag.find_all("article"): s.unwrap() - articles = body_tag.find_all("main") - for s in articles: + for s in body_tag.find_all("aside"): + s.name = 'blockquote' + + for s in body_tag.find_all("main"): s.unwrap() - articles = body_tag.find_all("body") - for s in articles: + for s in body_tag.find_all("body"): s.unwrap() - articles = body_tag.find_all("html") - for s in articles: + for s in body_tag.find_all("html"): s.unwrap() - spans = body_tag.find_all("span") # not all cases, if span has

s and NavigableString, it won't unwrap - for s in spans: - if not s.string and s.contents: + for s in body_tag.find_all("span"): + if s.contents: is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents] if all(is_not_struct_tag): continue s.unwrap() + _preprocessing_headings(body_tag) + for node in body_tag: if isinstance(node, NavigableString): content = str(node) @@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup): _process_lists(content_tag) _preprocessing_headings(content_tag) - content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) + # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag)) title_str = clean_title_from_numbering(title_str) - return title_str, content_str + return title_str, str(content_tag) diff --git a/src/util/css_reader.py b/src/util/css_reader.py new file mode 100644 index 0000000..80a60b9 --- /dev/null +++ b/src/util/css_reader.py @@ -0,0 +1,168 @@ +import re + +from itertools import takewhile + +import cssutils +from bs4 import BeautifulSoup +from ebooklib import epub +from premailer import transform + +from src.config import LawCartaConfig + + +def convert_font_property(property): + return '' + + +sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, + 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, + 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] + +sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px', + '22px', + '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', '35px', + '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', + '49px', '50px', '64px', '72px'] + + +def convert_font_size(value): + if 'pt' in value: + if int(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE: + return '' + else: + return value.replace('pt', 'px') + + if value == '100%': + return '' + try: + if '%' in value: + value = float(value.replace('%', '')) + value = value / 100.0 + elif 'em' in value: + value = float(value.replace('em', '')) + else: + return '' + + if value > 5: + return '' + + possible_sizes = list(takewhile(lambda x: value > x, sizes_pr)) + last_possible_size_index = sizes_pr.index(possible_sizes[-1]) + return sizes_px[last_possible_size_index] + + except ValueError: + return '' + + +LIVECARTA_STYLE_ATTRS = { + 'text-indent': [], + 'font-variant': ['small-caps'], + 'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE], + 'align': [], # ??? + 'font': [], # ??? + 'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys() + if x != LawCartaConfig.DEFAULT_FONT_NAME], + 'font-size': [], + 'font-weight': ['bold', '600', '700', '800', '900'], # + 'font-style': ['italic'], # + 'text-decoration': ['underline', 'line-through'], # , + 'text-decoration-line': ['underline', 'line-through'], # , + 'vertical-align': ['super'], # + 'color': [], + 'background-color': [], +} +LIVECARTA_STYLE_ATTRS_MAPPING = { + 'text-indent': lambda x: LawCartaConfig.INDENT, + 'font-variant': lambda x: x, + 'text-align': lambda x: x, + 'font': convert_font_property, + 'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()), + 'font-size': convert_font_size, +} + +LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { + 'font-weight': ['bold', '600', '700', '800', '900'], # + 'font-style': ['italic'], # + 'text-decoration': ['underline', 'line-through'], # , + 'text-decoration-line': ['underline', 'line-through'], # , + 'vertical-align': ['super'], # +} +''' +FONT -> +font-size:14pt; pt->px + +LATER: +vertical-align: sub; o +text-transform: uppercase; +text-decoration-color: red; + +em, in, pt -> px +''' + + +def clean_css(css): + sheet = cssutils.parseString(css, validate=False) + for rule in sheet: + if rule.type == rule.STYLE_RULE: + for property_ in rule.style: + + if property_.name not in LIVECARTA_STYLE_ATTRS: + rule.style[property_.name] = '' + # not remove based on property value + elif LIVECARTA_STYLE_ATTRS.get(property_.name): + tmp = property_.value.replace('\"', '') + if tmp in LIVECARTA_STYLE_ATTRS[property_.name]: + if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING: + func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name] + tmp = property_.value.replace('\"', '') + rule.style[property_.name] = func(tmp) + print(property_.name, rule.style[property_.name], ) + else: + rule.style[property_.name] = '' + else: + if property_.name in LIVECARTA_STYLE_ATTRS_MAPPING: + func = LIVECARTA_STYLE_ATTRS_MAPPING[property_.name] + tmp = property_.value.replace('\"', '') + rule.style[property_.name] = func(tmp) + print(property_.name, rule.style[property_.name], ) + + css_text = sheet._getCssText().decode() + return css_text + + +def style_property2livecarta_convention(style_str): + return style_str + + +def add_inline_style_to_html_soup(soup1, css_text): + livecarta_p_ids = [] + h_regex = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$' + for i, x in enumerate(soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)'))): + x.attrs['livecarta_id'] = i + livecarta_p_ids.append(i) + + html_with_inline_style = transform(str(soup1), css_text=css_text, remove_classes=False, external_styles=False, + disable_validation=True) + soup2 = BeautifulSoup(html_with_inline_style, features='lxml') + + for i in livecarta_p_ids: + tag = soup1.find(attrs={'livecarta_id': i}) + tag_with_style = soup2.find(attrs={'livecarta_id': i}) + if tag_with_style.attrs.get('style'): + style = tag_with_style.attrs.get('style') + ';' + tag.attrs['style'] = style_property2livecarta_convention(style) + del tag.attrs['livecarta_id'] + + return soup1 + + +if __name__ == '__main__': + file = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub' + ebooklib_book = epub.read_epub(file) + css_ = ebooklib_book.get_item_with_href('css/epub.css') + css_ = css_.get_content().decode() + css_cleaned = clean_css(css_) + html_ = ebooklib_book.get_item_with_href('pr01s05.xhtml').get_body_content().decode() + html_soup = BeautifulSoup(html_, features='lxml') + + print(add_inline_style_to_html_soup(html_soup, css_cleaned))