epub converter: add css processing

2021-04-22 17:26:17 +03:00
parent 8f284651c4
commit e0e64a0c38
3 changed files with 229 additions and 31 deletions
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -10,8 +10,7 @@ from ebooklib.epub import Link, Section

 from src.data_objects import ChapterItem, NavPoint
 from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_ids, prepare_title_and_content, \
-    preprocess_image, preprocess_footnotes
-
+    update_src_links_in_images, preprocess_footnotes

 # epub3 examples:
 # https://github.com/IDPF/epub3-samples
@@ -27,12 +26,15 @@ from src.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_
 # поиск toc в epublib:
 # если в content.opf есть в spine toc атрибут  -> можно найти ncx файл -> из него достать navMap
 # если его там нет, пробуют искать nav tag в manifest -> EpubNav.
+from src.util.css_reader import clean_css, add_inline_style_to_html_soup
+

 class EpubPostprocessor:
    def __init__(self, file, access=None):
        self.file = file
        self.access = access
        self.ebooklib_book = epub.read_epub(file)  # todo: log error from ebooklib
+        # read images
        self.href2img_bytes = {}
        for x in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE):
            file_name = x.file_name
@@ -42,21 +44,26 @@ class EpubPostprocessor:
        # read html
        self.id_anchor_exist_in_nav_points = False
        self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
+        # read css
+        self.html_href2css_href = {}
+        self.css_href2content = {}
+        self.build_css_content()
+        # add css
+        self.add_css_styles2soup()
+        # read footnotes
        self.footnotes = []
        for href in self.href2soup_html:
-            self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html,
-                                                       noteref_attr_name='data-type'))
-        # если в content.opf есть в spine toc атрибут  -> можно найти ncx файл -> из него достать navMap
-        # если его там нет, пробуют искать nav tag в manifest -> EpubNav. это у epub3 (не тестировалось todo)
+            self.footnotes.extend(preprocess_footnotes(self.href2soup_html[href], self.href2soup_html))
+        # read toc
        self.href2ids = defaultdict(list)
        self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}  # k = -1 if root, v = None if leaf
        self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
-        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
-        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
-
+        # build simple toc from spine if needed
        if not self.is_toc_valid():
            self.build_adjacency_list_from_spine()
-
+        # read anchored blocks, split html into separate block
+        self.mark_and_line_href2soup_html()  # used only after parsed toc, ids from toc needed
+        self.id_anchor2soup: Dict[tuple, BeautifulSoup] = {}
        self.build_anchor2soup()

        # if not self.is_all_html_epub_items_added(): # not all hrefs in adjacency_list
@@ -68,12 +75,37 @@ class EpubPostprocessor:
        # todo: check if other chapters exist
        nodes = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
-            html_text = item.get_body_content()
-            soup = BeautifulSoup(html_text, features='lxml')
+            html_body_text = item.get_body_content()
+            soup = BeautifulSoup(html_body_text, features='lxml')
            nodes[item.file_name] = soup

        return nodes

+    def build_css_content(self):
+        """Collect the cleaned CSS used by every HTML document of the book.
+
+        Fills ``self.css_href2content`` (css key -> CSS text returned by ``clean_css``)
+        and ``self.html_href2css_href`` (document file name -> css key).
+        Only one key is kept per document: a later ``<link>`` replaces an earlier one
+        and an inline ``<style>`` replaces any linked stylesheet.
+        """
+        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+            # item.content (rather than get_body_content()) is parsed so <link>/<style> tags can be found
+            soup = BeautifulSoup(item.content, features='lxml')
+            for tag in soup.find_all('link', attrs={"type": "text/css"}):
+                css_href = tag.attrs.get('href')
+                if not css_href:
+                    continue
+                if css_href not in self.css_href2content:
+                    # NOTE(review): the href is looked up exactly as written in the document - confirm that
+                    # hrefs relative to a nested document still match an item of the book
+                    css_item = self.ebooklib_book.get_item_with_href(css_href)
+                    if css_item is None:
+                        # stylesheet is referenced but not present in the book: leave the document unstyled
+                        continue
+                    self.css_href2content[css_href] = clean_css(css_item.get_content().decode())
+                # registered only once the content is stored, so a later lookup by this key cannot fail
+                self.html_href2css_href[item.file_name] = css_href
+
+            for i, tag in enumerate(soup.find_all('style')):
+                css_content = tag.string
+                if not css_content:
+                    # empty <style> (or one without a single text child): nothing to register
+                    continue
+                # the key is scoped by the document name so that inline styles of different
+                # documents do not overwrite each other
+                css_key = f'{item.file_name}#style{i}'
+                self.html_href2css_href[item.file_name] = css_key
+                self.css_href2content[css_key] = clean_css(css_content)
+
+    def add_css_styles2soup(self):
+        """Inline the registered CSS into every parsed document that has a stylesheet key.
+
+        Documents without an entry in ``self.html_href2css_href`` are left untouched;
+        for the others the soup stored in ``self.href2soup_html`` is replaced by the
+        result of ``add_inline_style_to_html_soup``.
+        """
+        for html_href, soup in self.href2soup_html.items():
+            css_key = self.html_href2css_href.get(html_href)
+            if not css_key:
+                continue
+            # only the value of an existing key is replaced, so the dict is safe to iterate meanwhile
+            self.href2soup_html[html_href] = add_inline_style_to_html_soup(soup, self.css_href2content[css_key])
+
    def build_manifest_id2href(self):
        links = dict()
        for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
@@ -193,7 +225,7 @@ class EpubPostprocessor:
        else:
            content: BeautifulSoup = self.href2soup_html[node.href]

-        preprocess_image(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
+        update_src_links_in_images(content, self.href2img_bytes, path_to_html=node.href, access=self.access)
        title_preprocessed, content_preprocessed = prepare_title_and_content(title, content)

        sub_nodes = []
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -27,7 +27,7 @@ def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id
    return link


-def preprocess_image(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
+def update_src_links_in_images(body_tag: Tag, href2img_content: dict, path_to_html, access=None):
    img_tags = body_tag.find_all('img')

    for img in img_tags:
@@ -189,8 +189,7 @@ def unwrap_structural_tags(body_tag):
        'figure', 'footer', 'iframe', 'span', 'p'
    ]

-    divs = body_tag.find_all("div")
-    for div in divs:
+    for div in body_tag.find_all("div"):
        if div.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
            if all(is_not_struct_tag):
@@ -198,35 +197,34 @@ def unwrap_structural_tags(body_tag):
                continue
        div.unwrap()

-    secs = body_tag.find_all("section")
-    for s in secs:
+    for s in body_tag.find_all("section"):
        s.unwrap()

-    articles = body_tag.find_all("article")
-    for s in articles:
+    for s in body_tag.find_all("article"):
        s.unwrap()

-    articles = body_tag.find_all("main")
-    for s in articles:
+    for s in body_tag.find_all("aside"):
+        s.name = 'blockquote'
+
+    for s in body_tag.find_all("main"):
        s.unwrap()

-    articles = body_tag.find_all("body")
-    for s in articles:
+    for s in body_tag.find_all("body"):
        s.unwrap()

-    articles = body_tag.find_all("html")
-    for s in articles:
+    for s in body_tag.find_all("html"):
        s.unwrap()

-    spans = body_tag.find_all("span")
    # not all cases, if span has <p>s and NavigableString, it won't unwrap
-    for s in spans:
-        if not s.string and s.contents:
+    for s in body_tag.find_all("span"):
+        if s.contents:
            is_not_struct_tag = [child.name not in structural_tags_names for child in s.contents]
            if all(is_not_struct_tag):
                continue
        s.unwrap()

+    _preprocessing_headings(body_tag)
+
    for node in body_tag:
        if isinstance(node, NavigableString):
            content = str(node)
@@ -278,6 +276,6 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
    _process_lists(content_tag)
    _preprocessing_headings(content_tag)

-    content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
+    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)
-    return title_str, content_str
+    return title_str, str(content_tag)
--- a/src/util/css_reader.py
+++ b/src/util/css_reader.py
@@ -0,0 +1,168 @@
+import re
+
+from itertools import takewhile
+
+import cssutils
+from bs4 import BeautifulSoup
+from ebooklib import epub
+from premailer import transform
+
+from src.config import LawCartaConfig
+
+
+def convert_font_property(property):
+    """Converter registered for the ``font`` property: always returns an empty string.
+
+    :param property: value of the ``font`` property (currently ignored)
+    :return: ''
+    """
+    return ''
+
+
+# Lookup tables used by convert_font_size: sizes_pr holds ascending relative-size thresholds
+# (em value, or percentage / 100) and sizes_px holds the px string returned for the same index.
+sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
+            1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
+            2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
+
+sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
+            '22px',
+            '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', '35px',
+            '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px',
+            '49px', '50px', '64px', '72px']
+
+
+def convert_font_size(value):
+    """Convert a CSS ``font-size`` value to the px value used in the output.
+
+    * ``pt`` values: '' when equal to ``LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE``,
+      otherwise the same number with the unit replaced by ``px``.
+    * ``%`` and ``em`` values: mapped through the ``sizes_pr`` / ``sizes_px`` tables.
+    * anything else (other units, keywords, unparsable numbers, ratios above 5
+      or not above the first table entry): ''.
+
+    :param value: font-size value as a string, e.g. '12pt', '120%', '1.5em'
+    :return: px string such as '18px', or '' when no size is produced
+    """
+    try:
+        if 'pt' in value:
+            # float() instead of int(): fractional sizes such as '10.5pt' must not raise
+            if float(value.replace('pt', '')) == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
+                return ''
+            return value.replace('pt', 'px')
+
+        if value == '100%':
+            return ''
+        if '%' in value:
+            ratio = float(value.replace('%', '')) / 100.0
+        elif 'em' in value:
+            ratio = float(value.replace('em', ''))
+        else:
+            return ''
+    except ValueError:
+        return ''
+
+    if ratio > 5:
+        return ''
+
+    # sizes_pr is ascending, so the entries below `ratio` form a prefix; its length gives the index
+    smaller_count = len(list(takewhile(lambda x: ratio > x, sizes_pr)))
+    if not smaller_count:
+        # ratio is not above the first table entry (e.g. a negative size): nothing to map to
+        return ''
+    return sizes_px[smaller_count - 1]
+
+
+# CSS properties kept by clean_css. A non-empty list restricts the property to the listed values;
+# an empty list keeps the property whatever its value is.
+LIVECARTA_STYLE_ATTRS = {
+    'text-indent': [],
+    'font-variant': ['small-caps'],
+    'text-align': [x for x in LawCartaConfig.ALIGN_STYLES if x != LawCartaConfig.DEFAULT_ALIGN_STYLE],
+    'align': [],  # ???
+    'font': [],  # ???
+    'font-family': [x for x in LawCartaConfig.font_correspondence_table.keys()
+                    if x != LawCartaConfig.DEFAULT_FONT_NAME],
+    'font-size': [],
+    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
+    'font-style': ['italic'],  # <i>
+    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
+    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
+    'vertical-align': ['super'],  # <sup>
+    'color': [],
+    'background-color': [],
+}
+# Value converters applied by clean_css to the kept properties: property name -> function(value) -> new value.
+LIVECARTA_STYLE_ATTRS_MAPPING = {
+    'text-indent': lambda x: LawCartaConfig.INDENT,
+    'font-variant': lambda x: x,
+    'text-align': lambda x: x,
+    'font': convert_font_property,
+    'font-family': lambda x: LawCartaConfig.font_correspondence_table.get(x.capitalize()),
+    'font-size': convert_font_size,
+}
+
+# Property values that, per the inline comments, have a dedicated tag equivalent.
+# NOTE(review): not referenced anywhere in the visible code - confirm where it is meant to be used.
+LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
+    'font-weight': ['bold', '600', '700', '800', '900'],  # <strong>
+    'font-style': ['italic'],  # <i>
+    'text-decoration': ['underline', 'line-through'],  # <u> , <s>
+    'text-decoration-line': ['underline', 'line-through'],  # <u> , <s>
+    'vertical-align': ['super'],  # <sup>
+}
+# Developer notes on conversions, kept as a bare string expression (it is not assigned to anything).
+'''
+FONT  -> <span>
+font-size:14pt; pt->px
+
+LATER:
+vertical-align: sub;  <span style="font-size:10px">o</span>
+text-transform: uppercase;
+text-decoration-color: red;
+
+em, in, pt -> px
+'''
+
+
+def clean_css(css):
+    """Reduce a stylesheet to the declarations listed in ``LIVECARTA_STYLE_ATTRS``.
+
+    For every style rule:
+
+    * a property that is not a key of ``LIVECARTA_STYLE_ATTRS`` gets an empty value;
+    * a property whose allowed-value list is non-empty gets an empty value unless its
+      value (double quotes stripped) is in that list;
+    * every remaining property is passed through its converter from
+      ``LIVECARTA_STYLE_ATTRS_MAPPING`` when one is registered, otherwise left as is.
+
+    :param css: stylesheet source text; empty or ``None`` input yields ''
+    :return: the serialized stylesheet as ``str``
+    """
+    if not css:
+        return ''
+
+    sheet = cssutils.parseString(css, validate=False)
+    for rule in sheet:
+        if rule.type != rule.STYLE_RULE:
+            continue
+        # iterate over a snapshot: declarations are reassigned/cleared inside the loop
+        for property_ in list(rule.style):
+            name = property_.name
+            if name not in LIVECARTA_STYLE_ATTRS:
+                # an empty value is assigned to drop the declaration
+                rule.style[name] = ''
+                continue
+
+            value = property_.value.replace('"', '')
+            allowed_values = LIVECARTA_STYLE_ATTRS[name]
+            if allowed_values and value not in allowed_values:
+                rule.style[name] = ''
+                continue
+
+            convert = LIVECARTA_STYLE_ATTRS_MAPPING.get(name)
+            if convert is not None:
+                # NOTE(review): a converter may return '' or None (e.g. unknown font family),
+                # which presumably drops the declaration as well - confirm
+                rule.style[name] = convert(value)
+
+    # public accessor for the serialized sheet (bytes), instead of the private _getCssText()
+    return sheet.cssText.decode()
+
+
+def style_property2livecarta_convention(style_str):
+    """Return ``style_str`` unchanged.
+
+    Called by ``add_inline_style_to_html_soup`` on every inline style string
+    before it is stored on the tag.
+    """
+    return style_str
+
+
+def add_inline_style_to_html_soup(soup1, css_text):
+    """Copy the styles that ``css_text`` assigns to p/span/li/ul tags into their ``style`` attribute.
+
+    The tags are numbered with a temporary ``livecarta_id`` attribute, the serialized
+    document is inlined with premailer's ``transform``, and the resulting ``style``
+    values are copied back onto the original tags by id. Only the ``style`` attribute
+    of ``soup1`` tags is changed; the temporary attribute is removed again.
+
+    :param soup1: parsed document, modified in place
+    :param css_text: stylesheet text to inline
+    :return: ``soup1``
+    """
+    marked_tags = soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)'))
+    for i, tag in enumerate(marked_tags):
+        tag.attrs['livecarta_id'] = i
+
+    html_with_inline_style = transform(str(soup1), css_text=css_text, remove_classes=False, external_styles=False,
+                                       disable_validation=True)
+    soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
+
+    # one pass over the styled document instead of a full-tree search per id;
+    # setdefault keeps the first tag for an id, like find() did
+    styled_by_id = {}
+    for styled_tag in soup2.find_all(attrs={'livecarta_id': True}):
+        styled_by_id.setdefault(styled_tag.attrs['livecarta_id'], styled_tag)
+
+    for i, tag in enumerate(marked_tags):
+        # ids come back as strings after the serialize/parse round trip
+        tag_with_style = styled_by_id.get(str(i))
+        if tag_with_style is not None and tag_with_style.attrs.get('style'):
+            style = tag_with_style.attrs.get('style') + ';'
+            tag.attrs['style'] = style_property2livecarta_convention(style)
+        del tag.attrs['livecarta_id']
+
+    return soup1
+
+
+if __name__ == '__main__':
+    # Manual run: clean one stylesheet of a local sample book and print one chapter with inlined styles.
+    sample_path = '/home/katerina/PycharmProjects/Jenia/converter/epub/accessible_epub_3.epub'
+    book = epub.read_epub(sample_path)
+    raw_css = book.get_item_with_href('css/epub.css').get_content().decode()
+    cleaned_css = clean_css(raw_css)
+    chapter_item = book.get_item_with_href('pr01s05.xhtml')
+    chapter_soup = BeautifulSoup(chapter_item.get_body_content().decode(), features='lxml')
+
+    print(add_inline_style_to_html_soup(chapter_soup, cleaned_css))