Make preprocessing of inline css

This commit is contained in:
Kiryl
2022-06-06 16:36:14 +03:00
parent 0d1ec03f57
commit 002316f086
5 changed files with 669 additions and 581 deletions

View File

@@ -17,7 +17,8 @@ from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content
from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\
prepare_title, prepare_content, update_images_src_links, preprocess_footnotes
@@ -68,6 +69,8 @@ class EpubConverter:
BeautifulSoup] = self.build_href2soup_content()
# TODO Presets
self.logger.log('Process CSS inline styles.')
self.process_inline_styles_in_html_soup()
self.logger.log('CSS files processing.')
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
self.logger.log('CSS styles adding.')
@@ -106,7 +109,7 @@ class EpubConverter:
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
# using EpubElements
# for now just for HTML objects, as it is simplest chapter
# for now just for HTML objects, as it is the simplest chapter
nodes = dict()
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
@@ -122,6 +125,7 @@ class EpubConverter:
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
# the css file may itself pull in another css file via @import
if "@import" in str(css_obj.content):
path_to_css_from_root = "css/" + \
re.search('"(.*)"', str(css_obj.content)).group(1)
@@ -131,12 +135,26 @@ class EpubConverter:
css_content: str = css_obj.get_content().decode()
return css_content
def process_inline_styles_in_html_soup(self):
    """
    Function is designed to convert inline html styles.

    Every html soup stored in self.html_href2html_body_soup is scanned for
    tags whose name matches the tag-name pattern below and which carry a
    `style` attribute. The value of that attribute is replaced with the
    result of build_inline_style_content(<original style string>).
    The soups are modified in place.

    Returns
    -------
    None
    """
    # both patterns do not depend on the document, so they are compiled once
    # instead of once per html file
    could_have_style_in_livecarta_regexp = re.compile(
        '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
    # '.*' matches any attribute value: it only requires `style` to be present
    any_style_value_regexp = re.compile('.*')

    for html_content in self.html_href2html_body_soup.values():
        tags_with_inline_style = html_content.find_all(
            could_have_style_in_livecarta_regexp,
            attrs={'style': any_style_value_regexp})
        for tag_with_inline_style in tags_with_inline_style:
            tag_with_inline_style.attrs['style'] = build_inline_style_content(
                tag_with_inline_style.attrs['style'])
def build_html_and_css_relations(self) -> tuple[dict, dict]:
"""
Function is designed to get 2 dictionaries:
The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them
The first is html_href2css_href. It is created to connect href of html to css files(hrefs of them
) which are used on this html
The second is css_href2css_content. It is created to connect href of css to content of css
...2... = key2value
Returns
----------
@@ -154,26 +172,27 @@ class EpubConverter:
soup_html_content = BeautifulSoup(html_content, features='lxml')
# check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
# alternate page of original page (e.g. another language)
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
continue
css_href = tag.attrs.get('href')
html_href2css_href[html_href].append(css_href)
if css_href not in css_href2css_content:
# css_href not in css_href2css_content, add to this dict
css_href2css_content[css_href] = build_css_content(
css_href2css_content[css_href] = build_css_file_content(
self.get_css_content(css_href, html_href))
for i, tag in enumerate(soup_html_content.find_all('style')):
css_content = tag.string
html_href2css_href[html_href].append(f'href{i}')
css_href2css_content[f'href{i}'] = build_css_content(
css_href2css_content[f'href{i}'] = build_css_file_content(
css_content)
return html_href2css_href, css_href2css_content
def add_css_styles_to_html_soup(self):
"""
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
- add to html_inline_style css_style_content
"""
for html_href in self.html_href2html_body_soup:
@@ -181,9 +200,9 @@ class EpubConverter:
css = ''
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
content: BeautifulSoup = self.html_href2html_body_soup[html_href]
content = convert_html_soup_with_css_style(content, css)
self.html_href2html_body_soup[html_href] = content
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href]
html_content = convert_html_soup_with_css_style(html_content, css)
self.html_href2html_body_soup[html_href] = html_content
def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0):
"""
@@ -191,7 +210,7 @@ class EpubConverter:
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
key = -1 if root(top chapters),
value = None if leaf(least chapters)
value = None if leaf(the least chapters)
Parameters
----------
element: [Link, tuple, list]
@@ -299,8 +318,7 @@ class EpubConverter:
# go to line structure
for html_href in self.html_href2html_body_soup:
soup = self.html_href2html_body_soup[html_href]
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(
soup)
self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup)
@staticmethod
def create_unique_id(href, id_):
@@ -314,7 +332,7 @@ class EpubConverter:
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]:
"""
Function used to find full path to file that is parsed from tag link
TOC: a/b/c.xhtml
@@ -327,7 +345,7 @@ class EpubConverter:
href_in_link: str
filename got from tag link, like file1.xhtml
internal_link_tag: Tag
tag object that is parsed now
object that is parsed now
Returns
-------
@@ -362,6 +380,10 @@ class EpubConverter:
1. rebuild ids to be unique in all documents
2a. process anchor which is a whole xhtml file
2b. process anchor which is an element in xhtml file
Returns
-------
None
process links in html
"""
# 1. rebuild ids to be unique in all documents
@@ -393,14 +415,14 @@ class EpubConverter:
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
# insert a new span to the begin of the file
# insert a new span to the beginning of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href']
# 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)')
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
@@ -459,7 +481,7 @@ class EpubConverter:
id wraps chapter's content + subchapters' content
id points to the start of title of a chapter
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id
and id of the next chapter/subchapter
Parameters
----------
@@ -504,7 +526,8 @@ class EpubConverter:
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file_path.stem if hasattr(self.file_path, 'stem') else 'book_id')
book_id=self.file_path.stem
if hasattr(self.file_path, 'stem') else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)