forked from LiveCarta/BookConverter
epub converter: prettified and comments added
This commit is contained in:
@@ -63,10 +63,6 @@ def update_src_links_in_images(body_tag: Tag,
|
|||||||
return path2aws_path
|
return path2aws_path
|
||||||
|
|
||||||
|
|
||||||
def preprocess_figure():
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess_table(body_tag: BeautifulSoup):
|
def preprocess_table(body_tag: BeautifulSoup):
|
||||||
tables = body_tag.find_all("table")
|
tables = body_tag.find_all("table")
|
||||||
for table in tables:
|
for table in tables:
|
||||||
@@ -81,10 +77,7 @@ def preprocess_table(body_tag: BeautifulSoup):
|
|||||||
units = width_match.group(2)
|
units = width_match.group(2)
|
||||||
width = size+'px'
|
width = size+'px'
|
||||||
|
|
||||||
width = td.get('width') or width
|
td.attrs['width'] = td.get('width') or width
|
||||||
|
|
||||||
if width:
|
|
||||||
td.attrs['width'] = width
|
|
||||||
|
|
||||||
if td.attrs.get('style'):
|
if td.attrs.get('style'):
|
||||||
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
|
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
|
||||||
@@ -151,7 +144,7 @@ def clean_headings_content(content: Tag, title: str):
|
|||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def _preprocessing_headings(body_tag):
|
def _heading_tag2p_tag(body_tag):
|
||||||
"""
|
"""
|
||||||
Function to convert all lower level headings to p tags
|
Function to convert all lower level headings to p tags
|
||||||
"""
|
"""
|
||||||
@@ -184,8 +177,8 @@ def replace_with_livecarta_anchor_tag(anchor, i):
|
|||||||
return new_tag
|
return new_tag
|
||||||
|
|
||||||
|
|
||||||
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[
|
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
|
||||||
list, list, list]:
|
-> Tuple[list, list, list]:
|
||||||
"""
|
"""
|
||||||
This function should run earlier than adding fonts in the pipeline.
|
This function should run earlier than adding fonts in the pipeline.
|
||||||
|
|
||||||
@@ -248,6 +241,23 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
|
|
||||||
|
|
||||||
def unwrap_structural_tags(body_tag):
|
def unwrap_structural_tags(body_tag):
|
||||||
|
"""
|
||||||
|
Main function that works with structure of html.
|
||||||
|
Make changes inplace.
|
||||||
|
|
||||||
|
1. Extracts tags that are not needed
|
||||||
|
|
||||||
|
2. Checks that the marks pointing to the start of a chapter are all placed on one level of the html tree.
|
||||||
|
A mark is a tag with 'class': 'converter-chapter-mark'; it is added while the TOC is parsed.
|
||||||
|
This tag must have a body_tag as a parent.
|
||||||
|
Otherwise, it is wrapped with some tags. Like:
|
||||||
|
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
|
||||||
|
|
||||||
|
3. Headings that are not supported by livecarta are converted to <p>
|
||||||
|
4. Wrapping NavigableString
|
||||||
|
:param body_tag: Tag, soup object
|
||||||
|
:return: None
|
||||||
|
"""
|
||||||
|
|
||||||
def _preserve_class_in_aside_tag(tag_):
|
def _preserve_class_in_aside_tag(tag_):
|
||||||
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
|
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
|
||||||
@@ -362,8 +372,9 @@ def unwrap_structural_tags(body_tag):
|
|||||||
parents_marks_are_body = [x.parent == body_tag for x in marks]
|
parents_marks_are_body = [x.parent == body_tag for x in marks]
|
||||||
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
||||||
|
|
||||||
_preprocessing_headings(body_tag)
|
_heading_tag2p_tag(body_tag)
|
||||||
|
|
||||||
|
# wrap NavigableString with <p>
|
||||||
for node in body_tag:
|
for node in body_tag:
|
||||||
if isinstance(node, NavigableString):
|
if isinstance(node, NavigableString):
|
||||||
content = str(node)
|
content = str(node)
|
||||||
@@ -378,19 +389,28 @@ def unwrap_structural_tags(body_tag):
|
|||||||
|
|
||||||
|
|
||||||
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
||||||
|
"""
|
||||||
|
After processing a first_id that corresponds to the current chapter,
|
||||||
|
all tags of the current chapter are extracted from the initial html_soup
|
||||||
|
|
||||||
|
:param first_id: id that points to where a chapter starts; belongs to a Tag with class 'converter-chapter-mark'
|
||||||
|
:param href: name of current chapter's file
|
||||||
|
:param html_soup: soup object of current file
|
||||||
|
:return: list [Tag, NavigableString]; chapter's tags
|
||||||
|
"""
|
||||||
marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
|
marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
|
||||||
if marked_tags:
|
if marked_tags:
|
||||||
next_tag = marked_tags.next_sibling
|
next_tag = marked_tags.next_sibling
|
||||||
tags = []
|
tags = []
|
||||||
while next_tag:
|
while next_tag:
|
||||||
# TODO: why do we have a NavigableString here?
|
|
||||||
|
|
||||||
if not isinstance(next_tag, NavigableString) and\
|
if not isinstance(next_tag, NavigableString) and\
|
||||||
(next_tag.attrs.get('class') == 'converter-chapter-mark'):
|
(next_tag.attrs.get('class') == 'converter-chapter-mark'):
|
||||||
break
|
break
|
||||||
tags.append(next_tag)
|
tags.append(next_tag)
|
||||||
next_tag = next_tag.next_sibling
|
next_tag = next_tag.next_sibling
|
||||||
|
|
||||||
|
# remove tags between first_id and next found id
|
||||||
|
# save them in list for next steps
|
||||||
tags = [tag.extract() for tag in tags]
|
tags = [tag.extract() for tag in tags]
|
||||||
html_soup.smooth()
|
html_soup.smooth()
|
||||||
|
|
||||||
@@ -513,6 +533,14 @@ def preprocess_code_tags(chapter_tag):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
|
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
|
||||||
|
"""
|
||||||
|
Final processing/cleaning function.
|
||||||
|
|
||||||
|
:param title: title of the chapter
|
||||||
|
:param chapter_tag: soup object
|
||||||
|
:param remove_title_from_chapter: bool
|
||||||
|
:return: tuple[str, str]
|
||||||
|
"""
|
||||||
title_str = BeautifulSoup(title, features='lxml').string
|
title_str = BeautifulSoup(title, features='lxml').string
|
||||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||||
|
|||||||
Reference in New Issue
Block a user