forked from LiveCarta/BookConverter
epub converter: prettified and comments added
This commit is contained in:
@@ -63,10 +63,6 @@ def update_src_links_in_images(body_tag: Tag,
|
||||
return path2aws_path
|
||||
|
||||
|
||||
def preprocess_figure():
    """Placeholder hook for <figure> preprocessing; currently a no-op.

    Kept in the pipeline so a figure-handling step can be slotted in
    later without changing call sites.

    :return: None
    """
    pass
|
||||
|
||||
|
||||
def preprocess_table(body_tag: BeautifulSoup):
|
||||
tables = body_tag.find_all("table")
|
||||
for table in tables:
|
||||
@@ -81,10 +77,7 @@ def preprocess_table(body_tag: BeautifulSoup):
|
||||
units = width_match.group(2)
|
||||
width = size+'px'
|
||||
|
||||
width = td.get('width') or width
|
||||
|
||||
if width:
|
||||
td.attrs['width'] = width
|
||||
td.attrs['width'] = td.get('width') or width
|
||||
|
||||
if td.attrs.get('style'):
|
||||
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
|
||||
@@ -151,7 +144,7 @@ def clean_headings_content(content: Tag, title: str):
|
||||
break
|
||||
|
||||
|
||||
def _preprocessing_headings(body_tag):
|
||||
def _heading_tag2p_tag(body_tag):
|
||||
"""
|
||||
Function to convert all lower level headings to p tags
|
||||
"""
|
||||
@@ -184,8 +177,8 @@ def replace_with_livecarta_anchor_tag(anchor, i):
|
||||
return new_tag
|
||||
|
||||
|
||||
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[
|
||||
list, list, list]:
|
||||
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
|
||||
-> Tuple[list, list, list]:
|
||||
"""
|
||||
    This function should run earlier than the font-adding step in the pipeline.
|
||||
|
||||
@@ -248,6 +241,23 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
|
||||
|
||||
def unwrap_structural_tags(body_tag):
|
||||
"""
|
||||
Main function that works with structure of html.
|
||||
Make changes inplace.
|
||||
|
||||
1. Extracts tags that are not needed
|
||||
|
||||
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
|
||||
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
|
||||
This tag must have a body_tag as a parent.
|
||||
Otherwise, it is wrapped with some tags. Like:
|
||||
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
|
||||
|
||||
3. Headings that are not supported by livecarta converts to <p>
|
||||
4. Wrapping NavigableString
|
||||
:param body_tag: Tag, soup object
|
||||
:return: None
|
||||
"""
|
||||
|
||||
def _preserve_class_in_aside_tag(tag_):
|
||||
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
|
||||
@@ -362,8 +372,9 @@ def unwrap_structural_tags(body_tag):
|
||||
parents_marks_are_body = [x.parent == body_tag for x in marks]
|
||||
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
||||
|
||||
_preprocessing_headings(body_tag)
|
||||
_heading_tag2p_tag(body_tag)
|
||||
|
||||
# wrap NavigableString with <p>
|
||||
for node in body_tag:
|
||||
if isinstance(node, NavigableString):
|
||||
content = str(node)
|
||||
@@ -378,19 +389,28 @@ def unwrap_structural_tags(body_tag):
|
||||
|
||||
|
||||
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
||||
"""
|
||||
After processing on a first_id that corresponds to current chapter,
|
||||
from initial html_soup all tags from current chapter are extracted
|
||||
|
||||
:param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
||||
:param href: name of current chapter's file
|
||||
:param html_soup: soup object of current file
|
||||
:return: list [Tag, NavigableString]; chapter's tags
|
||||
"""
|
||||
marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
|
||||
if marked_tags:
|
||||
next_tag = marked_tags.next_sibling
|
||||
tags = []
|
||||
while next_tag:
|
||||
            # TODO: why do we have a NavigableString here?
|
||||
|
||||
if not isinstance(next_tag, NavigableString) and\
|
||||
(next_tag.attrs.get('class') == 'converter-chapter-mark'):
|
||||
break
|
||||
tags.append(next_tag)
|
||||
next_tag = next_tag.next_sibling
|
||||
|
||||
# remove tags between first_id and next found id
|
||||
# save them in list for next steps
|
||||
tags = [tag.extract() for tag in tags]
|
||||
html_soup.smooth()
|
||||
|
||||
@@ -513,6 +533,14 @@ def preprocess_code_tags(chapter_tag):
|
||||
|
||||
|
||||
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
|
||||
"""
|
||||
Final processing/cleaning function.
|
||||
|
||||
:param title: title of the chapter
|
||||
:param chapter_tag: soup object
|
||||
:param remove_title_from_chapter: bool
|
||||
:return: tuple[str, str]
|
||||
"""
|
||||
title_str = BeautifulSoup(title, features='lxml').string
|
||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||
|
||||
Reference in New Issue
Block a user