epub converter: prettified and comments added

This commit is contained in:
shirshasa
2021-09-01 16:46:59 +03:00
parent c4c776ea3e
commit 50193eb25b

View File

@@ -63,10 +63,6 @@ def update_src_links_in_images(body_tag: Tag,
return path2aws_path
def preprocess_figure():
pass
def preprocess_table(body_tag: BeautifulSoup):
tables = body_tag.find_all("table")
for table in tables:
@@ -81,10 +77,7 @@ def preprocess_table(body_tag: BeautifulSoup):
units = width_match.group(2)
width = size+'px'
width = td.get('width') or width
if width:
td.attrs['width'] = width
td.attrs['width'] = td.get('width') or width
if td.attrs.get('style'):
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
@@ -151,7 +144,7 @@ def clean_headings_content(content: Tag, title: str):
break
def _preprocessing_headings(body_tag):
def _heading_tag2p_tag(body_tag):
"""
Function to convert all lower level headings to p tags
"""
@@ -184,8 +177,8 @@ def replace_with_livecarta_anchor_tag(anchor, i):
return new_tag
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') -> Tuple[
list, list, list]:
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]:
"""
This function should run earlier than adding fonts in the pipeline.
@@ -248,6 +241,23 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
def unwrap_structural_tags(body_tag):
"""
Main function that works with the structure of the html.
Makes changes in place.
1. Extracts tags that are not needed
2. Checks that the marks pointing to the start of a chapter are placed on one level in the html tree.
A mark is a tag with 'class': 'converter-chapter-mark'. It is added while the TOC is parsed.
This tag must have body_tag as its parent.
Otherwise, it is wrapped with some tags, like:
<p> <span id='123' class='converter-chapter-mark'> </span> </p>
3. Converts headings that are not supported by livecarta to <p>
4. Wrapping NavigableString
:param body_tag: Tag, soup object
:return: None
"""
def _preserve_class_in_aside_tag(tag_):
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
@@ -362,8 +372,9 @@ def unwrap_structural_tags(body_tag):
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
_preprocessing_headings(body_tag)
_heading_tag2p_tag(body_tag)
# wrap NavigableString with <p>
for node in body_tag:
if isinstance(node, NavigableString):
content = str(node)
@@ -378,19 +389,28 @@ def unwrap_structural_tags(body_tag):
def get_tags_between_chapter_marks(first_id, href, html_soup):
"""
After processing of the first_id that corresponds to the current chapter,
all tags of the current chapter are extracted from the initial html_soup
:param first_id: id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
:param href: name of current chapter's file
:param html_soup: soup object of current file
:return: list [Tag, NavigableString]; chapter's tags
"""
marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
while next_tag:
# TODO: why do we have NavigableString here
if not isinstance(next_tag, NavigableString) and\
(next_tag.attrs.get('class') == 'converter-chapter-mark'):
break
tags.append(next_tag)
next_tag = next_tag.next_sibling
# remove tags between first_id and next found id
# save them in list for next steps
tags = [tag.extract() for tag in tags]
html_soup.smooth()
@@ -513,6 +533,14 @@ def preprocess_code_tags(chapter_tag):
def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
"""
Final processing/cleaning function.
:param title: title of the chapter
:param chapter_tag: soup object
:param remove_title_from_chapter: bool
:return: tuple[str, str]
"""
title_str = BeautifulSoup(title, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip()