forked from LiveCarta/BookConverter
Change documentation
This commit is contained in:
@@ -10,7 +10,7 @@ from src.livecarta_config import LiveCartaConfig
|
|||||||
|
|
||||||
|
|
||||||
def save_image_locally(img_file_path, img_content, book_id):
|
def save_image_locally(img_file_path, img_content, book_id):
|
||||||
""" Function saves all images locally """
|
"""Function saves all images locally"""
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
new_path = pathlib.Path(os.path.join(
|
new_path = pathlib.Path(os.path.join(
|
||||||
folder_path, f'../json/img_{book_id}/'))
|
folder_path, f'../json/img_{book_id}/'))
|
||||||
@@ -25,7 +25,7 @@ def save_image_locally(img_file_path, img_content, book_id):
|
|||||||
|
|
||||||
|
|
||||||
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
|
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
|
||||||
""" Function saves all images to Amazon web service """
|
"""Function saves all images to Amazon web service"""
|
||||||
link_path = access.send_image(
|
link_path = access.send_image(
|
||||||
img_file_path, doc_id=book_id, img_content=img_content)
|
img_file_path, doc_id=book_id, img_content=img_content)
|
||||||
return link_path
|
return link_path
|
||||||
@@ -37,7 +37,7 @@ def update_images_src_links(body_tag: Tag,
|
|||||||
access=None,
|
access=None,
|
||||||
path2aws_path=None,
|
path2aws_path=None,
|
||||||
book_id=None):
|
book_id=None):
|
||||||
""" Function makes dictionary image_src_path -> Amazon web service_path """
|
"""Function makes dictionary image_src_path -> Amazon web service_path"""
|
||||||
img_tags = body_tag.find_all('img')
|
img_tags = body_tag.find_all('img')
|
||||||
|
|
||||||
for img in img_tags:
|
for img in img_tags:
|
||||||
@@ -72,7 +72,7 @@ def update_images_src_links(body_tag: Tag,
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_table(body_tag: BeautifulSoup):
|
def preprocess_table(body_tag: BeautifulSoup):
|
||||||
""" Function to preprocess tables and tags(td|th|tr): style """
|
"""Function to preprocess tables and tags(td|th|tr): style"""
|
||||||
tables = body_tag.find_all("table")
|
tables = body_tag.find_all("table")
|
||||||
for table in tables:
|
for table in tables:
|
||||||
ts = table.find_all(re.compile("td|th|tr"))
|
ts = table.find_all(re.compile("td|th|tr"))
|
||||||
@@ -84,13 +84,13 @@ def preprocess_table(body_tag: BeautifulSoup):
|
|||||||
r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
|
r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
|
||||||
if width_match:
|
if width_match:
|
||||||
size = width_match.group(1)
|
size = width_match.group(1)
|
||||||
units = width_match.group(2)
|
|
||||||
width = size+'px'
|
width = size+'px'
|
||||||
|
|
||||||
t_tag.attrs['width'] = t_tag.get('width') or width
|
t_tag.attrs['width'] = t_tag.get('width') or width
|
||||||
|
|
||||||
if t_tag.attrs.get('style'):
|
if t_tag.attrs.get('style'):
|
||||||
t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '')
|
t_tag.attrs['style'] = t_tag.attrs['style'].replace(
|
||||||
|
'border:0;', '')
|
||||||
|
|
||||||
elif t_tag.attrs.get('style') == '':
|
elif t_tag.attrs.get('style') == '':
|
||||||
del t_tag.attrs['style']
|
del t_tag.attrs['style']
|
||||||
@@ -113,7 +113,7 @@ def process_lists(body_tag):
|
|||||||
|
|
||||||
|
|
||||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||||
""" Function inserts span before tag to be removed(aren't supported by livecarta) """
|
"""Function inserts span before tag to be removed(aren't supported by livecarta)"""
|
||||||
new_tag = main_tag.new_tag("span")
|
new_tag = main_tag.new_tag("span")
|
||||||
new_tag.attrs['id'] = id_ or ''
|
new_tag.attrs['id'] = id_ or ''
|
||||||
new_tag.attrs['class'] = class_ or ''
|
new_tag.attrs['class'] = class_ or ''
|
||||||
@@ -157,7 +157,7 @@ def clean_headings_content(content: Tag, title: str):
|
|||||||
|
|
||||||
|
|
||||||
def heading_tag_to_p_tag(body_tag):
|
def heading_tag_to_p_tag(body_tag):
|
||||||
""" Function to convert all lower level headings to p tags """
|
"""Function to convert all lower level headings to p tags"""
|
||||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||||
header_tags = body_tag.find_all(re.compile(pattern))
|
header_tags = body_tag.find_all(re.compile(pattern))
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
@@ -165,7 +165,7 @@ def heading_tag_to_p_tag(body_tag):
|
|||||||
|
|
||||||
|
|
||||||
def clean_title_from_numbering(title: str):
|
def clean_title_from_numbering(title: str):
|
||||||
""" Function removes numbering from titles """
|
"""Function removes numbering from titles"""
|
||||||
title = re.sub(r'^(\s+)+', '', title)
|
title = re.sub(r'^(\s+)+', '', title)
|
||||||
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
|
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
|
||||||
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
|
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
|
||||||
@@ -174,7 +174,7 @@ def clean_title_from_numbering(title: str):
|
|||||||
|
|
||||||
|
|
||||||
def replace_with_livecarta_anchor_tag(anchor, i):
|
def replace_with_livecarta_anchor_tag(anchor, i):
|
||||||
""" Function replace noteref_tag(anchor) with new livecarta tag """
|
"""Function replace noteref_tag(anchor) with new livecarta tag"""
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||||
new_tag['class'] = 'footnote-element'
|
new_tag['class'] = 'footnote-element'
|
||||||
new_tag['data-id'] = i + 1
|
new_tag['data-id'] = i + 1
|
||||||
@@ -194,7 +194,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
|
|
||||||
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
||||||
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
||||||
"""
|
"""
|
||||||
footnotes = []
|
footnotes = []
|
||||||
noterefs_tags = source_html_tag.find_all(
|
noterefs_tags = source_html_tag.find_all(
|
||||||
attrs={noteref_attr_name: 'noteref'})
|
attrs={noteref_attr_name: 'noteref'})
|
||||||
@@ -207,13 +207,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
[tag.decompose() for tag in bad_noterefs_tags]
|
[tag.decompose() for tag in bad_noterefs_tags]
|
||||||
|
|
||||||
def parse_a_tag_href(s: str) -> Tuple[str, str]:
|
def parse_a_tag_href(s: str) -> Tuple[str, str]:
|
||||||
""" Returns name of file & id of an anchor """
|
"""Returns name of file & id of an anchor"""
|
||||||
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
|
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
|
||||||
f, id_ = s.split('#')
|
f, id_ = s.split('#')
|
||||||
return f, id_
|
return f, id_
|
||||||
|
|
||||||
def verify_footnote_tag(tags: list):
|
def verify_footnote_tag(tags: list):
|
||||||
""" Function verifies is tag - footnote """
|
"""Function verifies is tag - footnote"""
|
||||||
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
|
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
|
||||||
if len(tags) == 0:
|
if len(tags) == 0:
|
||||||
anchored_tags = list(target_html_tag.find_all(id=element_id))
|
anchored_tags = list(target_html_tag.find_all(id=element_id))
|
||||||
@@ -259,9 +259,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
|
|
||||||
|
|
||||||
def unwrap_structural_tags(body_tag):
|
def unwrap_structural_tags(body_tag):
|
||||||
"""
|
"""Main function that works with structure of html. Make changes inplace.
|
||||||
Main function that works with structure of html.
|
|
||||||
Make changes inplace.
|
|
||||||
|
|
||||||
1. Extracts tags that are not needed
|
1. Extracts tags that are not needed
|
||||||
|
|
||||||
@@ -278,7 +276,7 @@ def unwrap_structural_tags(body_tag):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def _preserve_class_in_aside_tag(tag_):
|
def _preserve_class_in_aside_tag(tag_):
|
||||||
""" to save css style inherited from class, copy class to aside tag (which is parent to tag_) """
|
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
||||||
# this is for Wiley books with boxes
|
# this is for Wiley books with boxes
|
||||||
tag_class = tag_.attrs['class'] if not isinstance(
|
tag_class = tag_.attrs['class'] if not isinstance(
|
||||||
tag_.attrs['class'], list) else tag_.attrs['class'][0]
|
tag_.attrs['class'], list) else tag_.attrs['class'][0]
|
||||||
@@ -434,14 +432,23 @@ def unwrap_structural_tags(body_tag):
|
|||||||
|
|
||||||
|
|
||||||
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
||||||
"""
|
"""After processing on a first_id that corresponds to current chapter,
|
||||||
After processing on a first_id that corresponds to current chapter,
|
|
||||||
from initial html_soup all tags from current chapter are extracted
|
from initial html_soup all tags from current chapter are extracted
|
||||||
|
|
||||||
:param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
Parameters
|
||||||
:param href: name of current chapter's file
|
----------
|
||||||
:param html_soup: soup object of current file
|
first_id :
|
||||||
:return: list [Tag, NavigableString]; chapter's tags
|
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
||||||
|
href :
|
||||||
|
Name of current chapter's file
|
||||||
|
html_soup :
|
||||||
|
Soup object of current file
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tags : list [Tag, NavigableString]
|
||||||
|
Chapter's tags
|
||||||
|
|
||||||
"""
|
"""
|
||||||
marked_tags = html_soup.find(
|
marked_tags = html_soup.find(
|
||||||
attrs={'id': first_id, 'class': 'converter-chapter-mark'})
|
attrs={'id': first_id, 'class': 'converter-chapter-mark'})
|
||||||
@@ -467,7 +474,7 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
|
|||||||
|
|
||||||
|
|
||||||
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
|
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
|
||||||
""" Function wraps <block> with <table> """
|
"""Function wraps <block> with <table>"""
|
||||||
table = main_tag.new_tag("table")
|
table = main_tag.new_tag("table")
|
||||||
table.attrs['border'] = border
|
table.attrs['border'] = border
|
||||||
table.attrs['align'] = 'center'
|
table.attrs['align'] = 'center'
|
||||||
@@ -497,7 +504,7 @@ def clean_wiley_block(block):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_block_tags(chapter_tag):
|
def preprocess_block_tags(chapter_tag):
|
||||||
""" Function preprocessing <block> tags """
|
"""Function preprocessing <block> tags"""
|
||||||
for block in chapter_tag.find_all("blockquote"):
|
for block in chapter_tag.find_all("blockquote"):
|
||||||
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
||||||
clean_wiley_block(block)
|
clean_wiley_block(block)
|
||||||
@@ -520,7 +527,7 @@ def preprocess_block_tags(chapter_tag):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_formatted(text):
|
def prepare_formatted(text):
|
||||||
""" Function replaces special symbols with their Unicode representation """
|
"""Function replaces special symbols with their Unicode representation"""
|
||||||
text = text.replace("<", "\x3C")
|
text = text.replace("<", "\x3C")
|
||||||
text = text.replace(">", "\x3E")
|
text = text.replace(">", "\x3E")
|
||||||
text = text.replace('\t', "\xa0 \xa0 ") #
|
text = text.replace('\t', "\xa0 \xa0 ") #
|
||||||
@@ -530,7 +537,7 @@ def prepare_formatted(text):
|
|||||||
|
|
||||||
|
|
||||||
def wrap_preformatted_span_with_table(main_tag, old_tag):
|
def wrap_preformatted_span_with_table(main_tag, old_tag):
|
||||||
""" Function wraps <span> with <table> """
|
"""Function wraps <span> with <table>"""
|
||||||
table = main_tag.new_tag("table")
|
table = main_tag.new_tag("table")
|
||||||
table.attrs['border'] = '1px #ccc;'
|
table.attrs['border'] = '1px #ccc;'
|
||||||
table.attrs['style'] = 'width:100%;'
|
table.attrs['style'] = 'width:100%;'
|
||||||
@@ -547,7 +554,7 @@ def wrap_preformatted_span_with_table(main_tag, old_tag):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_pre_tags(chapter_tag):
|
def preprocess_pre_tags(chapter_tag):
|
||||||
""" Function preprocessing <pre> tags """
|
"""Function preprocessing <pre> tags"""
|
||||||
for pre in chapter_tag.find_all("pre"):
|
for pre in chapter_tag.find_all("pre"):
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
||||||
new_tag.attrs = pre.attrs.copy()
|
new_tag.attrs = pre.attrs.copy()
|
||||||
@@ -586,7 +593,7 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_code_tags(chapter_tag):
|
def preprocess_code_tags(chapter_tag):
|
||||||
""" Function that emulates style of <code>, <kdb>, <var> """
|
"""Function that emulates style of <code>, <kdb>, <var>"""
|
||||||
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
|
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
|
||||||
code.name = 'span'
|
code.name = 'span'
|
||||||
if code.parent.name == "pre":
|
if code.parent.name == "pre":
|
||||||
@@ -595,7 +602,7 @@ def preprocess_code_tags(chapter_tag):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_title(title_of_chapter: str) -> str:
|
def prepare_title(title_of_chapter: str) -> str:
|
||||||
""" Function finalise processing/cleaning title """
|
"""Function finalise processing/cleaning title"""
|
||||||
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
||||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||||
@@ -604,12 +611,27 @@ def prepare_title(title_of_chapter: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
||||||
"""
|
"""Function finalise processing/cleaning content
|
||||||
Function finalise processing/cleaning content
|
Parameters
|
||||||
|
----------
|
||||||
|
title_str : str
|
||||||
|
|
||||||
|
content_tag : BeautifulSoup
|
||||||
|
|
||||||
|
remove_title_from_chapter : bool
|
||||||
|
|
||||||
|
Steps
|
||||||
|
----------
|
||||||
1. cleaning \n
|
1. cleaning \n
|
||||||
2. heading removal
|
2. heading removal
|
||||||
3. processing tags
|
3. processing tags
|
||||||
4. class removal
|
4. class removal
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
Prepared content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# 0. cleaning \n
|
# 0. cleaning \n
|
||||||
to_remove = []
|
to_remove = []
|
||||||
|
|||||||
Reference in New Issue
Block a user