Change documentation

This commit is contained in:
Kiryl
2022-03-28 13:24:52 +03:00
parent 2798a93def
commit 8473ff443a

View File

@@ -10,7 +10,7 @@ from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id): def save_image_locally(img_file_path, img_content, book_id):
""" Function saves all images locally """ """Function saves all images locally"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join( new_path = pathlib.Path(os.path.join(
folder_path, f'../json/img_{book_id}/')) folder_path, f'../json/img_{book_id}/'))
@@ -25,7 +25,7 @@ def save_image_locally(img_file_path, img_content, book_id):
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Upload a single image through ``access`` and return the resulting link.

    Parameters
    ----------
    access : Access
        Object whose ``send_image`` method performs the upload.
    img_file_path :
        Path of the image; passed to ``send_image`` as its first positional argument.
    img_content : bytes
        Raw image data, forwarded as ``img_content``.
    book_id :
        Book identifier, forwarded as ``doc_id``.

    Returns
    -------
    The value returned by ``access.send_image``
        Presumably the AWS link of the stored image -- confirm against ``Access``.
    """
    return access.send_image(
        img_file_path,
        doc_id=book_id,
        img_content=img_content,
    )
@@ -37,7 +37,7 @@ def update_images_src_links(body_tag: Tag,
access=None, access=None,
path2aws_path=None, path2aws_path=None,
book_id=None): book_id=None):
""" Function makes dictionary image_src_path -> Amazon web service_path """ """Function makes dictionary image_src_path -> Amazon web service_path"""
img_tags = body_tag.find_all('img') img_tags = body_tag.find_all('img')
for img in img_tags: for img in img_tags:
@@ -72,7 +72,7 @@ def update_images_src_links(body_tag: Tag,
def preprocess_table(body_tag: BeautifulSoup): def preprocess_table(body_tag: BeautifulSoup):
""" Function to preprocess tables and tags(td|th|tr): style """ """Function to preprocess tables and tags(td|th|tr): style"""
tables = body_tag.find_all("table") tables = body_tag.find_all("table")
for table in tables: for table in tables:
ts = table.find_all(re.compile("td|th|tr")) ts = table.find_all(re.compile("td|th|tr"))
@@ -84,13 +84,13 @@ def preprocess_table(body_tag: BeautifulSoup):
r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
if width_match: if width_match:
size = width_match.group(1) size = width_match.group(1)
units = width_match.group(2)
width = size+'px' width = size+'px'
t_tag.attrs['width'] = t_tag.get('width') or width t_tag.attrs['width'] = t_tag.get('width') or width
if t_tag.attrs.get('style'): if t_tag.attrs.get('style'):
t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '') t_tag.attrs['style'] = t_tag.attrs['style'].replace(
'border:0;', '')
elif t_tag.attrs.get('style') == '': elif t_tag.attrs.get('style') == '':
del t_tag.attrs['style'] del t_tag.attrs['style']
@@ -113,7 +113,7 @@ def process_lists(body_tag):
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
""" Function inserts span before tag to be removed(aren't supported by livecarta) """ """Function inserts span before tag to be removed(aren't supported by livecarta)"""
new_tag = main_tag.new_tag("span") new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or '' new_tag.attrs['id'] = id_ or ''
new_tag.attrs['class'] = class_ or '' new_tag.attrs['class'] = class_ or ''
@@ -157,7 +157,7 @@ def clean_headings_content(content: Tag, title: str):
def heading_tag_to_p_tag(body_tag): def heading_tag_to_p_tag(body_tag):
""" Function to convert all lower level headings to p tags """ """Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern)) header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags: for tag in header_tags:
@@ -165,7 +165,7 @@ def heading_tag_to_p_tag(body_tag):
def clean_title_from_numbering(title: str): def clean_title_from_numbering(title: str):
""" Function removes numbering from titles """ """Function removes numbering from titles"""
title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
@@ -174,7 +174,7 @@ def clean_title_from_numbering(title: str):
def replace_with_livecarta_anchor_tag(anchor, i): def replace_with_livecarta_anchor_tag(anchor, i):
""" Function replace noteref_tag(anchor) with new livecarta tag """ """Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1 new_tag['data-id'] = i + 1
@@ -194,7 +194,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p> <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside> <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
""" """
footnotes = [] footnotes = []
noterefs_tags = source_html_tag.find_all( noterefs_tags = source_html_tag.find_all(
attrs={noteref_attr_name: 'noteref'}) attrs={noteref_attr_name: 'noteref'})
@@ -207,13 +207,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
[tag.decompose() for tag in bad_noterefs_tags] [tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str) -> Tuple[str, str]:
    """Split the href of an ``<a>`` tag into file name and anchor id.

    Parameters
    ----------
    s : str
        Href value; it must contain ``#``.

    Returns
    -------
    tuple of (str, str)
        The text before the first ``#`` (empty for a same-file link such as
        ``#n1``) and the text after it.

    Raises
    ------
    AssertionError
        If ``s`` contains no ``#``.
    """
    # Raised explicitly instead of via ``assert`` so the check is not stripped
    # under ``python -O``; the exception type and message stay the same.
    if '#' not in s:
        raise AssertionError(
            f'Error. Unexpected href: {s} in a tag. Href must contain an id.')
    # Split on the first '#' only: with a plain ``split('#')`` a second '#'
    # would break the two-value unpacking with an unrelated ValueError.
    file_name, _, id_ = s.partition('#')
    return file_name, id_
def verify_footnote_tag(tags: list): def verify_footnote_tag(tags: list):
""" Function verifies is tag - footnote """ """Function verifies is tag - footnote"""
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0: if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id)) anchored_tags = list(target_html_tag.find_all(id=element_id))
@@ -259,9 +259,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
def unwrap_structural_tags(body_tag): def unwrap_structural_tags(body_tag):
""" """Main function that works with structure of html. Make changes inplace.
Main function that works with structure of html.
Make changes inplace.
1. Extracts tags that are not needed 1. Extracts tags that are not needed
@@ -278,7 +276,7 @@ def unwrap_structural_tags(body_tag):
""" """
def _preserve_class_in_aside_tag(tag_): def _preserve_class_in_aside_tag(tag_):
""" to save css style inherited from class, copy class to aside tag (which is parent to tag_) """ """to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
# this is for Wiley books with boxes # this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance( tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0] tag_.attrs['class'], list) else tag_.attrs['class'][0]
@@ -434,14 +432,23 @@ def unwrap_structural_tags(body_tag):
def get_tags_between_chapter_marks(first_id, href, html_soup): def get_tags_between_chapter_marks(first_id, href, html_soup):
""" """After processing on a first_id that corresponds to current chapter,
After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted from initial html_soup all tags from current chapter are extracted
:param first_id: id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' Parameters
:param href: name of current chapter's file ----------
:param html_soup: soup object of current file first_id :
:return: list [Tag, NavigableString]; chapter's tags Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
href :
Name of current chapter's file
html_soup :
Soup object of current file
Returns
-------
tags : list [Tag, NavigableString]
Chapter's tags
""" """
marked_tags = html_soup.find( marked_tags = html_soup.find(
attrs={'id': first_id, 'class': 'converter-chapter-mark'}) attrs={'id': first_id, 'class': 'converter-chapter-mark'})
@@ -467,7 +474,7 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
""" Function wraps <block> with <table> """ """Function wraps <block> with <table>"""
table = main_tag.new_tag("table") table = main_tag.new_tag("table")
table.attrs['border'] = border table.attrs['border'] = border
table.attrs['align'] = 'center' table.attrs['align'] = 'center'
@@ -497,7 +504,7 @@ def clean_wiley_block(block):
def preprocess_block_tags(chapter_tag): def preprocess_block_tags(chapter_tag):
""" Function preprocessing <block> tags """ """Function preprocessing <block> tags"""
for block in chapter_tag.find_all("blockquote"): for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
clean_wiley_block(block) clean_wiley_block(block)
@@ -520,7 +527,7 @@ def preprocess_block_tags(chapter_tag):
def prepare_formatted(text): def prepare_formatted(text):
""" Function replaces special symbols with their Unicode representation """ """Function replaces special symbols with their Unicode representation"""
text = text.replace("<", "\x3C") text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E") text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp; text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
@@ -530,7 +537,7 @@ def prepare_formatted(text):
def wrap_preformatted_span_with_table(main_tag, old_tag): def wrap_preformatted_span_with_table(main_tag, old_tag):
""" Function wraps <span> with <table> """ """Function wraps <span> with <table>"""
table = main_tag.new_tag("table") table = main_tag.new_tag("table")
table.attrs['border'] = '1px #ccc;' table.attrs['border'] = '1px #ccc;'
table.attrs['style'] = 'width:100%;' table.attrs['style'] = 'width:100%;'
@@ -547,7 +554,7 @@ def wrap_preformatted_span_with_table(main_tag, old_tag):
def preprocess_pre_tags(chapter_tag): def preprocess_pre_tags(chapter_tag):
""" Function preprocessing <pre> tags """ """Function preprocessing <pre> tags"""
for pre in chapter_tag.find_all("pre"): for pre in chapter_tag.find_all("pre"):
new_tag = BeautifulSoup(features='lxml').new_tag("span") new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy() new_tag.attrs = pre.attrs.copy()
@@ -586,7 +593,7 @@ def preprocess_pre_tags(chapter_tag):
def preprocess_code_tags(chapter_tag): def preprocess_code_tags(chapter_tag):
""" Function that emulates style of <code>, <kdb>, <var> """ """Function that emulates style of <code>, <kdb>, <var>"""
for code in chapter_tag.find_all(re.compile("code|kdb|var")): for code in chapter_tag.find_all(re.compile("code|kdb|var")):
code.name = 'span' code.name = 'span'
if code.parent.name == "pre": if code.parent.name == "pre":
@@ -595,7 +602,7 @@ def preprocess_code_tags(chapter_tag):
def prepare_title(title_of_chapter: str) -> str: def prepare_title(title_of_chapter: str) -> str:
""" Function finalise processing/cleaning title """ """Function finalise processing/cleaning title"""
title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = BeautifulSoup(title_of_chapter, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip() title_str = re.sub(r' +', ' ', title_str).rstrip()
@@ -604,12 +611,27 @@ def prepare_title(title_of_chapter: str) -> str:
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
""" """Function finalise processing/cleaning content
Function finalise processing/cleaning content Parameters
----------
title_str : str
content_tag : BeautifulSoup
remove_title_from_chapter : bool
Steps
----------
1. cleaning \n 1. cleaning \n
2. heading removal 2. heading removal
3. processing tags 3. processing tags
4. class removal 4. class removal
Returns
-------
str
Prepared content
""" """
# 0. cleaning \n # 0. cleaning \n
to_remove = [] to_remove = []