Wrote documentation for every function/class in the .py files

This commit is contained in:
Kiryl
2021-12-10 10:53:40 +03:00
parent ef3502cd0a
commit 4b1109e6b4
13 changed files with 198 additions and 172 deletions

View File

@@ -35,9 +35,7 @@ class HTMLDocxPreprocessor:
tag.unwrap()
def _clean_underline_links(self):
"""
Function cleans meaningless <u> tags before links.
"""
"""Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u")
for u in underlines:
if u.find_all('a'):
@@ -79,9 +77,7 @@ class HTMLDocxPreprocessor:
return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self):
"""
Function to convert <font> tag to <span>. If font style is default, then remove this tag.
"""
"""Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
fonts = self.body_tag.find_all("font")
for font in fonts:
face = font.get("face")
@@ -119,9 +115,7 @@ class HTMLDocxPreprocessor:
self.content = self.content[ind:]
def clean_trash(self):
"""
Function to remove all styles and tags we don't need.
"""
"""Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for other languages
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
@@ -140,9 +134,7 @@ class HTMLDocxPreprocessor:
table.decompose()
def _process_paragraph(self):
"""
Function to process <p> tags (text-align and text-indent value).
"""
"""Function to process <p> tags (text-align and text-indent value)."""
paragraphs = self.body_tag.find_all('p')
for p in paragraphs:
@@ -193,9 +185,7 @@ class HTMLDocxPreprocessor:
p.attrs['style'] = style
def _process_two_columns(self):
"""
Function to process paragraphs which have a two-column layout.
"""
"""Function to process paragraphs which have a two-column layout."""
two_columns = self.body_tag.find_all("div", style="column-count: 2")
for div in two_columns:
for child in div.children:
@@ -204,9 +194,7 @@ class HTMLDocxPreprocessor:
div.unwrap()
def _process_tables(self):
"""
Function to process tables. Set "border" attribute.
"""
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
@@ -296,9 +284,7 @@ class HTMLDocxPreprocessor:
return content.strip()
def _process_footnotes(self):
"""
Function returns the list of footnotes and deletes them from html_soup.
"""
"""Function returns the list of footnotes and deletes them from html_soup."""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors)
@@ -404,9 +390,7 @@ class HTMLDocxPreprocessor:
div.decompose()
def _process_div(self):
"""
Function to process <div> tags. The tags themselves are removed from the file; their content is kept.
"""
"""Function to process <div> tags. The tags themselves are removed from the file; their content is kept."""
divs = self.body_tag.find_all("div")
for div in divs:
@@ -423,9 +407,7 @@ class HTMLDocxPreprocessor:
return len(toc_links) > 0
def _process_toc_links(self):
"""
Function to extract nodes which contain TOC links, remove the links from the file and detect headers.
"""
"""Function to extract nodes which contain TOC links, remove the links from the file and detect headers."""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
@@ -448,13 +430,11 @@ class HTMLDocxPreprocessor:
@staticmethod
def clean_title_from_numbering(title: str):
"""
Function to remove digits from headers.
"""
"""Function to remove digits from headers."""
title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # strip leading single-letter enumerators such as "A. " or "b." from the title
return title
@staticmethod
@@ -485,9 +465,7 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self):
"""
Function to convert all lower-level headings to <p> tags.
"""
"""Function to convert all lower-level headings to <p> tags."""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
@@ -561,9 +539,7 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self):
"""
Function to process heading tags (<h1>-<h9>).
"""
"""Function to process heading tags (<h1>-<h9>)."""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span>
@@ -634,9 +610,7 @@ class HTMLDocxPreprocessor:
il_tag.p.unwrap()
def process_html(self, access, html_path, book_id):
"""
Process html code to satisfy LiveCarta formatting.
"""
"""Process html code to satisfy LiveCarta formatting."""
try:
self.logger_object.log(f'Processing TOC and headers.')
self._process_toc_links()