Wrote documentation for every function and class in the .py files

This commit is contained in:
Kiryl
2021-12-10 10:53:40 +03:00
parent ef3502cd0a
commit 4b1109e6b4
13 changed files with 198 additions and 172 deletions

View File

@@ -12,6 +12,7 @@ from src.book_solver import BookSolver
class DocxBook(BookSolver):
"""Class of .docx type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, html_path=None,
main_logger=None, libra_locker=None):
@@ -30,9 +31,7 @@ class DocxBook(BookSolver):
self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)
def convert_doc_to_html(self):
"""
Method for convert .docx document to .html file.
"""
"""Method for convert .docx document to .html file."""
self.logger_object.log(f'File - {self.file_path}.')
print(f'{self.file_path}')
self.logger_object.log('Beginning of conversion from .docx to .html.')
@@ -92,9 +91,7 @@ class DocxBook(BookSolver):
self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
def read_html(self):
"""
Method for reading .html file into beautiful soup tag.
"""
"""Method for reading .html file into beautiful soup tag."""
try:
html_text = open(self.html_path, 'r', encoding='utf8').read()
self.logger_object.log('HTML for book has been loaded.')
@@ -130,7 +127,6 @@ class DocxBook(BookSolver):
1. Convert docx to html with libra office
2. Parse and clean html, get list of tags, get footnotes
3. Parse from line structure to nested structure with JSONConverter
"""
self.convert_doc_to_html()
self.check_output_directory()

View File

@@ -35,9 +35,7 @@ class HTMLDocxPreprocessor:
tag.unwrap()
def _clean_underline_links(self):
"""
Function cleans meaningless <u> tags before links.
"""
"""Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u")
for u in underlines:
if u.find_all('a'):
@@ -79,9 +77,7 @@ class HTMLDocxPreprocessor:
return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self):
"""
Function to convert <font> tag to <span>. If font style is default, then remove this tag.
"""
"""Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
fonts = self.body_tag.find_all("font")
for font in fonts:
face = font.get("face")
@@ -119,9 +115,7 @@ class HTMLDocxPreprocessor:
self.content = self.content[ind:]
def clean_trash(self):
"""
Function to remove all styles and tags we don't need.
"""
"""Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
@@ -140,9 +134,7 @@ class HTMLDocxPreprocessor:
table.decompose()
def _process_paragraph(self):
"""
Function to process <p> tags (text-align and text-indent value).
"""
"""Function to process <p> tags (text-align and text-indent value)."""
paragraphs = self.body_tag.find_all('p')
for p in paragraphs:
@@ -193,9 +185,7 @@ class HTMLDocxPreprocessor:
p.attrs['style'] = style
def _process_two_columns(self):
"""
Function to process paragraphs which has two columns layout.
"""
"""Function to process paragraphs which has two columns layout."""
two_columns = self.body_tag.find_all("div", style="column-count: 2")
for div in two_columns:
for child in div.children:
@@ -204,9 +194,7 @@ class HTMLDocxPreprocessor:
div.unwrap()
def _process_tables(self):
"""
Function to process tables. Set "border" attribute.
"""
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
@@ -296,9 +284,7 @@ class HTMLDocxPreprocessor:
return content.strip()
def _process_footnotes(self):
"""
Function returns list of footnotes and delete them from html_soup.
"""
"""Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors)
@@ -404,9 +390,7 @@ class HTMLDocxPreprocessor:
div.decompose()
def _process_div(self):
"""
Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.
"""
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div")
for div in divs:
@@ -423,9 +407,7 @@ class HTMLDocxPreprocessor:
return len(toc_links) > 0
def _process_toc_links(self):
"""
Function to extract nodes which contains TOC links, remove links from file and detect headers.
"""
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
@@ -448,13 +430,11 @@ class HTMLDocxPreprocessor:
@staticmethod
def clean_title_from_numbering(title: str):
"""
Function to remove digits from headers.
"""
"""Function to remove digits from headers."""
title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
@staticmethod
@@ -485,9 +465,7 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self):
"""
Function to convert all lower level headings to p tags
"""
"""Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
@@ -561,9 +539,7 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self):
"""
Function to process tags <h>.
"""
"""Function to process tags <h>."""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span>
@@ -634,9 +610,7 @@ class HTMLDocxPreprocessor:
il_tag.p.unwrap()
def process_html(self, access, html_path, book_id):
"""
Process html code to satisfy LiveCarta formatting.
"""
"""Process html code to satisfy LiveCarta formatting."""
try:
self.logger_object.log(f'Processing TOC and headers.')
self._process_toc_links()

View File

@@ -90,9 +90,7 @@ class LibraHTML2JSONConverter:
return True
def convert_to_dict(self):
"""
Function which convert list of html nodes to appropriate json structure.
"""
"""Function which convert list of html nodes to appropriate json structure."""
json_strc = []
ind = 0
ch_num = 0