forked from LiveCarta/BookConverter
Wrote documentation for every func/class in .py
This commit is contained in:
@@ -35,9 +35,7 @@ class HTMLDocxPreprocessor:
|
||||
tag.unwrap()
|
||||
|
||||
def _clean_underline_links(self):
|
||||
"""
|
||||
Function cleans meaningless <u> tags before links.
|
||||
"""
|
||||
"""Function cleans meaningless <u> tags before links."""
|
||||
underlines = self.body_tag.find_all("u")
|
||||
for u in underlines:
|
||||
if u.find_all('a'):
|
||||
@@ -79,9 +77,7 @@ class HTMLDocxPreprocessor:
|
||||
return re.sub(size + "pt", str(new_size) + "px", style)
|
||||
|
||||
def _font_to_span(self):
|
||||
"""
|
||||
Function to convert <font> tag to <span>. If font style is default, then remove this tag.
|
||||
"""
|
||||
"""Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
|
||||
fonts = self.body_tag.find_all("font")
|
||||
for font in fonts:
|
||||
face = font.get("face")
|
||||
@@ -119,9 +115,7 @@ class HTMLDocxPreprocessor:
|
||||
self.content = self.content[ind:]
|
||||
|
||||
def clean_trash(self):
|
||||
"""
|
||||
Function to remove all styles and tags we don't need.
|
||||
"""
|
||||
"""Function to remove all styles and tags we don't need."""
|
||||
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
|
||||
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
|
||||
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
|
||||
@@ -140,9 +134,7 @@ class HTMLDocxPreprocessor:
|
||||
table.decompose()
|
||||
|
||||
def _process_paragraph(self):
|
||||
"""
|
||||
Function to process <p> tags (text-align and text-indent value).
|
||||
"""
|
||||
"""Function to process <p> tags (text-align and text-indent value)."""
|
||||
paragraphs = self.body_tag.find_all('p')
|
||||
|
||||
for p in paragraphs:
|
||||
@@ -193,9 +185,7 @@ class HTMLDocxPreprocessor:
|
||||
p.attrs['style'] = style
|
||||
|
||||
def _process_two_columns(self):
|
||||
"""
|
||||
Function to process paragraphs which has two columns layout.
|
||||
"""
|
||||
"""Function to process paragraphs which has two columns layout."""
|
||||
two_columns = self.body_tag.find_all("div", style="column-count: 2")
|
||||
for div in two_columns:
|
||||
for child in div.children:
|
||||
@@ -204,9 +194,7 @@ class HTMLDocxPreprocessor:
|
||||
div.unwrap()
|
||||
|
||||
def _process_tables(self):
|
||||
"""
|
||||
Function to process tables. Set "border" attribute.
|
||||
"""
|
||||
"""Function to process tables. Set "border" attribute."""
|
||||
tables = self.body_tag.find_all("table")
|
||||
for table in tables:
|
||||
tds = table.find_all("td")
|
||||
@@ -296,9 +284,7 @@ class HTMLDocxPreprocessor:
|
||||
return content.strip()
|
||||
|
||||
def _process_footnotes(self):
|
||||
"""
|
||||
Function returns list of footnotes and delete them from html_soup.
|
||||
"""
|
||||
"""Function returns list of footnotes and delete them from html_soup."""
|
||||
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
||||
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
|
||||
footnote_amt = len(footnote_anchors)
|
||||
@@ -404,9 +390,7 @@ class HTMLDocxPreprocessor:
|
||||
div.decompose()
|
||||
|
||||
def _process_div(self):
|
||||
"""
|
||||
Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.
|
||||
"""
|
||||
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
|
||||
divs = self.body_tag.find_all("div")
|
||||
|
||||
for div in divs:
|
||||
@@ -423,9 +407,7 @@ class HTMLDocxPreprocessor:
|
||||
return len(toc_links) > 0
|
||||
|
||||
def _process_toc_links(self):
|
||||
"""
|
||||
Function to extract nodes which contains TOC links, remove links from file and detect headers.
|
||||
"""
|
||||
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
|
||||
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
|
||||
headers = [link.parent for link in toc_links]
|
||||
outline_level = "1" # All the unknown outlines will be predicted as <h1>
|
||||
@@ -448,13 +430,11 @@ class HTMLDocxPreprocessor:
|
||||
|
||||
@staticmethod
|
||||
def clean_title_from_numbering(title: str):
|
||||
"""
|
||||
Function to remove digits from headers.
|
||||
"""
|
||||
"""Function to remove digits from headers."""
|
||||
title = re.sub(r'^(\s+)+', '', title)
|
||||
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
|
||||
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
|
||||
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
|
||||
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
|
||||
return title
|
||||
|
||||
@staticmethod
|
||||
@@ -485,9 +465,7 @@ class HTMLDocxPreprocessor:
|
||||
self.apply_func_to_last_child(children[0], func)
|
||||
|
||||
def _preprocessing_headings(self):
|
||||
"""
|
||||
Function to convert all lower level headings to p tags
|
||||
"""
|
||||
"""Function to convert all lower level headings to p tags"""
|
||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||
for tag in header_tags:
|
||||
@@ -561,9 +539,7 @@ class HTMLDocxPreprocessor:
|
||||
self.top_level_headers[i]['should_be_numbered'] = True
|
||||
|
||||
def _process_headings(self):
|
||||
"""
|
||||
Function to process tags <h>.
|
||||
"""
|
||||
"""Function to process tags <h>."""
|
||||
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
|
||||
|
||||
# 1. remove <b>, <span>
|
||||
@@ -634,9 +610,7 @@ class HTMLDocxPreprocessor:
|
||||
il_tag.p.unwrap()
|
||||
|
||||
def process_html(self, access, html_path, book_id):
|
||||
"""
|
||||
Process html code to satisfy LiveCarta formatting.
|
||||
"""
|
||||
"""Process html code to satisfy LiveCarta formatting."""
|
||||
try:
|
||||
self.logger_object.log(f'Processing TOC and headers.')
|
||||
self._process_toc_links()
|
||||
|
||||
Reference in New Issue
Block a user