Formatting

This commit is contained in:
Kiryl
2022-06-01 16:23:53 +03:00
parent 5039417a0f
commit c0ef0b6d6e
13 changed files with 318 additions and 185 deletions

View File

@@ -21,13 +21,22 @@ class HTMLDocxPreprocessor:
self.top_level_headers = None
self.content = list()
def _clean_tag(self, tag, attr_name, attr_value):
def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
"""
Function to clean tags by its name and attribute value.
Parameters
----------
tag: str
tag name to clean
attr_name: str
attribute name
attr_value: [str,re]
attribute value
Returns
-------
clean tag
:param tag: Tag name to clean.
:param attr_name: Attribute name.
:param attr_value: Attribute value.
"""
tags = self.body_tag.find_all(tag, {attr_name: attr_value})
for tag in tags:
@@ -56,12 +65,19 @@ class HTMLDocxPreprocessor:
return value
@classmethod
def convert_font_pt_to_px(cls, style):
def convert_font_pt_to_px(cls, style: str) -> str:
"""
Method converts point in the font-size to pixels.
Function converts point in the font-size to pixels.
Parameters
----------
style: str
str with style to process
Returns
-------
: str
str with converted style
:param style: Str with style to process.
:return: Str with converted style.
"""
size = re.search(r"font-size: (\d{1,3})pt", style)
@@ -77,7 +93,10 @@ class HTMLDocxPreprocessor:
return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self):
"""Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
"""
Function to convert <font> tag to <span>.
If font style is default, then remove this tag.
"""
fonts = self.body_tag.find_all("font")
for font in fonts:
face = font.get("face")
@@ -105,7 +124,8 @@ class HTMLDocxPreprocessor:
if len(font.attrs) == 0:
font.unwrap()
assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags
# on this step there should be no more <font> tags
assert len(self.body_tag.find_all("font")) == 0
def delete_content_before_toc(self):
# remove all tags above the <TOC>, only in content !!! body tag is not updated
@@ -116,11 +136,15 @@ class HTMLDocxPreprocessor:
def clean_trash(self):
"""Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
self._clean_tag('span', 'style', re.compile(
r'^background: #[0-9a-fA-F]{6}$'))
# todo: check for another languages
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
self._clean_tag('span', 'style', re.compile(
'^letter-spacing: -?[\d\.]+pt$'))
self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
self._clean_tag('font', 'face', re.compile(
r'^Times New Roman[\w, ]+$'))
self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links()
@@ -128,7 +152,8 @@ class HTMLDocxPreprocessor:
self._font_to_span()
# replace toc with empty <TOC> tag
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
tables = self.body_tag.find_all(
"div", id=re.compile(r'^Table of Contents\d+'))
for table in tables:
table.wrap(self.html_soup.new_tag("TOC"))
table.decompose()
@@ -138,7 +163,7 @@ class HTMLDocxPreprocessor:
paragraphs = self.body_tag.find_all('p')
for p in paragraphs:
# libra converts some \n into <p> with 2 </br>
# libre converts some \n into <p> with 2 </br>
# there we remove 1 unnecessary <br>
brs = p.find_all('br')
text = p.text
@@ -156,9 +181,11 @@ class HTMLDocxPreprocessor:
if style:
indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
margin_right = re.search(r'margin-right: ([\d\.]{1,4})in', style)
margin_right = re.search(
r'margin-right: ([\d\.]{1,4})in', style)
margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style)
margin_bottom = re.search(
r'margin-bottom: ([\d\.]{1,4})in', style)
else:
indent = None
margin_left = None
@@ -195,6 +222,7 @@ class HTMLDocxPreprocessor:
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
@@ -258,21 +286,24 @@ class HTMLDocxPreprocessor:
for x in has_i_tag_or_br]
if all(has_i_tag_or_br) and is_zero_border:
new_div = BeautifulSoup(features='lxml').new_tag('blockquote')
new_div = BeautifulSoup(
features='lxml').new_tag('blockquote')
for p in paragraphs:
new_div.append(p)
table.replaceWith(new_div)
def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^.*http.+')})
a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^.*http.+')})
# remove char=end of file for some editors
for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')})
a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^(?!#sdfootnote)')})
for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '')
tag.string = tag.text.replace('\u200b', '') # zero-width-space
@@ -286,23 +317,25 @@ class HTMLDocxPreprocessor:
def _process_footnotes(self):
"""Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_content = self.body_tag.find_all(
'div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors)
assert footnote_amt == len(footnote_content), \
'Something went wrong with footnotes after libra conversion'
'Something went wrong with footnotes after libre conversion'
footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0]
true_a_tag = cont_tag.find_all(
'a', class_=re.compile(r'^sdfootnote.+$'))[0]
if true_a_tag.attrs.get('href') is None:
cont_tag.a.decompose()
continue
assert anc_tag['name'] == true_a_tag['href'][1:], \
'Something went wrong with footnotes after libra conversion'
'Something went wrong with footnotes after libre conversion'
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
@@ -355,8 +388,10 @@ class HTMLDocxPreprocessor:
if len(img_tags):
if access is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/'))
folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'json/img_{book_id}/'))
new_path.mkdir(exist_ok=True)
for img in img_tags:
@@ -370,10 +405,12 @@ class HTMLDocxPreprocessor:
if access is not None:
link = access.send_image(img_path, doc_id=book_id)
img.attrs['src'] = link
self.logger_object.log(f'{img_name} successfully uploaded.')
self.logger_object.log(
f'{img_name} successfully uploaded.')
else:
img_size = os.path.getsize(img_path)
self.logger_object.log(f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
self.logger_object.log(
f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
new_img_path = new_path / img_name
copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path)
@@ -408,7 +445,8 @@ class HTMLDocxPreprocessor:
def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
toc_links = self.body_tag.find_all(
"a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
for tag in headers:
@@ -418,7 +456,8 @@ class HTMLDocxPreprocessor:
elif tag.name == "p":
exist_in_toc = self._check_parent_link_exist_in_toc(tag)
if tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
new_tag = BeautifulSoup(
features="lxml").new_tag("h" + outline_level)
text = tag.text
tag.replaceWith(new_tag)
new_tag.string = text
@@ -440,14 +479,16 @@ class HTMLDocxPreprocessor:
@staticmethod
def clean_tag_from_tabs(tag: NavigableString):
cleaned = re.sub(r'(\s+)+', ' ', tag)
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this)
# print('input: ', repr(tag))
# print('test: ', repr(cleaned))
def clean_tag_from_numbering(self, tag):
cleaned = self.clean_title_from_numbering(tag)
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this)
# print('input: ', repr(tag))
# print('test: ', repr(cleaned))
@@ -484,7 +525,8 @@ class HTMLDocxPreprocessor:
"""
headers_info = []
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
headers_outline = [int(re.sub(r"^h", "", tag.name)) for tag in header_tags]
headers_outline = [int(re.sub(r"^h", "", tag.name))
for tag in header_tags]
if headers_outline:
top_level_outline = min(headers_outline)
top_level_headers = [tag for tag in header_tags
@@ -518,13 +560,17 @@ class HTMLDocxPreprocessor:
Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header
2. it is first header from the top level list and it equals to 'introduction'
2. it is the first header from the top level list and it equals 'introduction'
Returns
-------
None
mark each top-level header with flag should_be_numbered = true/false
Result :
Mark each top-level header with flag should_be_numbered = true/false
"""
is_numbered_header = [header['is_numbered'] for header in self.top_level_headers]
is_title = [header['is_introduction'] for header in self.top_level_headers]
is_numbered_header = [header['is_numbered']
for header in self.top_level_headers]
is_title = [header['is_introduction']
for header in self.top_level_headers]
first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
@@ -539,7 +585,19 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self):
"""Function to process tags <h>."""
"""
Function to process tags <h>.
Steps
----------
1. remove <b>, <span>
2. clean text in header from numbering and \n
Returns
-------
None
processed <h> tags
"""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span>
@@ -581,36 +639,52 @@ class HTMLDocxPreprocessor:
for i, item in enumerate(content):
if type(content[i]) is NavigableString:
cleaned = re.sub(r'(\s+)+', ' ', content[i])
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[i].replace_with(this)
content[i] = this
else:
self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs)
self.apply_func_to_last_child(
content[i], self.clean_tag_from_tabs)
content[0] = '' if content[0] == ' ' else content[0]
content = [item for item in content if item != '']
if type(content[0]) is NavigableString:
cleaned = self.clean_title_from_numbering(content[0])
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[0].replace_with(this)
content[0] = this
else:
self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering)
self.apply_func_to_last_child(
content[0], self.clean_tag_from_numbering)
def _process_lists(self):
"""
Function to process tags <li>.
Unwrap <p> tags.
Function
- process tags <li>.
- unwrap <p> tags.
Parameters
----------
body_tag: Tag, soup object
Returns
-------
None
unwrap <p> tag with li
"""
li_tags = self.body_tag.find_all("li")
for il_tag in li_tags:
il_tag.attrs.update(il_tag.p.attrs)
il_tag.p.unwrap()
for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap()
def process_html(self, access, html_path, book_id):
def process_html(self, access=None, html_path='', book_id='local'):
"""Process html code to satisfy LiveCarta formatting."""
self.logger_object.log('Beginning of processing .html file.')
try:
self.logger_object.log(f'Processing TOC and headers.')
self._process_toc_links()
@@ -628,18 +702,22 @@ class HTMLDocxPreprocessor:
self.logger_object.log('Tables processing.')
self._process_tables()
self.logger_object.log(f'{self.tables_amount} tables have been processed.')
self.logger_object.log(
f'{self.tables_amount} tables have been processed.')
self.logger_object.log('Hrefs processing.')
self._process_hrefs()
self.logger_object.log('Footnotes processing.')
self._process_footnotes()
self.logger_object.log(f'{len(self.footnotes)} footnotes have been processed.')
self.logger_object.log(
f'{len(self.footnotes)} footnotes have been processed.')
self.logger_object.log('Image processing.')
self._process_images(access=access, html_path=html_path, book_id=book_id)
self.logger_object.log(f'{len(self.images)} images have been processed.')
self._process_images(
access=access, html_path=html_path, book_id=book_id)
self.logger_object.log(
f'{len(self.images)} images have been processed.')
self._process_footer()
self._process_div()
@@ -658,7 +736,8 @@ class HTMLDocxPreprocessor:
self.delete_content_before_toc()
except Exception as exc:
self.logger_object.log('Error has occurred while processing html.', logging.ERROR)
self.logger_object.log(
'Error has occurred while processing html.', logging.ERROR)
self.logger_object.log_error_to_main_log()
if self.status_wrapper:
self.status_wrapper.set_error()