Docx refactoring

This commit is contained in:
Kiryl
2022-07-27 20:20:52 +03:00
parent 617d4fcaef
commit 290ffa346a
4 changed files with 182 additions and 226 deletions

View File

@@ -66,7 +66,6 @@ class Docx2LibreHTML:
raise error
self.logger_object.log(f"File - {self.file_path}.")
print(f"{self.file_path}")
self.logger_object.log("Beginning of conversion from .docx to .html.")
check_file_exists(
@@ -74,7 +73,7 @@ class Docx2LibreHTML:
folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}")
out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}")
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
try:

View File

@@ -34,9 +34,9 @@ class DocxBook(BookSolver):
"""
# 1. Converts docx to html with LibreOffice
html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access,
html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access,
self.logger_object, self.libre_locker)
# TODO presets
# todo presets
# 2. Parses and cleans html, gets list of tags, gets footnotes
parser = HTMLDocxPreprocessor(
@@ -53,7 +53,7 @@ class DocxBook(BookSolver):
if __name__ == "__main__":
docx_file_path = '../../docx/music_inquiry.docx'
docx_file_path = '../../books/docx/music_inquiry.docx'
logger_object = BookLogger(
name='docx', book_id=docx_file_path.split('/')[-1])
locker = Event()

View File

@@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup, NavigableString
@staticmethod
def _clean_footnote_content(content):
content = content.strip()
return content.strip()

View File

@@ -20,6 +20,38 @@ class HTMLDocxPreprocessor:
self.top_level_headers = None
self.content = list()
def _process_toc_links(self):
    """Function to extract nodes which contains TOC links, remove links from file and detect headers.

    Walks every ``<a name="_TocNNN">`` anchor, takes its parent tag and:
    - for real ``<hN>`` headings, unwraps the anchor;
    - for ``<p>`` tags referenced from the TOC, replaces the paragraph with
      an ``<h1>`` carrying the same text;
    - logs any other tag name for manual inspection.
    """
    # NOTE: in the previous revision this docstring sat AFTER the nested
    # helper, so it was a discarded string literal rather than the
    # method's docstring — it must be the first statement.

    def _check_parent_link_exist_in_toc(tag_with_link):
        # True if at least one _Toc anchor inside this tag is referenced
        # from the document's table of contents (early return instead of
        # materializing the whole list of matches).
        for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}):
            link_name = a_tag.attrs["name"]
            if self.body_tag.find("a", {"href": "#" + link_name}):
                return True
        return False

    toc_links = self.body_tag.find_all(
        "a", {"name": re.compile(r"^_Toc\d+")})
    headers = [link.parent for link in toc_links]
    outline_level = "1"  # All the unknown outlines will be predicted as <h1>
    for h_tag in headers:
        if re.search(r"^h\d$", h_tag.name):
            # Already a heading: just drop the anchor wrapper.
            h_tag.a.unwrap()
            # outline_level = tag.name[-1] # TODO: add prediction of the outline level
        elif h_tag.name == "p":
            exist_in_toc = _check_parent_link_exist_in_toc(h_tag)
            if h_tag in self.body_tag.find_all("p") and exist_in_toc:
                # Promote the paragraph to a heading, keeping only its text.
                new_tag = BeautifulSoup(
                    features="lxml").new_tag("h" + outline_level)
                text = h_tag.text
                h_tag.replaceWith(new_tag)
                new_tag.string = text
        else:
            # rethink document structure when you have toc_links, other cases?
            self.logger_object.log(f"Something went wrong in processing toc_links."
                                   f" Check the structure of the file. "
                                   f"Tag name: {h_tag.name}")
def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
# todo regex
"""
@@ -48,12 +80,12 @@ class HTMLDocxPreprocessor:
"""Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u")
for u in underlines:
if u.find_all('a'):
if u.find_all("a"):
u.unwrap()
links = self.body_tag.find_all('a')
links = self.body_tag.find_all("a")
for link in links:
u = link.find_all('u')
u = link.find_all("u")
if u and len(u) == 1:
u[0].unwrap()
@@ -81,16 +113,12 @@ class HTMLDocxPreprocessor:
"""
size = re.search(r"font-size: (\d{1,3})pt", style)
if size is None:
return style
size = size.group(1)
new_size = cls.convert_pt_to_px(size)
if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
return ""
return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self):
@@ -108,10 +136,10 @@ class HTMLDocxPreprocessor:
style = self.convert_font_pt_to_px(style)
if style != "":
if color and color in LiveCartaConfig.COLORS_MAP:
style += f'; color: {color};'
style += f"; color: {color};"
font.attrs["style"] = style
elif color and color in LiveCartaConfig.COLORS_MAP:
font.attrs["style"] = f'color: {color};'
font.attrs["style"] = f"color: {color};"
if len(font.attrs) == 0:
font.unwrap()
@@ -121,16 +149,16 @@ class HTMLDocxPreprocessor:
def clean_trash(self):
# todo make it regex dict
"""Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(
r'^background: #[\da-fA-F]{6}$'))
"""Function to remove all styles and tags we don't need."""
self._clean_tag("span", "style", re.compile(
r"^background: #[\da-fA-F]{6}$"))
# todo: check for another languages
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
self._clean_tag('span', 'style', re.compile(
'^letter-spacing: -?[\d.]+pt$'))
self._clean_tag("span", "lang", re.compile(r"^ru-RU$"))
self._clean_tag("span", "style", re.compile(
"^letter-spacing: -?[\d.]+pt$"))
self._clean_tag('font', 'face', re.compile(
r'^Times New Roman[\w, ]+$'))
self._clean_tag("font", "face", re.compile(
r"^Times New Roman[\w, ]+$"))
self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links()
@@ -139,60 +167,68 @@ class HTMLDocxPreprocessor:
# replace toc with empty <TOC> tag
tables = self.body_tag.find_all(
"div", id=re.compile(r'^Table of Contents\d+'))
"div", id=re.compile(r"^Table of Contents\d+"))
for table in tables:
table.wrap(self.html_soup.new_tag("TOC"))
table.decompose()
def _preprocessing_headings(self):
    # todo regex
    """Function to convert all lower level headings to p tags"""
    # Headings deeper than the supported level (up to h9) are demoted to
    # plain paragraphs; attributes and content are left untouched.
    unsupported_levels = re.compile(
        f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$")
    for heading in self.body_tag.find_all(unsupported_levels):
        heading.name = "p"
def _process_paragraph(self):
"""Function to process <p> tags (text-align and text-indent value)."""
paragraphs = self.body_tag.find_all('p')
paragraphs = self.body_tag.find_all("p")
for p in paragraphs:
# libre converts some \n into <p> with 2 </br>
# there we remove 1 unnecessary <br>
brs = p.find_all('br')
brs = p.find_all("br")
text = p.text
if brs and text == '\n\n' and len(brs) == 2:
if brs and text == "\n\n" and len(brs) == 2:
brs[0].decompose()
indent_should_be_added = False
if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')):
if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")):
indent_should_be_added = True
align = p.get('align')
style = p.get('style')
align = p.get("align")
style = p.get("style")
if style:
indent = re.search(r'text-indent: ([\d.]{1,4})in', style)
margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style)
indent = re.search(r"text-indent: ([\d.]{1,4})in", style)
margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style)
margin_right = re.search(
r'margin-right: ([\d.]{1,4})in', style)
margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style)
r"margin-right: ([\d.]{1,4})in", style)
margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style)
margin_bottom = re.search(
r'margin-bottom: ([\d.]{1,4})in', style)
r"margin-bottom: ([\d.]{1,4})in", style)
else:
indent = margin_left = margin_right = \
margin_top = margin_bottom = None
if margin_left and margin_right and margin_top and margin_bottom and \
margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \
margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11':
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \
margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11":
p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
p.attrs = {}
style = ''
style = ""
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};'
style += f"text-align: {align};"
if indent is not None or indent_should_be_added:
# indent = indent.group(1)
style += f'text-indent: {LiveCartaConfig.INDENT};'
style += f"text-indent: {LiveCartaConfig.INDENT};"
if style:
p.attrs['style'] = style
p.attrs["style"] = style
def _process_two_columns(self):
"""Function to process paragraphs which has two columns layout."""
@@ -203,40 +239,6 @@ class HTMLDocxPreprocessor:
child["class"] = "columns2"
div.unwrap()
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
sizes = []
for td in tds:
style = td.get('style')
if style:
match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
if match:
size = match.group(1)
units = match.group(2)
if units == "pt":
size = self.convert_pt_to_px(size)
sizes.append(float(size))
width = td.get('width')
td.attrs = {}
if width:
td.attrs['width'] = width
if sizes:
border_size = sum(sizes) / len(sizes)
table.attrs['border'] = f'{border_size:.2}'
self.tables_amount = len(tables)
def _process_quotes(self):
"""
Function to process block quotes.
@@ -259,9 +261,9 @@ class HTMLDocxPreprocessor:
for table in tables:
trs = table.find_all("tr")
tds = table.find_all("td")
if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600':
if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
td = tds[0]
is_zero_border = 'border: none;' in td.get('style')
is_zero_border = "border: none;" in td.get("style")
paragraphs = td.find_all("p")
has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
has_i_tag_or_br = [x[0] is not None or x[1] is not None
@@ -269,27 +271,61 @@ class HTMLDocxPreprocessor:
if all(has_i_tag_or_br) and is_zero_border:
new_div = BeautifulSoup(
features='lxml').new_tag('blockquote')
features="lxml").new_tag("blockquote")
for p in paragraphs:
new_div.append(p)
table.replaceWith(new_div)
def _process_tables(self):
    """Function to process tables. Set "border" attribute."""
    # For every table: average the border widths declared on its cells
    # (converting pt to px) and write that as the table's "border"
    # attribute; each cell keeps only its "width" attribute.
    tables = self.body_tag.find_all("table")
    for table in tables:
        border_widths = []
        for cell in table.find_all("td"):
            cell_style = cell.get("style")
            if cell_style:
                border_match = re.search(
                    r"border: ?(\d+\.?\d*)(p[tx])", cell_style)
                if border_match:
                    width, units = border_match.group(1), border_match.group(2)
                    if units == "pt":
                        width = self.convert_pt_to_px(width)
                    border_widths.append(float(width))
            kept_width = cell.get("width")
            cell.attrs = {}
            if kept_width:
                cell.attrs["width"] = kept_width
        if border_widths:
            average_border = sum(border_widths) / len(border_widths)
            table.attrs["border"] = f"{average_border:.2}"
    self.tables_amount = len(tables)
def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^.*http.+')})
"a", {"href": re.compile("^.*http.+")})
# remove char=end of file for some editors
for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
tag.string = tag.text.replace("\u200c", "")
tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^(?!#sdfootnote)')})
"a", {"href": re.compile("^(?!#sdfootnote)")})
for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '')
tag.string = tag.text.replace('\u200b', '') # zero-width-space
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
tag.string = tag.text.replace("\u200c", "")
tag.string = tag.text.replace("\u200b", "") # zero-width-space
tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
def _process_footer(self):
# todo regex
@@ -297,7 +333,7 @@ class HTMLDocxPreprocessor:
Function to process <div title="footer"> tags.
All the tags will be deleted from file.
"""
divs = self.body_tag.find_all('div', {'title': 'footer'})
divs = self.body_tag.find_all("div", {"title": "footer"})
for div in divs:
div.decompose()
@@ -305,90 +341,9 @@ class HTMLDocxPreprocessor:
# todo regex
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div")
for div in divs:
div.unwrap()
def _check_parent_link_exist_in_toc(self, tag_with_link):
toc_links = []
for a_tag in tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')}):
link_name = a_tag.attrs['name']
toc_item = self.body_tag.find("a", {'href': '#' + link_name})
if toc_item:
toc_links.append(toc_item)
return len(toc_links) > 0
def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all(
"a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
for h_tag in headers:
if re.search(r"^h\d$", h_tag.name):
h_tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
elif h_tag.name == "p":
exist_in_toc = self._check_parent_link_exist_in_toc(h_tag)
if h_tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup(
features="lxml").new_tag("h" + outline_level)
text = h_tag.text
h_tag.replaceWith(new_tag)
new_tag.string = text
else:
# rethink document structure when you have toc_links, other cases?
self.logger_object.log(f'Something went wrong in processing toc_links.'
f' Check the structure of the file. '
f'Tag name: {h_tag.name}')
@staticmethod
def clean_title_from_numbering(title: str):
"""Function to remove digits from headers."""
title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
@staticmethod
def clean_tag_from_tabs(tag: NavigableString):
    """Collapse every run of whitespace in *tag* to a single space.

    Replaces the NavigableString node inside its soup with the cleaned
    copy (BeautifulSoup strings are immutable, so a new node is made).
    """
    # r'\s+' is equivalent to the previous r'(\s+)+' (redundantly
    # nested quantifier).
    cleaned = re.sub(r'\s+', ' ', tag)
    this = BeautifulSoup.new_string(BeautifulSoup(
        features="lxml"), cleaned, NavigableString)
    tag.replace_with(this)
def clean_tag_from_numbering(self, tag):
cleaned = self.clean_title_from_numbering(tag)
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this)
# print('input: ', repr(tag))
# print('test: ', repr(cleaned))
def apply_func_to_last_child(self, tag, func=None):
"""
works only with constructions like (((child to work with)))
where child is object of NavigableString
"""
if type(tag) is NavigableString:
func(tag)
else:
children = list(tag.children)
if children:
self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self):
# todo regex
"""Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
tag.name = 'p'
def _get_top_level_headers(self):
"""
Function for gathering info about top-level chapters.
@@ -416,27 +371,26 @@ class HTMLDocxPreprocessor:
tag.parent.unwrap()
title = tag.text
title = re.sub(r'\s+', ' ', title).strip()
number = re.match(r'^(?:\.?\d+\.? ?)+', title)
title = re.sub(r"\s+", " ", title).strip()
number = re.match(r"^(?:\.?\d+\.? ?)+", title)
is_numbered = number is not None
cleaned_title = self.clean_title_from_numbering(tag.text)
is_introduction = cleaned_title.lower() == 'introduction'
cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
is_introduction = cleaned_title.lower() == "introduction"
headers_info.append({
'title': cleaned_title,
'is_numbered': is_numbered,
'is_introduction': is_introduction})
"title": cleaned_title,
"is_numbered": is_numbered,
"is_introduction": is_introduction})
return headers_info
def _mark_introduction_headers(self):
"""
Function to find out:
what header shouldn't be numbered and can be treated as introduction chapter
what header shouldn't be numbered and can be treated as introduction chapter
Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header
2. it is first header from the top level list, and it equals to 'introduction'
2. it is first header from the top level list, and it equals to "introduction"
Returns
-------
@@ -444,9 +398,9 @@ class HTMLDocxPreprocessor:
mark each top-level header with flag should_be_numbered = true/false
"""
is_numbered_header = [header['is_numbered']
is_numbered_header = [header["is_numbered"]
for header in self.top_level_headers]
is_title = [header['is_introduction']
is_title = [header["is_introduction"]
for header in self.top_level_headers]
first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
@@ -454,12 +408,31 @@ class HTMLDocxPreprocessor:
first_header_is_introduction = is_title and is_title[0]
if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
self.top_level_headers[0]['should_be_numbered'] = False
self.top_level_headers[0]["should_be_numbered"] = False
for i in range(1, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True
self.top_level_headers[i]["should_be_numbered"] = True
else:
for i in range(0, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True
self.top_level_headers[i]["should_be_numbered"] = True
@staticmethod
def clean_title_from_tabs(tag: NavigableString):
    """Replace each whitespace/nbsp character in *tag* with a plain space.

    Swaps the NavigableString node in its soup for the normalized copy.
    """
    normalized = re.sub(r"[\s\xa0]", " ", tag)
    replacement = BeautifulSoup.new_string(
        BeautifulSoup(features="lxml"), normalized, NavigableString)
    tag.replace_with(replacement)
def apply_func_to_last_child(self, tag, func=None):
    """
    works only with constructions like (((child to work with)))
    where child is object of NavigableString

    Recursively descends through wrapper tags and applies *func* to the
    innermost NavigableString, if one is reached.
    """
    # NOTE(review): despite the name, recursion follows children[0] (the
    # FIRST child); for the single-child (((x))) case they coincide —
    # confirm intent if multi-child tags can occur here.
    if type(tag) is NavigableString:
        # func must actually be supplied; the func=None default would
        # raise TypeError at this call.
        func(tag)
    else:
        children = list(tag.children)
        if children:
            self.apply_func_to_last_child(children[0], func)
def _process_headings(self):
# todo regex
@@ -499,44 +472,33 @@ class HTMLDocxPreprocessor:
while tag.parent.name == "ol":
tag.parent.unwrap()
title = tag.text
title = self.clean_title_from_numbering(title)
if title == "":
cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
if cleaned_title == "":
tag.unwrap()
else:
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
content = list(tag.children)
# do not take into account rubbish empty tags like <a>, but don't remove them
# do not take into account rubbish empty tags like <a>, but don't remove them
content = [item for item in content if
(type(item) is not NavigableString and item.text != '')
(type(item) is not NavigableString and item.text != "")
or (type(item) is NavigableString)]
content[0] = "" if content[0] == " " else content[0]
content = [item for item in content if item != ""]
for i, item in enumerate(content):
if type(content[i]) is NavigableString:
cleaned = re.sub(r'(\s+)+', ' ', content[i])
cleaned = re.sub(r"(\s+)+", " ", content[i])
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[i].replace_with(this)
content[i] = this
else:
self.apply_func_to_last_child(
content[i], self.clean_tag_from_tabs)
content[0] = '' if content[0] == ' ' else content[0]
content = [item for item in content if item != '']
if type(content[0]) is NavigableString:
cleaned = self.clean_title_from_numbering(content[0])
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[0].replace_with(this)
content[0] = this
else:
self.apply_func_to_last_child(
content[0], self.clean_tag_from_numbering)
content[i], self.clean_title_from_tabs)
def _process_lists(self):
# todo regex
@@ -551,81 +513,76 @@ class HTMLDocxPreprocessor:
uwrap <p> tag with li
"""
li_tags = self.body_tag.find_all("li")
for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap()
def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated
toc_tag = self.html_soup.new_tag('TOC')
toc_tag = self.html_soup.new_tag("TOC")
self.content: List[Tag] = self.body_tag.find_all(recursive=False)
if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:]
def process_html(self, access=None, html_path='', book_id=0):
def process_html(self, access=None, html_path="", book_id=0):
"""Process html code to satisfy LiveCarta formatting."""
self.logger_object.log('Beginning of processing .html file.')
self.logger_object.log("Beginning of processing .html file.")
try:
self.logger_object.log(f'Processing TOC and headers.')
self.logger_object.log(f"Processing TOC and headers.")
self._process_toc_links()
self.clean_trash()
# process main elements of the .html doc
self.logger_object.log(f'Processing main elements of html.')
self.logger_object.log(f"Processing main elements of html.")
self._preprocessing_headings()
self._process_paragraph()
self._process_two_columns()
self.logger_object.log('Block quotes processing.')
self.logger_object.log("Block quotes processing.")
self._process_quotes()
self.logger_object.log('Tables processing.')
self.logger_object.log("Tables processing.")
self._process_tables()
self.logger_object.log(
f'{self.tables_amount} tables have been processed.')
f"{self.tables_amount} tables have been processed.")
self.logger_object.log('Hrefs processing.')
self.logger_object.log("Hrefs processing.")
self._process_hrefs()
self.logger_object.log('Footnotes processing.')
self.logger_object.log("Footnotes processing.")
self.footnotes = process_footnotes(self.body_tag)
self.logger_object.log(
f'{len(self.footnotes)} footnotes have been processed.')
f"{len(self.footnotes)} footnotes have been processed.")
self.logger_object.log('Image processing.')
self.logger_object.log("Image processing.")
self.images = process_images(access=access, html_path=html_path,
book_id=book_id, body_tag=self.body_tag)
self.logger_object.log(
f'{len(self.images)} images have been processed.')
f"{len(self.images)} images have been processed.")
self._process_footer()
self._process_div()
self.content = self.body_tag.find_all(recursive=False)
self.top_level_headers = self._get_top_level_headers()
self._mark_introduction_headers()
self._process_headings()
self.content: List[Tag] = self.body_tag.find_all(recursive=False)
self._process_lists()
# delete text before table of content if exists
self.delete_content_before_toc()
except Exception as exc:
self.logger_object.log(
'Error has occurred while processing html.', logging.ERROR)
"Error has occurred while processing html.", logging.ERROR)
self.logger_object.log_error_to_main_log()
if self.status_wrapper:
self.status_wrapper.set_error()
raise exc
self.logger_object.log('End of processing .html file.')
self.logger_object.log("End of processing .html file.")
return self.content, self.footnotes, self.top_level_headers