diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py index fbb24fe..56fe2f7 100644 --- a/src/docx_converter/docx2libre_html.py +++ b/src/docx_converter/docx2libre_html.py @@ -66,7 +66,6 @@ class Docx2LibreHTML: raise error self.logger_object.log(f"File - {self.file_path}.") - print(f"{self.file_path}") self.logger_object.log("Beginning of conversion from .docx to .html.") check_file_exists( @@ -74,7 +73,7 @@ class Docx2LibreHTML: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}") + out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}") pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) try: diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 9f1735b..6260edb 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -34,9 +34,9 @@ class DocxBook(BookSolver): """ # 1. Converts docx to html with LibreOffice - html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, + html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access, self.logger_object, self.libre_locker) - # TODO presets + # todo presets # 2. Parses and cleans html, gets list of tags, gets footnotes parser = HTMLDocxPreprocessor( @@ -53,7 +53,7 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = '../../docx/music_inquiry.docx' + docx_file_path = '../../books/docx/music_inquiry.docx' logger_object = BookLogger( name='docx', book_id=docx_file_path.split('/')[-1]) locker = Event() diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index beb6d15..c269b73 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -1,7 +1,7 @@ import re from bs4 import BeautifulSoup, NavigableString -@staticmethod + def _clean_footnote_content(content): content = content.strip() return content.strip() diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index 046166f..a44df01 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -11,7 +11,7 @@ from src.docx_converter.image_processing import process_images class HTMLDocxPreprocessor: - + def __init__(self, html_soup, logger_object, status_wrapper=None): self.body_tag = html_soup.body self.html_soup = html_soup @@ -20,6 +20,38 @@ class HTMLDocxPreprocessor: self.top_level_headers = None self.content = list() + def _process_toc_links(self): + def _check_parent_link_exist_in_toc(tag_with_link): + toc_links = [] + for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}): + link_name = a_tag.attrs["name"] + toc_item = self.body_tag.find("a", {"href": "#" + link_name}) + if toc_item: + toc_links.append(toc_item) + return len(toc_links) > 0 + """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" + toc_links = self.body_tag.find_all( + "a", {"name": re.compile(r"^_Toc\d+")}) + headers = [link.parent for link in toc_links] + outline_level = "1" # All the unknown outlines will be predicted as

+ for h_tag in headers: + if re.search(r"^h\d$", h_tag.name): + h_tag.a.unwrap() + # outline_level = tag.name[-1] # TODO: add prediction of the outline level + elif h_tag.name == "p": + exist_in_toc = _check_parent_link_exist_in_toc(h_tag) + if h_tag in self.body_tag.find_all("p") and exist_in_toc: + new_tag = BeautifulSoup( + features="lxml").new_tag("h" + outline_level) + text = h_tag.text + h_tag.replaceWith(new_tag) + new_tag.string = text + else: + # rethink document structure when you have toc_links, other cases? + self.logger_object.log(f"Something went wrong in processing toc_links." + f" Check the structure of the file. " + f"Tag name: {h_tag.name}") + def _clean_tag(self, tag: str, attr_name: str, attr_value: re): # todo regex """ @@ -48,12 +80,12 @@ class HTMLDocxPreprocessor: """Function cleans meaningless tags before links.""" underlines = self.body_tag.find_all("u") for u in underlines: - if u.find_all('a'): + if u.find_all("a"): u.unwrap() - links = self.body_tag.find_all('a') + links = self.body_tag.find_all("a") for link in links: - u = link.find_all('u') + u = link.find_all("u") if u and len(u) == 1: u[0].unwrap() @@ -81,16 +113,12 @@ class HTMLDocxPreprocessor: """ size = re.search(r"font-size: (\d{1,3})pt", style) - if size is None: return style - size = size.group(1) new_size = cls.convert_pt_to_px(size) - if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: return "" - return re.sub(size + "pt", str(new_size) + "px", style) def _font_to_span(self): @@ -108,10 +136,10 @@ class HTMLDocxPreprocessor: style = self.convert_font_pt_to_px(style) if style != "": if color and color in LiveCartaConfig.COLORS_MAP: - style += f'; color: {color};' + style += f"; color: {color};" font.attrs["style"] = style elif color and color in LiveCartaConfig.COLORS_MAP: - font.attrs["style"] = f'color: {color};' + font.attrs["style"] = f"color: {color};" if len(font.attrs) == 0: font.unwrap() @@ -121,16 +149,16 @@ class HTMLDocxPreprocessor: def clean_trash(self): # todo make it regex dict - """Function to remove all styles and tags we don't need.""" - self._clean_tag('span', 'style', re.compile( - r'^background: #[\da-fA-F]{6}$')) + """Function to remove all styles and tags we don"t need.""" + self._clean_tag("span", "style", re.compile( + r"^background: #[\da-fA-F]{6}$")) # todo: check for another languages - self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) - self._clean_tag('span', 'style', re.compile( - '^letter-spacing: -?[\d.]+pt$')) + self._clean_tag("span", "lang", re.compile(r"^ru-RU$")) + self._clean_tag("span", "style", re.compile( + "^letter-spacing: -?[\d.]+pt$")) - self._clean_tag('font', 'face', re.compile( - r'^Times New Roman[\w, ]+$')) + self._clean_tag("font", "face", re.compile( + r"^Times New Roman[\w, ]+$")) self._clean_tag("a", "name", "_GoBack") self._clean_underline_links() @@ -139,60 +167,68 @@ class HTMLDocxPreprocessor: # replace toc with empty tag tables = self.body_tag.find_all( - "div", id=re.compile(r'^Table of Contents\d+')) + "div", id=re.compile(r"^Table of Contents\d+")) for table in tables: table.wrap(self.html_soup.new_tag("TOC")) table.decompose() + def _preprocessing_headings(self): + # todo regex + """Function to convert all lower level headings to p tags""" + pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$" + header_tags = self.body_tag.find_all(re.compile(pattern)) + for tag in header_tags: + tag.name = "p" + def _process_paragraph(self): """Function to process

tags (text-align and text-indent value).""" - paragraphs = self.body_tag.find_all('p') + paragraphs = self.body_tag.find_all("p") for p in paragraphs: # libre converts some \n into

with 2
# there we remove 1 unnecessary
- brs = p.find_all('br') + brs = p.find_all("br") text = p.text - if brs and text == '\n\n' and len(brs) == 2: + if brs and text == "\n\n" and len(brs) == 2: brs[0].decompose() indent_should_be_added = False - if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')): + if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")): indent_should_be_added = True - align = p.get('align') - style = p.get('style') + align = p.get("align") + style = p.get("style") if style: - indent = re.search(r'text-indent: ([\d.]{1,4})in', style) - margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style) + indent = re.search(r"text-indent: ([\d.]{1,4})in", style) + margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style) margin_right = re.search( - r'margin-right: ([\d.]{1,4})in', style) - margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style) + r"margin-right: ([\d.]{1,4})in", style) + margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style) margin_bottom = re.search( - r'margin-bottom: ([\d.]{1,4})in', style) + r"margin-bottom: ([\d.]{1,4})in", style) else: indent = margin_left = margin_right = \ margin_top = margin_bottom = None if margin_left and margin_right and margin_top and margin_bottom and \ - margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ - margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11': - p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) + margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \ + margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11": + p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote")) p.attrs = {} - style = '' + style = "" if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: - style += f'text-align: {align};' + style += f"text-align: {align};" if indent is not None or indent_should_be_added: # indent = indent.group(1) - style += f'text-indent: {LiveCartaConfig.INDENT};' + style += f"text-indent: {LiveCartaConfig.INDENT};" if style: - p.attrs['style'] = style + p.attrs["style"] = style def _process_two_columns(self): """Function to process paragraphs which has two columns layout.""" @@ -203,40 +239,6 @@ class HTMLDocxPreprocessor: child["class"] = "columns2" div.unwrap() - def _process_tables(self): - """Function to process tables. Set "border" attribute.""" - tables = self.body_tag.find_all("table") - for table in tables: - tds = table.find_all("td") - - sizes = [] - for td in tds: - style = td.get('style') - - if style: - match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) - - if match: - size = match.group(1) - units = match.group(2) - - if units == "pt": - size = self.convert_pt_to_px(size) - - sizes.append(float(size)) - - width = td.get('width') - - td.attrs = {} - if width: - td.attrs['width'] = width - - if sizes: - border_size = sum(sizes) / len(sizes) - table.attrs['border'] = f'{border_size:.2}' - - self.tables_amount = len(tables) - def _process_quotes(self): """ Function to process block quotes. @@ -259,9 +261,9 @@ class HTMLDocxPreprocessor: for table in tables: trs = table.find_all("tr") tds = table.find_all("td") - if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600': + if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600": td = tds[0] - is_zero_border = 'border: none;' in td.get('style') + is_zero_border = "border: none;" in td.get("style") paragraphs = td.find_all("p") has_i_tag_or_br = [(p.i, p.br) for p in paragraphs] has_i_tag_or_br = [x[0] is not None or x[1] is not None @@ -269,27 +271,61 @@ class HTMLDocxPreprocessor: if all(has_i_tag_or_br) and is_zero_border: new_div = BeautifulSoup( - features='lxml').new_tag('blockquote') + features="lxml").new_tag("blockquote") for p in paragraphs: new_div.append(p) table.replaceWith(new_div) + def _process_tables(self): + """Function to process tables. Set "border" attribute.""" + tables = self.body_tag.find_all("table") + for table in tables: + tds = table.find_all("td") + + sizes = [] + for td in tds: + style = td.get("style") + + if style: + match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) + + if match: + size = match.group(1) + units = match.group(2) + + if units == "pt": + size = self.convert_pt_to_px(size) + + sizes.append(float(size)) + + width = td.get("width") + + td.attrs = {} + if width: + td.attrs["width"] = width + + if sizes: + border_size = sum(sizes) / len(sizes) + table.attrs["border"] = f"{border_size:.2}" + + self.tables_amount = len(tables) + def _process_hrefs(self): a_tags_with_href = self.body_tag.find_all( - 'a', {'href': re.compile('^.*http.+')}) + "a", {"href": re.compile("^.*http.+")}) # remove char=end of file for some editors for tag in a_tags_with_href: - tag.string = tag.text.replace('\u200c', '') - tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + tag.string = tag.text.replace("\u200c", "") + tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") a_tags_with_href = self.body_tag.find_all( - 'a', {'href': re.compile('^(?!#sdfootnote)')}) + "a", {"href": re.compile("^(?!#sdfootnote)")}) for tag in a_tags_with_href: - tag.string = tag.text.replace('\u200c', '') - tag.string = tag.text.replace('\u200b', '') # zero-width-space - tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + tag.string = tag.text.replace("\u200c", "") + tag.string = tag.text.replace("\u200b", "") # zero-width-space + tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") def _process_footer(self): # todo regex @@ -297,7 +333,7 @@ class HTMLDocxPreprocessor: Function to process

tags. All the tags will be deleted from file. """ - divs = self.body_tag.find_all('div', {'title': 'footer'}) + divs = self.body_tag.find_all("div", {"title": "footer"}) for div in divs: div.decompose() @@ -305,90 +341,9 @@ class HTMLDocxPreprocessor: # todo regex """Function to process
tags. All the tags will be deleted from file, all content of the tags will stay.""" divs = self.body_tag.find_all("div") - for div in divs: div.unwrap() - def _check_parent_link_exist_in_toc(self, tag_with_link): - toc_links = [] - for a_tag in tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')}): - link_name = a_tag.attrs['name'] - toc_item = self.body_tag.find("a", {'href': '#' + link_name}) - if toc_item: - toc_links.append(toc_item) - - return len(toc_links) > 0 - - def _process_toc_links(self): - """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" - toc_links = self.body_tag.find_all( - "a", {'name': re.compile(r'^_Toc\d+')}) - headers = [link.parent for link in toc_links] - outline_level = "1" # All the unknown outlines will be predicted as

- for h_tag in headers: - if re.search(r"^h\d$", h_tag.name): - h_tag.a.unwrap() - # outline_level = tag.name[-1] # TODO: add prediction of the outline level - elif h_tag.name == "p": - exist_in_toc = self._check_parent_link_exist_in_toc(h_tag) - if h_tag in self.body_tag.find_all("p") and exist_in_toc: - new_tag = BeautifulSoup( - features="lxml").new_tag("h" + outline_level) - text = h_tag.text - h_tag.replaceWith(new_tag) - new_tag.string = text - else: - # rethink document structure when you have toc_links, other cases? - self.logger_object.log(f'Something went wrong in processing toc_links.' - f' Check the structure of the file. ' - f'Tag name: {h_tag.name}') - - @staticmethod - def clean_title_from_numbering(title: str): - """Function to remove digits from headers.""" - title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title - return title - - @staticmethod - def clean_tag_from_tabs(tag: NavigableString): - cleaned = re.sub(r'(\s+)+', ' ', tag) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - tag.replace_with(this) - # print('input: ', repr(tag)) - # print('test: ', repr(cleaned)) - - def clean_tag_from_numbering(self, tag): - cleaned = self.clean_title_from_numbering(tag) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - tag.replace_with(this) - # print('input: ', repr(tag)) - # print('test: ', repr(cleaned)) - - def apply_func_to_last_child(self, tag, func=None): - """ - works only with constructions like (((child to work with))) - where child is object of NavigableString - """ - if type(tag) is NavigableString: - func(tag) - else: - children = list(tag.children) - if children: - self.apply_func_to_last_child(children[0], func) - - def _preprocessing_headings(self): - # todo regex - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = self.body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - def _get_top_level_headers(self): """ Function for gathering info about top-level chapters. @@ -416,27 +371,26 @@ class HTMLDocxPreprocessor: tag.parent.unwrap() title = tag.text - title = re.sub(r'\s+', ' ', title).strip() - number = re.match(r'^(?:\.?\d+\.? ?)+', title) + title = re.sub(r"\s+", " ", title).strip() + number = re.match(r"^(?:\.?\d+\.? ?)+", title) is_numbered = number is not None - cleaned_title = self.clean_title_from_numbering(tag.text) - is_introduction = cleaned_title.lower() == 'introduction' + cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text) + is_introduction = cleaned_title.lower() == "introduction" headers_info.append({ - 'title': cleaned_title, - 'is_numbered': is_numbered, - 'is_introduction': is_introduction}) - + "title": cleaned_title, + "is_numbered": is_numbered, + "is_introduction": is_introduction}) return headers_info def _mark_introduction_headers(self): """ Function to find out: - what header shouldn't be numbered and can be treated as introduction chapter + what header shouldn"t be numbered and can be treated as introduction chapter Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header - 2. it is first header from the top level list, and it equals to 'introduction' + 2. it is first header from the top level list, and it equals to "introduction" Returns ------- @@ -444,9 +398,9 @@ class HTMLDocxPreprocessor: mark each top-level header with flag should_be_numbered = true/false """ - is_numbered_header = [header['is_numbered'] + is_numbered_header = [header["is_numbered"] for header in self.top_level_headers] - is_title = [header['is_introduction'] + is_title = [header["is_introduction"] for header in self.top_level_headers] first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 @@ -454,12 +408,31 @@ class HTMLDocxPreprocessor: first_header_is_introduction = is_title and is_title[0] if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction: - self.top_level_headers[0]['should_be_numbered'] = False + self.top_level_headers[0]["should_be_numbered"] = False for i in range(1, len(self.top_level_headers)): - self.top_level_headers[i]['should_be_numbered'] = True + self.top_level_headers[i]["should_be_numbered"] = True else: for i in range(0, len(self.top_level_headers)): - self.top_level_headers[i]['should_be_numbered'] = True + self.top_level_headers[i]["should_be_numbered"] = True + + @staticmethod + def clean_title_from_tabs(tag: NavigableString): + cleaned = re.sub(r"[\s\xa0]", " ", tag) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) + tag.replace_with(this) + + def apply_func_to_last_child(self, tag, func=None): + """ + works only with constructions like (((child to work with))) + where child is object of NavigableString + """ + if type(tag) is NavigableString: + func(tag) + else: + children = list(tag.children) + if children: + self.apply_func_to_last_child(children[0], func) def _process_headings(self): # todo regex @@ -499,44 +472,33 @@ class HTMLDocxPreprocessor: while tag.parent.name == "ol": tag.parent.unwrap() - title = tag.text - title = self.clean_title_from_numbering(title) - if title == "": + cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text) + if cleaned_title == "": tag.unwrap() else: assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ - f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' + f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings." content = list(tag.children) - # do not take into account rubbish empty tags like , but don't remove them + # do not take into account rubbish empty tags like , but don"t remove them content = [item for item in content if - (type(item) is not NavigableString and item.text != '') + (type(item) is not NavigableString and item.text != "") or (type(item) is NavigableString)] + content[0] = "" if content[0] == " " else content[0] + content = [item for item in content if item != ""] + for i, item in enumerate(content): if type(content[i]) is NavigableString: - cleaned = re.sub(r'(\s+)+', ' ', content[i]) + cleaned = re.sub(r"(\s+)+", " ", content[i]) this = BeautifulSoup.new_string(BeautifulSoup( features="lxml"), cleaned, NavigableString) content[i].replace_with(this) content[i] = this else: self.apply_func_to_last_child( - content[i], self.clean_tag_from_tabs) - - content[0] = '' if content[0] == ' ' else content[0] - content = [item for item in content if item != ''] - - if type(content[0]) is NavigableString: - cleaned = self.clean_title_from_numbering(content[0]) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - content[0].replace_with(this) - content[0] = this - else: - self.apply_func_to_last_child( - content[0], self.clean_tag_from_numbering) + content[i], self.clean_title_from_tabs) def _process_lists(self): # todo regex @@ -551,81 +513,76 @@ class HTMLDocxPreprocessor: uwrap

tag with li """ - li_tags = self.body_tag.find_all("li") - for li_tag in li_tags: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() def delete_content_before_toc(self): # remove all tag upper the only in content !!! body tag is not updated - toc_tag = self.html_soup.new_tag('TOC') + toc_tag = self.html_soup.new_tag("TOC") + self.content: List[Tag] = self.body_tag.find_all(recursive=False) if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 self.content = self.content[ind:] - def process_html(self, access=None, html_path='', book_id=0): + def process_html(self, access=None, html_path="", book_id=0): """Process html code to satisfy LiveCarta formatting.""" - self.logger_object.log('Beginning of processing .html file.') + self.logger_object.log("Beginning of processing .html file.") try: - self.logger_object.log(f'Processing TOC and headers.') + self.logger_object.log(f"Processing TOC and headers.") self._process_toc_links() self.clean_trash() # process main elements of the .html doc - self.logger_object.log(f'Processing main elements of html.') + self.logger_object.log(f"Processing main elements of html.") self._preprocessing_headings() self._process_paragraph() self._process_two_columns() - self.logger_object.log('Block quotes processing.') + self.logger_object.log("Block quotes processing.") self._process_quotes() - self.logger_object.log('Tables processing.') + self.logger_object.log("Tables processing.") self._process_tables() self.logger_object.log( - f'{self.tables_amount} tables have been processed.') + f"{self.tables_amount} tables have been processed.") - self.logger_object.log('Hrefs processing.') + self.logger_object.log("Hrefs processing.") self._process_hrefs() - self.logger_object.log('Footnotes processing.') + self.logger_object.log("Footnotes processing.") self.footnotes = process_footnotes(self.body_tag) self.logger_object.log( - f'{len(self.footnotes)} footnotes have been processed.') + f"{len(self.footnotes)} footnotes have been processed.") - self.logger_object.log('Image processing.') + self.logger_object.log("Image processing.") self.images = process_images(access=access, html_path=html_path, book_id=book_id, body_tag=self.body_tag) self.logger_object.log( - f'{len(self.images)} images have been processed.') + f"{len(self.images)} images have been processed.") self._process_footer() self._process_div() - self.content = self.body_tag.find_all(recursive=False) - self.top_level_headers = self._get_top_level_headers() self._mark_introduction_headers() self._process_headings() - self.content: List[Tag] = self.body_tag.find_all(recursive=False) - self._process_lists() # delete text before table of content if exists self.delete_content_before_toc() except Exception as exc: self.logger_object.log( - 'Error has occurred while processing html.', logging.ERROR) + "Error has occurred while processing html.", logging.ERROR) self.logger_object.log_error_to_main_log() if self.status_wrapper: self.status_wrapper.set_error() raise exc - self.logger_object.log('End of processing .html file.') + self.logger_object.log("End of processing .html file.") return self.content, self.footnotes, self.top_level_headers