Docx refactoring

This commit is contained in:
Kiryl
2022-07-27 20:20:52 +03:00
parent 617d4fcaef
commit 290ffa346a
4 changed files with 182 additions and 226 deletions

View File

@@ -66,7 +66,6 @@ class Docx2LibreHTML:
raise error raise error
self.logger_object.log(f"File - {self.file_path}.") self.logger_object.log(f"File - {self.file_path}.")
print(f"{self.file_path}")
self.logger_object.log("Beginning of conversion from .docx to .html.") self.logger_object.log("Beginning of conversion from .docx to .html.")
check_file_exists( check_file_exists(
@@ -74,7 +73,7 @@ class Docx2LibreHTML:
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}") out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}")
pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True)
try: try:

View File

@@ -34,9 +34,9 @@ class DocxBook(BookSolver):
""" """
# 1. Converts docx to html with LibreOffice # 1. Converts docx to html with LibreOffice
html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access,
self.logger_object, self.libre_locker) self.logger_object, self.libre_locker)
# TODO presets # todo presets
# 2. Parses and cleans html, gets list of tags, gets footnotes # 2. Parses and cleans html, gets list of tags, gets footnotes
parser = HTMLDocxPreprocessor( parser = HTMLDocxPreprocessor(
@@ -53,7 +53,7 @@ class DocxBook(BookSolver):
if __name__ == "__main__": if __name__ == "__main__":
docx_file_path = '../../docx/music_inquiry.docx' docx_file_path = '../../books/docx/music_inquiry.docx'
logger_object = BookLogger( logger_object = BookLogger(
name='docx', book_id=docx_file_path.split('/')[-1]) name='docx', book_id=docx_file_path.split('/')[-1])
locker = Event() locker = Event()

View File

@@ -1,7 +1,7 @@
import re import re
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
@staticmethod
def _clean_footnote_content(content): def _clean_footnote_content(content):
content = content.strip() content = content.strip()
return content.strip() return content.strip()

View File

@@ -20,6 +20,38 @@ class HTMLDocxPreprocessor:
self.top_level_headers = None self.top_level_headers = None
self.content = list() self.content = list()
def _process_toc_links(self):
    """Detect headers through TOC anchors and strip those anchors from the document.

    Every ``<a name="_Toc...">`` anchor found in ``self.body_tag`` is handled
    according to its direct parent:

    * ``<h1>``..``<h9>`` parent: the TOC anchor itself is unwrapped, so its
      children stay inside the heading.
    * ``<p>`` parent: when the paragraph is still found in the body and at
      least one of its TOC anchors is the target of an ``<a href="#...">``
      element, the paragraph is replaced by an ``<h1>`` that holds only the
      paragraph text.
    * any other parent: the case is logged and the tag is left untouched.
    """
    toc_anchor_attrs = {"name": re.compile(r"^_Toc\d+")}

    def _check_parent_link_exist_in_toc(tag_with_link):
        """Tell whether any TOC anchor inside ``tag_with_link`` is referenced by an href in the body."""
        return any(
            self.body_tag.find("a", {"href": "#" + a_tag.attrs["name"]})
            for a_tag in tag_with_link.find_all("a", toc_anchor_attrs)
        )

    toc_links = self.body_tag.find_all("a", toc_anchor_attrs)
    # Parents are captured up front because the loop below rewrites the tree.
    headers = [link.parent for link in toc_links]
    outline_level = "1"  # All the unknown outlines will be predicted as <h1>
    for link, h_tag in zip(toc_links, headers):
        if re.search(r"^h\d$", h_tag.name):
            # Unwrap the TOC anchor itself: ``h_tag.a`` is merely the first
            # <a> of the heading and may be an unrelated link.
            link.unwrap()
            # TODO: add prediction of the outline level (e.g. h_tag.name[-1])
        elif h_tag.name == "p":
            exist_in_toc = _check_parent_link_exist_in_toc(h_tag)
            # The membership test skips paragraphs that an earlier iteration
            # already replaced (several TOC anchors may share one parent).
            if h_tag in self.body_tag.find_all("p") and exist_in_toc:
                new_tag = BeautifulSoup(
                    features="lxml").new_tag("h" + outline_level)
                text = h_tag.text
                h_tag.replace_with(new_tag)
                new_tag.string = text
        else:
            # rethink document structure when you have toc_links, other cases?
            self.logger_object.log(f"Something went wrong in processing toc_links."
                                   f" Check the structure of the file. "
                                   f"Tag name: {h_tag.name}")
def _clean_tag(self, tag: str, attr_name: str, attr_value: re): def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
# todo regex # todo regex
""" """
@@ -48,12 +80,12 @@ class HTMLDocxPreprocessor:
"""Function cleans meaningless <u> tags before links.""" """Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u") underlines = self.body_tag.find_all("u")
for u in underlines: for u in underlines:
if u.find_all('a'): if u.find_all("a"):
u.unwrap() u.unwrap()
links = self.body_tag.find_all('a') links = self.body_tag.find_all("a")
for link in links: for link in links:
u = link.find_all('u') u = link.find_all("u")
if u and len(u) == 1: if u and len(u) == 1:
u[0].unwrap() u[0].unwrap()
@@ -81,16 +113,12 @@ class HTMLDocxPreprocessor:
""" """
size = re.search(r"font-size: (\d{1,3})pt", style) size = re.search(r"font-size: (\d{1,3})pt", style)
if size is None: if size is None:
return style return style
size = size.group(1) size = size.group(1)
new_size = cls.convert_pt_to_px(size) new_size = cls.convert_pt_to_px(size)
if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
return "" return ""
return re.sub(size + "pt", str(new_size) + "px", style) return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self): def _font_to_span(self):
@@ -108,10 +136,10 @@ class HTMLDocxPreprocessor:
style = self.convert_font_pt_to_px(style) style = self.convert_font_pt_to_px(style)
if style != "": if style != "":
if color and color in LiveCartaConfig.COLORS_MAP: if color and color in LiveCartaConfig.COLORS_MAP:
style += f'; color: {color};' style += f"; color: {color};"
font.attrs["style"] = style font.attrs["style"] = style
elif color and color in LiveCartaConfig.COLORS_MAP: elif color and color in LiveCartaConfig.COLORS_MAP:
font.attrs["style"] = f'color: {color};' font.attrs["style"] = f"color: {color};"
if len(font.attrs) == 0: if len(font.attrs) == 0:
font.unwrap() font.unwrap()
@@ -121,16 +149,16 @@ class HTMLDocxPreprocessor:
def clean_trash(self): def clean_trash(self):
# todo make it regex dict # todo make it regex dict
"""Function to remove all styles and tags we don't need.""" """Function to remove all styles and tags we don"t need."""
self._clean_tag('span', 'style', re.compile( self._clean_tag("span", "style", re.compile(
r'^background: #[\da-fA-F]{6}$')) r"^background: #[\da-fA-F]{6}$"))
# todo: check for another languages # todo: check for another languages
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) self._clean_tag("span", "lang", re.compile(r"^ru-RU$"))
self._clean_tag('span', 'style', re.compile( self._clean_tag("span", "style", re.compile(
'^letter-spacing: -?[\d.]+pt$')) "^letter-spacing: -?[\d.]+pt$"))
self._clean_tag('font', 'face', re.compile( self._clean_tag("font", "face", re.compile(
r'^Times New Roman[\w, ]+$')) r"^Times New Roman[\w, ]+$"))
self._clean_tag("a", "name", "_GoBack") self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links() self._clean_underline_links()
@@ -139,60 +167,68 @@ class HTMLDocxPreprocessor:
# replace toc with empty <TOC> tag # replace toc with empty <TOC> tag
tables = self.body_tag.find_all( tables = self.body_tag.find_all(
"div", id=re.compile(r'^Table of Contents\d+')) "div", id=re.compile(r"^Table of Contents\d+"))
for table in tables: for table in tables:
table.wrap(self.html_soup.new_tag("TOC")) table.wrap(self.html_soup.new_tag("TOC"))
table.decompose() table.decompose()
def _preprocessing_headings(self):
    """Demote headings deeper than the supported levels to plain paragraphs.

    Tags named ``h<N>``, with N running from
    ``LiveCartaConfig.SUPPORTED_LEVELS + 1`` up to 9, are renamed to ``p``.
    Only the tag name changes; attributes and contents are left as they are.
    """
    # todo regex
    unsupported_heading = re.compile(
        f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$")
    for heading in self.body_tag.find_all(unsupported_heading):
        heading.name = "p"
def _process_paragraph(self): def _process_paragraph(self):
"""Function to process <p> tags (text-align and text-indent value).""" """Function to process <p> tags (text-align and text-indent value)."""
paragraphs = self.body_tag.find_all('p') paragraphs = self.body_tag.find_all("p")
for p in paragraphs: for p in paragraphs:
# libre converts some \n into <p> with 2 </br> # libre converts some \n into <p> with 2 </br>
# there we remove 1 unnecessary <br> # there we remove 1 unnecessary <br>
brs = p.find_all('br') brs = p.find_all("br")
text = p.text text = p.text
if brs and text == '\n\n' and len(brs) == 2: if brs and text == "\n\n" and len(brs) == 2:
brs[0].decompose() brs[0].decompose()
indent_should_be_added = False indent_should_be_added = False
if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')): if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")):
indent_should_be_added = True indent_should_be_added = True
align = p.get('align') align = p.get("align")
style = p.get('style') style = p.get("style")
if style: if style:
indent = re.search(r'text-indent: ([\d.]{1,4})in', style) indent = re.search(r"text-indent: ([\d.]{1,4})in", style)
margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style) margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style)
margin_right = re.search( margin_right = re.search(
r'margin-right: ([\d.]{1,4})in', style) r"margin-right: ([\d.]{1,4})in", style)
margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style) margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style)
margin_bottom = re.search( margin_bottom = re.search(
r'margin-bottom: ([\d.]{1,4})in', style) r"margin-bottom: ([\d.]{1,4})in", style)
else: else:
indent = margin_left = margin_right = \ indent = margin_left = margin_right = \
margin_top = margin_bottom = None margin_top = margin_bottom = None
if margin_left and margin_right and margin_top and margin_bottom and \ if margin_left and margin_right and margin_top and margin_bottom and \
margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \
margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11': margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11":
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
p.attrs = {} p.attrs = {}
style = '' style = ""
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};' style += f"text-align: {align};"
if indent is not None or indent_should_be_added: if indent is not None or indent_should_be_added:
# indent = indent.group(1) # indent = indent.group(1)
style += f'text-indent: {LiveCartaConfig.INDENT};' style += f"text-indent: {LiveCartaConfig.INDENT};"
if style: if style:
p.attrs['style'] = style p.attrs["style"] = style
def _process_two_columns(self): def _process_two_columns(self):
"""Function to process paragraphs which has two columns layout.""" """Function to process paragraphs which has two columns layout."""
@@ -203,40 +239,6 @@ class HTMLDocxPreprocessor:
child["class"] = "columns2" child["class"] = "columns2"
div.unwrap() div.unwrap()
def _process_tables(self):
    """Clean ``<td>`` attributes and set the ``border`` attribute of every table.

    For each ``<table>`` in ``self.body_tag``:

    * border widths declared in the ``style`` of its ``<td>`` cells
      (``border: <number>pt`` or ``border: <number>px``) are collected;
      ``pt`` values are passed through ``self.convert_pt_to_px``;
    * every cell attribute except ``width`` is dropped;
    * when at least one width was found, the table ``border`` attribute is
      set to the average width.

    The number of tables found is stored in ``self.tables_amount``.
    """
    border_re = re.compile(r"border: ?(\d+\.?\d*)(p[tx])")
    tables = self.body_tag.find_all("table")
    for table in tables:
        sizes = []
        for td in table.find_all("td"):
            style = td.get('style')
            match = border_re.search(style) if style else None
            if match:
                size, units = match.groups()
                if units == "pt":
                    size = self.convert_pt_to_px(size)
                sizes.append(float(size))

            # Drop every attribute of the cell but keep its width.
            width = td.get('width')
            td.attrs = {}
            if width:
                td.attrs['width'] = width

        if sizes:
            border_size = sum(sizes) / len(sizes)
            border = f'{border_size:.2}'
            if 'e' in border:
                # Two-significant-digit formatting falls back to scientific
                # notation (e.g. "1.2e+01") for averages of about 10 and
                # above, which is not a usable attribute value.
                border = f'{border_size:.0f}'
            table.attrs['border'] = border

    self.tables_amount = len(tables)
def _process_quotes(self): def _process_quotes(self):
""" """
Function to process block quotes. Function to process block quotes.
@@ -259,9 +261,9 @@ class HTMLDocxPreprocessor:
for table in tables: for table in tables:
trs = table.find_all("tr") trs = table.find_all("tr")
tds = table.find_all("td") tds = table.find_all("td")
if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600': if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
td = tds[0] td = tds[0]
is_zero_border = 'border: none;' in td.get('style') is_zero_border = "border: none;" in td.get("style")
paragraphs = td.find_all("p") paragraphs = td.find_all("p")
has_i_tag_or_br = [(p.i, p.br) for p in paragraphs] has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
has_i_tag_or_br = [x[0] is not None or x[1] is not None has_i_tag_or_br = [x[0] is not None or x[1] is not None
@@ -269,27 +271,61 @@ class HTMLDocxPreprocessor:
if all(has_i_tag_or_br) and is_zero_border: if all(has_i_tag_or_br) and is_zero_border:
new_div = BeautifulSoup( new_div = BeautifulSoup(
features='lxml').new_tag('blockquote') features="lxml").new_tag("blockquote")
for p in paragraphs: for p in paragraphs:
new_div.append(p) new_div.append(p)
table.replaceWith(new_div) table.replaceWith(new_div)
def _process_tables(self):
    """Clean ``<td>`` attributes and set the ``border`` attribute of every table.

    For each ``<table>`` in ``self.body_tag``:

    * border widths declared in the ``style`` of its ``<td>`` cells
      (``border: <number>pt`` or ``border: <number>px``) are collected;
      ``pt`` values are passed through ``self.convert_pt_to_px``;
    * every cell attribute except ``width`` is dropped;
    * when at least one width was found, the table ``border`` attribute is
      set to the average width.

    The number of tables found is stored in ``self.tables_amount``.
    """
    border_re = re.compile(r"border: ?(\d+\.?\d*)(p[tx])")
    tables = self.body_tag.find_all("table")
    for table in tables:
        sizes = []
        for td in table.find_all("td"):
            style = td.get("style")
            match = border_re.search(style) if style else None
            if match:
                size, units = match.groups()
                if units == "pt":
                    size = self.convert_pt_to_px(size)
                sizes.append(float(size))

            # Drop every attribute of the cell but keep its width.
            width = td.get("width")
            td.attrs = {}
            if width:
                td.attrs["width"] = width

        if sizes:
            border_size = sum(sizes) / len(sizes)
            border = f"{border_size:.2}"
            if "e" in border:
                # Two-significant-digit formatting falls back to scientific
                # notation (e.g. "1.2e+01") for averages of about 10 and
                # above, which is not a usable attribute value.
                border = f"{border_size:.0f}"
            table.attrs["border"] = border

    self.tables_amount = len(tables)
def _process_hrefs(self): def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all( a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^.*http.+')}) "a", {"href": re.compile("^.*http.+")})
# remove char=end of file for some editors # remove char=end of file for some editors
for tag in a_tags_with_href: for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace("\u200c", "")
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
a_tags_with_href = self.body_tag.find_all( a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^(?!#sdfootnote)')}) "a", {"href": re.compile("^(?!#sdfootnote)")})
for tag in a_tags_with_href: for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace("\u200c", "")
tag.string = tag.text.replace('\u200b', '') # zero-width-space tag.string = tag.text.replace("\u200b", "") # zero-width-space
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
def _process_footer(self): def _process_footer(self):
# todo regex # todo regex
@@ -297,7 +333,7 @@ class HTMLDocxPreprocessor:
Function to process <div title="footer"> tags. Function to process <div title="footer"> tags.
All the tags will be deleted from file. All the tags will be deleted from file.
""" """
divs = self.body_tag.find_all('div', {'title': 'footer'}) divs = self.body_tag.find_all("div", {"title": "footer"})
for div in divs: for div in divs:
div.decompose() div.decompose()
@@ -305,90 +341,9 @@ class HTMLDocxPreprocessor:
# todo regex # todo regex
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.""" """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div") divs = self.body_tag.find_all("div")
for div in divs: for div in divs:
div.unwrap() div.unwrap()
def _check_parent_link_exist_in_toc(self, tag_with_link):
    """Tell whether a tag holds a TOC anchor that some link in the body points to.

    Looks at every ``<a name="_Toc...">`` inside ``tag_with_link`` and searches
    ``self.body_tag`` for an ``<a href="#<name>">`` element.

    Returns
    -------
    bool
        True when at least one such referencing link exists.
    """
    anchors = tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')})
    return any(
        self.body_tag.find("a", {'href': '#' + anchor.attrs['name']})
        for anchor in anchors
    )
def _process_toc_links(self):
    """Detect headers through TOC anchors and strip those anchors from the document.

    Every ``<a name="_Toc...">`` anchor found in ``self.body_tag`` is handled
    according to its direct parent:

    * ``<h1>``..``<h9>`` parent: the TOC anchor itself is unwrapped, so its
      children stay inside the heading.
    * ``<p>`` parent: when the paragraph is still found in the body and
      ``self._check_parent_link_exist_in_toc`` confirms it, the paragraph is
      replaced by an ``<h1>`` that holds only the paragraph text.
    * any other parent: the case is logged and the tag is left untouched.
    """
    toc_links = self.body_tag.find_all(
        "a", {'name': re.compile(r'^_Toc\d+')})
    # Parents are captured up front because the loop below rewrites the tree.
    headers = [link.parent for link in toc_links]
    outline_level = "1"  # All the unknown outlines will be predicted as <h1>
    for link, h_tag in zip(toc_links, headers):
        if re.search(r"^h\d$", h_tag.name):
            # Unwrap the TOC anchor itself: ``h_tag.a`` is merely the first
            # <a> of the heading and may be an unrelated link.
            link.unwrap()
            # TODO: add prediction of the outline level (e.g. h_tag.name[-1])
        elif h_tag.name == "p":
            exist_in_toc = self._check_parent_link_exist_in_toc(h_tag)
            # The membership test skips paragraphs that an earlier iteration
            # already replaced (several TOC anchors may share one parent).
            if h_tag in self.body_tag.find_all("p") and exist_in_toc:
                new_tag = BeautifulSoup(
                    features="lxml").new_tag("h" + outline_level)
                text = h_tag.text
                h_tag.replace_with(new_tag)
                new_tag.string = text
        else:
            # rethink document structure when you have toc_links, other cases?
            self.logger_object.log(f'Something went wrong in processing toc_links.'
                                   f' Check the structure of the file. '
                                   f'Tag name: {h_tag.name}')
@staticmethod
def clean_title_from_numbering(title: str):
"""Function to remove digits from headers."""
title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title
# title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
@staticmethod
def clean_tag_from_tabs(tag: NavigableString):
    """Collapse each run of whitespace in a text node into one space.

    A new ``NavigableString`` with the collapsed text is created and put in
    place of ``tag`` via ``replace_with``. Nothing is returned.
    """
    collapsed_text = re.sub(r'(\s+)+', ' ', tag)
    replacement = BeautifulSoup(features="lxml").new_string(
        collapsed_text, NavigableString)
    tag.replace_with(replacement)
def clean_tag_from_numbering(self, tag):
    """Replace a text node with its ``clean_title_from_numbering`` result.

    The cleaned text is wrapped into a fresh ``NavigableString`` that takes
    the place of ``tag`` via ``replace_with``. Nothing is returned.
    """
    cleaned_text = self.clean_title_from_numbering(tag)
    replacement = BeautifulSoup(features="lxml").new_string(
        cleaned_text, NavigableString)
    tag.replace_with(replacement)
def apply_func_to_last_child(self, tag, func=None):
    """
    Descend through first children and call ``func`` on the text node reached.

    Works only with constructions like (((child to work with)))
    where child is object of NavigableString. At every level only the
    FIRST child is followed; if a node without children is reached before
    a NavigableString, nothing is called.
    """
    node = tag
    # Exact type check on purpose: subclasses of NavigableString are not
    # treated as the target text node.
    while type(node) is not NavigableString:
        nested = list(node.children)
        if not nested:
            return
        node = nested[0]
    func(node)
def _preprocessing_headings(self):
    """Demote headings deeper than the supported levels to plain paragraphs.

    Tags named ``h<N>``, with N running from
    ``LiveCartaConfig.SUPPORTED_LEVELS + 1`` up to 9, are renamed to ``p``.
    Only the tag name changes; attributes and contents are left as they are.
    """
    # todo regex
    unsupported_heading = re.compile(
        f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$')
    for heading in self.body_tag.find_all(unsupported_heading):
        heading.name = 'p'
def _get_top_level_headers(self): def _get_top_level_headers(self):
""" """
Function for gathering info about top-level chapters. Function for gathering info about top-level chapters.
@@ -416,27 +371,26 @@ class HTMLDocxPreprocessor:
tag.parent.unwrap() tag.parent.unwrap()
title = tag.text title = tag.text
title = re.sub(r'\s+', ' ', title).strip() title = re.sub(r"\s+", " ", title).strip()
number = re.match(r'^(?:\.?\d+\.? ?)+', title) number = re.match(r"^(?:\.?\d+\.? ?)+", title)
is_numbered = number is not None is_numbered = number is not None
cleaned_title = self.clean_title_from_numbering(tag.text) cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
is_introduction = cleaned_title.lower() == 'introduction' is_introduction = cleaned_title.lower() == "introduction"
headers_info.append({ headers_info.append({
'title': cleaned_title, "title": cleaned_title,
'is_numbered': is_numbered, "is_numbered": is_numbered,
'is_introduction': is_introduction}) "is_introduction": is_introduction})
return headers_info return headers_info
def _mark_introduction_headers(self): def _mark_introduction_headers(self):
""" """
Function to find out: Function to find out:
what header shouldn't be numbered and can be treated as introduction chapter what header shouldn"t be numbered and can be treated as introduction chapter
Assume header(s) to be introduction if: Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header 1. one header not numbered, before 1 numbered header
2. it is first header from the top level list, and it equals to 'introduction' 2. it is first header from the top level list, and it equals to "introduction"
Returns Returns
------- -------
@@ -444,9 +398,9 @@ class HTMLDocxPreprocessor:
mark each top-level header with flag should_be_numbered = true/false mark each top-level header with flag should_be_numbered = true/false
""" """
is_numbered_header = [header['is_numbered'] is_numbered_header = [header["is_numbered"]
for header in self.top_level_headers] for header in self.top_level_headers]
is_title = [header['is_introduction'] is_title = [header["is_introduction"]
for header in self.top_level_headers] for header in self.top_level_headers]
first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
@@ -454,12 +408,31 @@ class HTMLDocxPreprocessor:
first_header_is_introduction = is_title and is_title[0] first_header_is_introduction = is_title and is_title[0]
if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction: if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction:
self.top_level_headers[0]['should_be_numbered'] = False self.top_level_headers[0]["should_be_numbered"] = False
for i in range(1, len(self.top_level_headers)): for i in range(1, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]["should_be_numbered"] = True
else: else:
for i in range(0, len(self.top_level_headers)): for i in range(0, len(self.top_level_headers)):
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]["should_be_numbered"] = True
@staticmethod
def clean_title_from_tabs(tag: NavigableString):
    """Turn every whitespace character of a text node into a plain space.

    Each character matched by ``[\\s\\xa0]`` is replaced one-for-one (runs are
    not collapsed). The result is wrapped into a fresh ``NavigableString``
    that takes the place of ``tag`` via ``replace_with``. Nothing is returned.
    """
    spaced_text = re.sub(r"[\s\xa0]", " ", tag)
    replacement = BeautifulSoup(features="lxml").new_string(
        spaced_text, NavigableString)
    tag.replace_with(replacement)
def apply_func_to_last_child(self, tag, func=None):
    """
    Descend through first children and call ``func`` on the text node reached.

    Works only with constructions like (((child to work with)))
    where child is object of NavigableString. At every level only the
    FIRST child is followed; if a node without children is reached before
    a NavigableString, nothing is called.
    """
    node = tag
    # Exact type check on purpose: subclasses of NavigableString are not
    # treated as the target text node.
    while type(node) is not NavigableString:
        nested = list(node.children)
        if not nested:
            return
        node = nested[0]
    func(node)
def _process_headings(self): def _process_headings(self):
# todo regex # todo regex
@@ -499,44 +472,33 @@ class HTMLDocxPreprocessor:
while tag.parent.name == "ol": while tag.parent.name == "ol":
tag.parent.unwrap() tag.parent.unwrap()
title = tag.text cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
title = self.clean_title_from_numbering(title) if cleaned_title == "":
if title == "":
tag.unwrap() tag.unwrap()
else: else:
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
content = list(tag.children) content = list(tag.children)
# do not take into account rubbish empty tags like <a>, but don't remove them # do not take into account rubbish empty tags like <a>, but don"t remove them
content = [item for item in content if content = [item for item in content if
(type(item) is not NavigableString and item.text != '') (type(item) is not NavigableString and item.text != "")
or (type(item) is NavigableString)] or (type(item) is NavigableString)]
content[0] = "" if content[0] == " " else content[0]
content = [item for item in content if item != ""]
for i, item in enumerate(content): for i, item in enumerate(content):
if type(content[i]) is NavigableString: if type(content[i]) is NavigableString:
cleaned = re.sub(r'(\s+)+', ' ', content[i]) cleaned = re.sub(r"(\s+)+", " ", content[i])
this = BeautifulSoup.new_string(BeautifulSoup( this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString) features="lxml"), cleaned, NavigableString)
content[i].replace_with(this) content[i].replace_with(this)
content[i] = this content[i] = this
else: else:
self.apply_func_to_last_child( self.apply_func_to_last_child(
content[i], self.clean_tag_from_tabs) content[i], self.clean_title_from_tabs)
content[0] = '' if content[0] == ' ' else content[0]
content = [item for item in content if item != '']
if type(content[0]) is NavigableString:
cleaned = self.clean_title_from_numbering(content[0])
this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[0].replace_with(this)
content[0] = this
else:
self.apply_func_to_last_child(
content[0], self.clean_tag_from_numbering)
def _process_lists(self): def _process_lists(self):
# todo regex # todo regex
@@ -551,81 +513,76 @@ class HTMLDocxPreprocessor:
uwrap <p> tag with li uwrap <p> tag with li
""" """
li_tags = self.body_tag.find_all("li") li_tags = self.body_tag.find_all("li")
for li_tag in li_tags: for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs) li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap() li_tag.p.unwrap()
def delete_content_before_toc(self): def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated # remove all tag upper the <TOC> only in content !!! body tag is not updated
toc_tag = self.html_soup.new_tag('TOC') toc_tag = self.html_soup.new_tag("TOC")
self.content: List[Tag] = self.body_tag.find_all(recursive=False)
if toc_tag in self.content: if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1 ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:] self.content = self.content[ind:]
def process_html(self, access=None, html_path='', book_id=0): def process_html(self, access=None, html_path="", book_id=0):
"""Process html code to satisfy LiveCarta formatting.""" """Process html code to satisfy LiveCarta formatting."""
self.logger_object.log('Beginning of processing .html file.') self.logger_object.log("Beginning of processing .html file.")
try: try:
self.logger_object.log(f'Processing TOC and headers.') self.logger_object.log(f"Processing TOC and headers.")
self._process_toc_links() self._process_toc_links()
self.clean_trash() self.clean_trash()
# process main elements of the .html doc # process main elements of the .html doc
self.logger_object.log(f'Processing main elements of html.') self.logger_object.log(f"Processing main elements of html.")
self._preprocessing_headings() self._preprocessing_headings()
self._process_paragraph() self._process_paragraph()
self._process_two_columns() self._process_two_columns()
self.logger_object.log('Block quotes processing.') self.logger_object.log("Block quotes processing.")
self._process_quotes() self._process_quotes()
self.logger_object.log('Tables processing.') self.logger_object.log("Tables processing.")
self._process_tables() self._process_tables()
self.logger_object.log( self.logger_object.log(
f'{self.tables_amount} tables have been processed.') f"{self.tables_amount} tables have been processed.")
self.logger_object.log('Hrefs processing.') self.logger_object.log("Hrefs processing.")
self._process_hrefs() self._process_hrefs()
self.logger_object.log('Footnotes processing.') self.logger_object.log("Footnotes processing.")
self.footnotes = process_footnotes(self.body_tag) self.footnotes = process_footnotes(self.body_tag)
self.logger_object.log( self.logger_object.log(
f'{len(self.footnotes)} footnotes have been processed.') f"{len(self.footnotes)} footnotes have been processed.")
self.logger_object.log('Image processing.') self.logger_object.log("Image processing.")
self.images = process_images(access=access, html_path=html_path, self.images = process_images(access=access, html_path=html_path,
book_id=book_id, body_tag=self.body_tag) book_id=book_id, body_tag=self.body_tag)
self.logger_object.log( self.logger_object.log(
f'{len(self.images)} images have been processed.') f"{len(self.images)} images have been processed.")
self._process_footer() self._process_footer()
self._process_div() self._process_div()
self.content = self.body_tag.find_all(recursive=False)
self.top_level_headers = self._get_top_level_headers() self.top_level_headers = self._get_top_level_headers()
self._mark_introduction_headers() self._mark_introduction_headers()
self._process_headings() self._process_headings()
self.content: List[Tag] = self.body_tag.find_all(recursive=False)
self._process_lists() self._process_lists()
# delete text before table of content if exists # delete text before table of content if exists
self.delete_content_before_toc() self.delete_content_before_toc()
except Exception as exc: except Exception as exc:
self.logger_object.log( self.logger_object.log(
'Error has occurred while processing html.', logging.ERROR) "Error has occurred while processing html.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
if self.status_wrapper: if self.status_wrapper:
self.status_wrapper.set_error() self.status_wrapper.set_error()
raise exc raise exc
self.logger_object.log('End of processing .html file.') self.logger_object.log("End of processing .html file.")
return self.content, self.footnotes, self.top_level_headers return self.content, self.footnotes, self.top_level_headers