diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py index fbb24fe..56fe2f7 100644 --- a/src/docx_converter/docx2libre_html.py +++ b/src/docx_converter/docx2libre_html.py @@ -66,7 +66,6 @@ class Docx2LibreHTML: raise error self.logger_object.log(f"File - {self.file_path}.") - print(f"{self.file_path}") self.logger_object.log("Beginning of conversion from .docx to .html.") check_file_exists( @@ -74,7 +73,7 @@ class Docx2LibreHTML: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}") + out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}") pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) try: diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 9f1735b..6260edb 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -34,9 +34,9 @@ class DocxBook(BookSolver): """ # 1. Converts docx to html with LibreOffice - html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, + html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access, self.logger_object, self.libre_locker) - # TODO presets + # todo presets # 2. Parses and cleans html, gets list of tags, gets footnotes parser = HTMLDocxPreprocessor( @@ -53,7 +53,7 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = '../../docx/music_inquiry.docx' + docx_file_path = '../../books/docx/music_inquiry.docx' logger_object = BookLogger( name='docx', book_id=docx_file_path.split('/')[-1]) locker = Event() diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index beb6d15..c269b73 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -1,7 +1,7 @@ import re from bs4 import BeautifulSoup, NavigableString -@staticmethod + def _clean_footnote_content(content): content = content.strip() return content.strip() diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index 046166f..a44df01 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -11,7 +11,7 @@ from src.docx_converter.image_processing import process_images class HTMLDocxPreprocessor: - + def __init__(self, html_soup, logger_object, status_wrapper=None): self.body_tag = html_soup.body self.html_soup = html_soup @@ -20,6 +20,38 @@ class HTMLDocxPreprocessor: self.top_level_headers = None self.content = list() + def _process_toc_links(self): + def _check_parent_link_exist_in_toc(tag_with_link): + toc_links = [] + for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}): + link_name = a_tag.attrs["name"] + toc_item = self.body_tag.find("a", {"href": "#" + link_name}) + if toc_item: + toc_links.append(toc_item) + return len(toc_links) > 0 + """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" + toc_links = self.body_tag.find_all( + "a", {"name": re.compile(r"^_Toc\d+")}) + headers = [link.parent for link in toc_links] + outline_level = "1" # All the unknown outlines will be predicted as
tags (text-align and text-indent value).""" - paragraphs = self.body_tag.find_all('p') + paragraphs = self.body_tag.find_all("p") for p in paragraphs: # libre converts some \n into
with 2
# there we remove 1 unnecessary
- brs = p.find_all('br')
+ brs = p.find_all("br")
text = p.text
- if brs and text == '\n\n' and len(brs) == 2:
+ if brs and text == "\n\n" and len(brs) == 2:
brs[0].decompose()
indent_should_be_added = False
- if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')):
+ if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")):
indent_should_be_added = True
- align = p.get('align')
- style = p.get('style')
+ align = p.get("align")
+ style = p.get("style")
if style:
- indent = re.search(r'text-indent: ([\d.]{1,4})in', style)
- margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style)
+ indent = re.search(r"text-indent: ([\d.]{1,4})in", style)
+ margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style)
margin_right = re.search(
- r'margin-right: ([\d.]{1,4})in', style)
- margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style)
+ r"margin-right: ([\d.]{1,4})in", style)
+ margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style)
margin_bottom = re.search(
- r'margin-bottom: ([\d.]{1,4})in', style)
+ r"margin-bottom: ([\d.]{1,4})in", style)
else:
indent = margin_left = margin_right = \
margin_top = margin_bottom = None
if margin_left and margin_right and margin_top and margin_bottom and \
- margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \
- margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11':
- p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
+ margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \
+ margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11":
+ p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
p.attrs = {}
- style = ''
+ style = ""
if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE:
- style += f'text-align: {align};'
+ style += f"text-align: {align};"
if indent is not None or indent_should_be_added:
# indent = indent.group(1)
- style += f'text-indent: {LiveCartaConfig.INDENT};'
+ style += f"text-indent: {LiveCartaConfig.INDENT};"
if style:
- p.attrs['style'] = style
+ p.attrs["style"] = style
def _process_two_columns(self):
"""Function to process paragraphs which has two columns layout."""
@@ -203,40 +239,6 @@ class HTMLDocxPreprocessor:
child["class"] = "columns2"
div.unwrap()
- def _process_tables(self):
- """Function to process tables. Set "border" attribute."""
- tables = self.body_tag.find_all("table")
- for table in tables:
- tds = table.find_all("td")
-
- sizes = []
- for td in tds:
- style = td.get('style')
-
- if style:
- match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
-
- if match:
- size = match.group(1)
- units = match.group(2)
-
- if units == "pt":
- size = self.convert_pt_to_px(size)
-
- sizes.append(float(size))
-
- width = td.get('width')
-
- td.attrs = {}
- if width:
- td.attrs['width'] = width
-
- if sizes:
- border_size = sum(sizes) / len(sizes)
- table.attrs['border'] = f'{border_size:.2}'
-
- self.tables_amount = len(tables)
-
def _process_quotes(self):
"""
Function to process block quotes.
@@ -259,9 +261,9 @@ class HTMLDocxPreprocessor:
for table in tables:
trs = table.find_all("tr")
tds = table.find_all("td")
- if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600':
+ if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600":
td = tds[0]
- is_zero_border = 'border: none;' in td.get('style')
+ is_zero_border = "border: none;" in td.get("style")
paragraphs = td.find_all("p")
has_i_tag_or_br = [(p.i, p.br) for p in paragraphs]
has_i_tag_or_br = [x[0] is not None or x[1] is not None
@@ -269,27 +271,61 @@ class HTMLDocxPreprocessor:
if all(has_i_tag_or_br) and is_zero_border:
new_div = BeautifulSoup(
- features='lxml').new_tag('blockquote')
+ features="lxml").new_tag("blockquote")
for p in paragraphs:
new_div.append(p)
table.replaceWith(new_div)
+ def _process_tables(self):
+ """Function to process tables. Set "border" attribute."""
+ tables = self.body_tag.find_all("table")
+ for table in tables:
+ tds = table.find_all("td")
+
+ sizes = []
+ for td in tds:
+ style = td.get("style")
+
+ if style:
+ match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
+
+ if match:
+ size = match.group(1)
+ units = match.group(2)
+
+ if units == "pt":
+ size = self.convert_pt_to_px(size)
+
+ sizes.append(float(size))
+
+ width = td.get("width")
+
+ td.attrs = {}
+ if width:
+ td.attrs["width"] = width
+
+ if sizes:
+ border_size = sum(sizes) / len(sizes)
+ table.attrs["border"] = f"{border_size:.2}"
+
+ self.tables_amount = len(tables)
+
def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all(
- 'a', {'href': re.compile('^.*http.+')})
+ "a", {"href": re.compile("^.*http.+")})
# remove char=end of file for some editors
for tag in a_tags_with_href:
- tag.string = tag.text.replace('\u200c', '')
- tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
+ tag.string = tag.text.replace("\u200c", "")
+ tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
a_tags_with_href = self.body_tag.find_all(
- 'a', {'href': re.compile('^(?!#sdfootnote)')})
+ "a", {"href": re.compile("^(?!#sdfootnote)")})
for tag in a_tags_with_href:
- tag.string = tag.text.replace('\u200c', '')
- tag.string = tag.text.replace('\u200b', '') # zero-width-space
- tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
+ tag.string = tag.text.replace("\u200c", "")
+ tag.string = tag.text.replace("\u200b", "") # zero-width-space
+ tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
def _process_footer(self):
# todo regex
@@ -297,7 +333,7 @@ class HTMLDocxPreprocessor:
Function to process
tag with li
"""
-
li_tags = self.body_tag.find_all("li")
-
for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap()
def delete_content_before_toc(self):
# remove all tag upper the