From 9f067bb93d0b2b4c3217a465954d4c360da2f4f8 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 24 Jun 2022 17:10:43 +0300 Subject: [PATCH] Make structure of docx c as epub s --- src/docx_converter/footnotes_processing.py | 73 +++++++++ src/docx_converter/html_docx_preprocessor.py | 148 +++---------------- src/docx_converter/image_processing.py | 39 +++++ 3 files changed, 134 insertions(+), 126 deletions(-) create mode 100644 src/docx_converter/footnotes_processing.py create mode 100644 src/docx_converter/image_processing.py diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py new file mode 100644 index 0000000..84861d7 --- /dev/null +++ b/src/docx_converter/footnotes_processing.py @@ -0,0 +1,73 @@ +import re +from bs4 import BeautifulSoup, NavigableString, Tag + +@staticmethod +def _clean_footnote_content(content): + content = content.strip() + return content.strip() + + +def process_footnotes(body_tag): + """Function returns list of footnotes and delete them from html_soup.""" + footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc') + footnote_content = body_tag.find_all( + 'div', id=re.compile(r'^sdfootnote\d+$')) + footnote_amt = len(footnote_anchors) + + assert footnote_amt == len(footnote_content), \ + 'Something went wrong with footnotes after libre conversion' + + footnotes = [] + + for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): + true_a_tag = cont_tag.find_all( + 'a', class_=re.compile(r'^sdfootnote.+$'))[0] + + if true_a_tag.attrs.get('href') is None: + cont_tag.a.decompose() + continue + + assert anc_tag['name'] == true_a_tag['href'][1:], \ + 'Something went wrong with footnotes after libre conversion' + + new_tag = BeautifulSoup(features='lxml').new_tag('sup') + new_tag['class'] = 'footnote-element' + new_tag['data-id'] = i + 1 + new_tag['id'] = f'footnote-{i + 1}' + new_tag.string = '*' + anc_tag.replace_with(new_tag) + + # extra digits in footnotes from documents downloaded from livecarta + a_text = true_a_tag.text + if len(cont_tag.find_all('p')): + sup = cont_tag.find_all('p')[0].find('sup') + if sup and sup.text == a_text: + sup.decompose() + + for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): + tag_a.decompose() + + # remove font-size + for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): + style = span.get('style') + style = re.sub(r"font-size: \d+px", "", style) + if style == '': + del span.attrs['style'] + else: + span.attrs['style'] = style + + unicode_string = '' + for child in cont_tag.children: + if type(child) is NavigableString: + continue + if child.name == 'blockquote': + unicode_string += str(child) + else: + unicode_string += child.decode_contents() + + content = _clean_footnote_content(unicode_string) + cont_tag.decompose() + + footnotes.append(content) + + return footnotes diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index 80d96a3..425fa10 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -1,14 +1,13 @@ -import os import re import logging -import pathlib from typing import List -from shutil import copyfile from bs4 import BeautifulSoup, NavigableString, Tag from src.livecarta_config import LiveCartaConfig from src.util.helpers import BookLogger, BookStatusWrapper +from src.docx_converter.footnotes_processing import process_footnotes +from src.docx_converter.image_processing import process_images class HTMLDocxPreprocessor: @@ -22,6 +21,7 @@ class HTMLDocxPreprocessor: self.content = list() def _clean_tag(self, tag: str, attr_name: str, attr_value: re): + # todo regex """ Function to clean tags by its name and attribute value. Parameters @@ -44,6 +44,7 @@ class HTMLDocxPreprocessor: tag.unwrap() def _clean_underline_links(self): + # todo regex """Function cleans meaningless tags before links.""" underlines = self.body_tag.find_all("u") for u in underlines: @@ -99,12 +100,10 @@ class HTMLDocxPreprocessor: """ fonts = self.body_tag.find_all("font") for font in fonts: - face = font.get("face") - style = font.get("style") - color = font.get("color") + face, style, color =\ + font.get("face"), font.get("style"), font.get("color") - font.attrs = {} - font.name = "span" + font.attrs, font.name = {}, "span" if style: style = self.convert_font_pt_to_px(style) if style != "": @@ -127,14 +126,8 @@ class HTMLDocxPreprocessor: # on this step there should be no more tags assert len(self.body_tag.find_all("font")) == 0 - def delete_content_before_toc(self): - # remove all tag upper the only in content !!! body tag is not updated - toc_tag = self.html_soup.new_tag('TOC') - if toc_tag in self.content: - ind = self.content.index(toc_tag) + 1 - self.content = self.content[ind:] - def clean_trash(self): + # todo make it regex dict """Function to remove all styles and tags we don't need.""" self._clean_tag('span', 'style', re.compile( r'^background: #[\da-fA-F]{6}$')) @@ -308,115 +301,8 @@ class HTMLDocxPreprocessor: tag.string = tag.text.replace('\u200b', '') # zero-width-space tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') - @staticmethod - def _clean_footnote_content(content): - content = content.strip() - return content.strip() - - def _process_footnotes(self): - """Function returns list of footnotes and delete them from html_soup.""" - footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') - footnote_content = self.body_tag.find_all( - 'div', id=re.compile(r'^sdfootnote\d+$')) - footnote_amt = len(footnote_anchors) - - assert footnote_amt == len(footnote_content), \ - 'Something went wrong with footnotes after libre conversion' - - footnotes = [] - - for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): - true_a_tag = cont_tag.find_all( - 'a', class_=re.compile(r'^sdfootnote.+$'))[0] - - if true_a_tag.attrs.get('href') is None: - cont_tag.a.decompose() - continue - - assert anc_tag['name'] == true_a_tag['href'][1:], \ - 'Something went wrong with footnotes after libre conversion' - - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' - anc_tag.replace_with(new_tag) - - # extra digits in footnotes from documents downloaded from livecarta - a_text = true_a_tag.text - if len(cont_tag.find_all('p')): - sup = cont_tag.find_all('p')[0].find('sup') - if sup and sup.text == a_text: - sup.decompose() - - for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): - tag_a.decompose() - - # remove font-size - for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): - style = span.get('style') - style = re.sub(r"font-size: \d+px", "", style) - if style == '': - del span.attrs['style'] - else: - span.attrs['style'] = style - - unicode_string = '' - for child in cont_tag.children: - if type(child) is NavigableString: - continue - if child.name == 'blockquote': - unicode_string += str(child) - else: - unicode_string += child.decode_contents() - - content = self._clean_footnote_content(unicode_string) - cont_tag.decompose() - - footnotes.append(content) - - self.footnotes = footnotes - - def _process_images(self, access, html_path, book_id): - """ - Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. - For now images are moved to one folder. - """ - img_tags = self.body_tag.find_all('img') - - if len(img_tags): - if access is None: - folder_path = os.path.dirname( - os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - for img in img_tags: - img_name = img.attrs.get('src') - # quick fix for bad links - if (len(img_name) >= 3) and img_name[:3] == '../': - img_name = img_name[3:] - - img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') - - if access is not None: - link = access.send_image(img_path, doc_id=book_id) - img.attrs['src'] = link - self.logger_object.log( - f'{img_name} successfully uploaded.') - else: - img_size = os.path.getsize(img_path) - self.logger_object.log( - f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG) - new_img_path = new_path / img_name - copyfile(img_path, new_img_path) - img.attrs["src"] = str(new_img_path) - - self.images = img_tags - def _process_footer(self): + # todo regex """ Function to process
tags. All the tags will be deleted from file. @@ -426,6 +312,7 @@ class HTMLDocxPreprocessor: div.decompose() def _process_div(self): + # todo regex """Function to process
tags. All the tags will be deleted from file, all content of the tags will stay.""" divs = self.body_tag.find_all("div") @@ -505,6 +392,7 @@ class HTMLDocxPreprocessor: self.apply_func_to_last_child(children[0], func) def _preprocessing_headings(self): + # todo regex """Function to convert all lower level headings to p tags""" pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = self.body_tag.find_all(re.compile(pattern)) @@ -584,6 +472,7 @@ class HTMLDocxPreprocessor: self.top_level_headers[i]['should_be_numbered'] = True def _process_headings(self): + # todo regex """ Function to process tags . Steps @@ -660,6 +549,7 @@ class HTMLDocxPreprocessor: content[0], self.clean_tag_from_numbering) def _process_lists(self): + # todo regex """ Function - process tags
  • . @@ -678,6 +568,13 @@ class HTMLDocxPreprocessor: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() + def delete_content_before_toc(self): + # remove all tag upper the only in content !!! body tag is not updated + toc_tag = self.html_soup.new_tag('TOC') + if toc_tag in self.content: + ind = self.content.index(toc_tag) + 1 + self.content = self.content[ind:] + def process_html(self, access=None, html_path='', book_id='local'): """Process html code to satisfy LiveCarta formatting.""" self.logger_object.log('Beginning of processing .html file.') @@ -705,13 +602,12 @@ class HTMLDocxPreprocessor: self._process_hrefs() self.logger_object.log('Footnotes processing.') - self._process_footnotes() + self.footnotes = process_footnotes(self.body_tag) self.logger_object.log( f'{len(self.footnotes)} footnotes have been processed.') self.logger_object.log('Image processing.') - self._process_images( - access=access, html_path=html_path, book_id=book_id) + self.images = process_images(self.body_tag, access=access, html_path=html_path, book_id=book_id) self.logger_object.log( f'{len(self.images)} images have been processed.') diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py new file mode 100644 index 0000000..923a274 --- /dev/null +++ b/src/docx_converter/image_processing.py @@ -0,0 +1,39 @@ +import os +import logging +import pathlib +from shutil import copyfile + + +def process_images(body_tag, access, html_path, book_id): + """ + Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. + For now images are moved to one folder. + """ + img_tags = body_tag.find_all('img') + + if len(img_tags): + if access is None: + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f'json/img_{book_id}/')) + new_path.mkdir(exist_ok=True) + + for img in img_tags: + img_name = img.attrs.get('src') + # quick fix for bad links + if (len(img_name) >= 3) and img_name[:3] == '../': + img_name = img_name[3:] + + img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') + + if access is not None: + link = access.send_image(img_path, doc_id=book_id) + img.attrs['src'] = link + else: + img_size = os.path.getsize(img_path) + new_img_path = new_path / img_name + copyfile(img_path, new_img_path) + img.attrs["src"] = str(new_img_path) + + return img_tags \ No newline at end of file