forked from LiveCarta/BookConverter
Make the structure of the docx converter the same as the epub converter's
This commit is contained in:
73
src/docx_converter/footnotes_processing.py
Normal file
73
src/docx_converter/footnotes_processing.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
|
def _clean_footnote_content(content):
    """Return the footnote's inner HTML with surrounding whitespace removed.

    Parameters
    ----------
    content : str
        Raw inner-HTML string of a footnote container.

    Returns
    -------
    str
        The stripped content.
    """
    # NOTE: the original carried a stray @staticmethod decorator left over
    # from the class this helper was extracted from; on Python < 3.10 that
    # made this module-level function uncallable, so it is removed here.
    # The redundant second .strip() is also collapsed into one call.
    return content.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def process_footnotes(body_tag):
    """Extract footnotes from *body_tag* and replace anchors with <sup> markers.

    Every LibreOffice footnote anchor (``<a class="sdfootnoteanc">``) is
    replaced in place with a ``<sup class="footnote-element">`` tag carrying a
    1-based ``data-id``/``id``, and the matching footnote container
    (``<div id="sdfootnoteN">``) is cleaned up and removed from the soup.

    Parameters
    ----------
    body_tag : bs4.Tag
        <body> tag of the LibreOffice-converted HTML document; modified
        in place.

    Returns
    -------
    list[str]
        Cleaned inner-HTML content of each footnote, in document order.

    Raises
    ------
    AssertionError
        If anchors and footnote divs do not line up after the LibreOffice
        conversion (assertions kept for compatibility with existing callers).
    """
    footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
    footnote_content = body_tag.find_all(
        'div', id=re.compile(r'^sdfootnote\d+$'))

    assert len(footnote_anchors) == len(footnote_content), \
        'Something went wrong with footnotes after libre conversion'

    # One throwaway soup is enough to mint new tags; the original built a
    # fresh BeautifulSoup object on every loop iteration.
    tag_factory = BeautifulSoup(features='lxml')
    font_size_style = re.compile('font-size')

    footnotes = []
    for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
        true_a_tag = cont_tag.find_all(
            'a', class_=re.compile(r'^sdfootnote.+$'))[0]

        # Footnote bodies without a back-reference href are dropped entirely.
        if true_a_tag.attrs.get('href') is None:
            cont_tag.a.decompose()
            continue

        assert anc_tag['name'] == true_a_tag['href'][1:], \
            'Something went wrong with footnotes after libre conversion'

        new_tag = tag_factory.new_tag('sup')
        new_tag['class'] = 'footnote-element'
        new_tag['data-id'] = i + 1
        new_tag['id'] = f'footnote-{i + 1}'
        new_tag.string = '*'
        anc_tag.replace_with(new_tag)

        # extra digits in footnotes from documents downloaded from livecarta
        paragraphs = cont_tag.find_all('p')  # original queried this twice
        if paragraphs:
            sup = paragraphs[0].find('sup')
            if sup and sup.text == true_a_tag.text:
                sup.decompose()

        for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
            tag_a.decompose()

        # remove font-size styling left over from the LibreOffice export
        for span in cont_tag.find_all('span', {'style': font_size_style}):
            style = re.sub(r"font-size: \d+px", "", span.get('style'))
            if style == '':
                del span.attrs['style']
            else:
                span.attrs['style'] = style

        # Serialize every element child: blockquotes keep their own wrapper,
        # other containers contribute only their inner HTML.
        unicode_string = ''
        for child in cont_tag.children:
            if type(child) is NavigableString:
                continue
            if child.name == 'blockquote':
                unicode_string += str(child)
            else:
                unicode_string += child.decode_contents()

        footnotes.append(_clean_footnote_content(unicode_string))
        cont_tag.decompose()

    return footnotes
|
||||||
@@ -1,14 +1,13 @@
|
|||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import pathlib
|
|
||||||
from typing import List
|
from typing import List
|
||||||
from shutil import copyfile
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.util.helpers import BookLogger, BookStatusWrapper
|
from src.util.helpers import BookLogger, BookStatusWrapper
|
||||||
|
from src.docx_converter.footnotes_processing import process_footnotes
|
||||||
|
from src.docx_converter.image_processing import process_images
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocxPreprocessor:
|
class HTMLDocxPreprocessor:
|
||||||
@@ -22,6 +21,7 @@ class HTMLDocxPreprocessor:
|
|||||||
self.content = list()
|
self.content = list()
|
||||||
|
|
||||||
def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
|
def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
|
||||||
|
# todo regex
|
||||||
"""
|
"""
|
||||||
Function to clean tags by its name and attribute value.
|
Function to clean tags by its name and attribute value.
|
||||||
Parameters
|
Parameters
|
||||||
@@ -44,6 +44,7 @@ class HTMLDocxPreprocessor:
|
|||||||
tag.unwrap()
|
tag.unwrap()
|
||||||
|
|
||||||
def _clean_underline_links(self):
|
def _clean_underline_links(self):
|
||||||
|
# todo regex
|
||||||
"""Function cleans meaningless <u> tags before links."""
|
"""Function cleans meaningless <u> tags before links."""
|
||||||
underlines = self.body_tag.find_all("u")
|
underlines = self.body_tag.find_all("u")
|
||||||
for u in underlines:
|
for u in underlines:
|
||||||
@@ -99,12 +100,10 @@ class HTMLDocxPreprocessor:
|
|||||||
"""
|
"""
|
||||||
fonts = self.body_tag.find_all("font")
|
fonts = self.body_tag.find_all("font")
|
||||||
for font in fonts:
|
for font in fonts:
|
||||||
face = font.get("face")
|
face, style, color =\
|
||||||
style = font.get("style")
|
font.get("face"), font.get("style"), font.get("color")
|
||||||
color = font.get("color")
|
|
||||||
|
|
||||||
font.attrs = {}
|
font.attrs, font.name = {}, "span"
|
||||||
font.name = "span"
|
|
||||||
if style:
|
if style:
|
||||||
style = self.convert_font_pt_to_px(style)
|
style = self.convert_font_pt_to_px(style)
|
||||||
if style != "":
|
if style != "":
|
||||||
@@ -127,14 +126,8 @@ class HTMLDocxPreprocessor:
|
|||||||
# on this step there should be no more <font> tags
|
# on this step there should be no more <font> tags
|
||||||
assert len(self.body_tag.find_all("font")) == 0
|
assert len(self.body_tag.find_all("font")) == 0
|
||||||
|
|
||||||
def delete_content_before_toc(self):
|
|
||||||
# remove all tag upper the <TOC> only in content !!! body tag is not updated
|
|
||||||
toc_tag = self.html_soup.new_tag('TOC')
|
|
||||||
if toc_tag in self.content:
|
|
||||||
ind = self.content.index(toc_tag) + 1
|
|
||||||
self.content = self.content[ind:]
|
|
||||||
|
|
||||||
def clean_trash(self):
|
def clean_trash(self):
|
||||||
|
# todo make it regex dict
|
||||||
"""Function to remove all styles and tags we don't need."""
|
"""Function to remove all styles and tags we don't need."""
|
||||||
self._clean_tag('span', 'style', re.compile(
|
self._clean_tag('span', 'style', re.compile(
|
||||||
r'^background: #[\da-fA-F]{6}$'))
|
r'^background: #[\da-fA-F]{6}$'))
|
||||||
@@ -308,115 +301,8 @@ class HTMLDocxPreprocessor:
|
|||||||
tag.string = tag.text.replace('\u200b', '') # zero-width-space
|
tag.string = tag.text.replace('\u200b', '') # zero-width-space
|
||||||
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
|
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _clean_footnote_content(content):
|
|
||||||
content = content.strip()
|
|
||||||
return content.strip()
|
|
||||||
|
|
||||||
def _process_footnotes(self):
|
|
||||||
"""Function returns list of footnotes and delete them from html_soup."""
|
|
||||||
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
|
|
||||||
footnote_content = self.body_tag.find_all(
|
|
||||||
'div', id=re.compile(r'^sdfootnote\d+$'))
|
|
||||||
footnote_amt = len(footnote_anchors)
|
|
||||||
|
|
||||||
assert footnote_amt == len(footnote_content), \
|
|
||||||
'Something went wrong with footnotes after libre conversion'
|
|
||||||
|
|
||||||
footnotes = []
|
|
||||||
|
|
||||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
|
|
||||||
true_a_tag = cont_tag.find_all(
|
|
||||||
'a', class_=re.compile(r'^sdfootnote.+$'))[0]
|
|
||||||
|
|
||||||
if true_a_tag.attrs.get('href') is None:
|
|
||||||
cont_tag.a.decompose()
|
|
||||||
continue
|
|
||||||
|
|
||||||
assert anc_tag['name'] == true_a_tag['href'][1:], \
|
|
||||||
'Something went wrong with footnotes after libre conversion'
|
|
||||||
|
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
|
||||||
new_tag['class'] = 'footnote-element'
|
|
||||||
new_tag['data-id'] = i + 1
|
|
||||||
new_tag['id'] = f'footnote-{i + 1}'
|
|
||||||
new_tag.string = '*'
|
|
||||||
anc_tag.replace_with(new_tag)
|
|
||||||
|
|
||||||
# extra digits in footnotes from documents downloaded from livecarta
|
|
||||||
a_text = true_a_tag.text
|
|
||||||
if len(cont_tag.find_all('p')):
|
|
||||||
sup = cont_tag.find_all('p')[0].find('sup')
|
|
||||||
if sup and sup.text == a_text:
|
|
||||||
sup.decompose()
|
|
||||||
|
|
||||||
for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
|
|
||||||
tag_a.decompose()
|
|
||||||
|
|
||||||
# remove font-size
|
|
||||||
for span in cont_tag.find_all('span', {'style': re.compile('font-size')}):
|
|
||||||
style = span.get('style')
|
|
||||||
style = re.sub(r"font-size: \d+px", "", style)
|
|
||||||
if style == '':
|
|
||||||
del span.attrs['style']
|
|
||||||
else:
|
|
||||||
span.attrs['style'] = style
|
|
||||||
|
|
||||||
unicode_string = ''
|
|
||||||
for child in cont_tag.children:
|
|
||||||
if type(child) is NavigableString:
|
|
||||||
continue
|
|
||||||
if child.name == 'blockquote':
|
|
||||||
unicode_string += str(child)
|
|
||||||
else:
|
|
||||||
unicode_string += child.decode_contents()
|
|
||||||
|
|
||||||
content = self._clean_footnote_content(unicode_string)
|
|
||||||
cont_tag.decompose()
|
|
||||||
|
|
||||||
footnotes.append(content)
|
|
||||||
|
|
||||||
self.footnotes = footnotes
|
|
||||||
|
|
||||||
def _process_images(self, access, html_path, book_id):
|
|
||||||
"""
|
|
||||||
Function to process <img> tag. Img should be sent Amazon S3 and then return new tag with valid link.
|
|
||||||
For now images are moved to one folder.
|
|
||||||
"""
|
|
||||||
img_tags = self.body_tag.find_all('img')
|
|
||||||
|
|
||||||
if len(img_tags):
|
|
||||||
if access is None:
|
|
||||||
folder_path = os.path.dirname(
|
|
||||||
os.path.dirname(os.path.abspath(__file__)))
|
|
||||||
new_path = pathlib.Path(os.path.join(
|
|
||||||
folder_path, f'json/img_{book_id}/'))
|
|
||||||
new_path.mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
for img in img_tags:
|
|
||||||
img_name = img.attrs.get('src')
|
|
||||||
# quick fix for bad links
|
|
||||||
if (len(img_name) >= 3) and img_name[:3] == '../':
|
|
||||||
img_name = img_name[3:]
|
|
||||||
|
|
||||||
img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}')
|
|
||||||
|
|
||||||
if access is not None:
|
|
||||||
link = access.send_image(img_path, doc_id=book_id)
|
|
||||||
img.attrs['src'] = link
|
|
||||||
self.logger_object.log(
|
|
||||||
f'{img_name} successfully uploaded.')
|
|
||||||
else:
|
|
||||||
img_size = os.path.getsize(img_path)
|
|
||||||
self.logger_object.log(
|
|
||||||
f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
|
|
||||||
new_img_path = new_path / img_name
|
|
||||||
copyfile(img_path, new_img_path)
|
|
||||||
img.attrs["src"] = str(new_img_path)
|
|
||||||
|
|
||||||
self.images = img_tags
|
|
||||||
|
|
||||||
def _process_footer(self):
|
def _process_footer(self):
|
||||||
|
# todo regex
|
||||||
"""
|
"""
|
||||||
Function to process <div title="footer"> tags.
|
Function to process <div title="footer"> tags.
|
||||||
All the tags will be deleted from file.
|
All the tags will be deleted from file.
|
||||||
@@ -426,6 +312,7 @@ class HTMLDocxPreprocessor:
|
|||||||
div.decompose()
|
div.decompose()
|
||||||
|
|
||||||
def _process_div(self):
|
def _process_div(self):
|
||||||
|
# todo regex
|
||||||
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
|
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
|
||||||
divs = self.body_tag.find_all("div")
|
divs = self.body_tag.find_all("div")
|
||||||
|
|
||||||
@@ -505,6 +392,7 @@ class HTMLDocxPreprocessor:
|
|||||||
self.apply_func_to_last_child(children[0], func)
|
self.apply_func_to_last_child(children[0], func)
|
||||||
|
|
||||||
def _preprocessing_headings(self):
|
def _preprocessing_headings(self):
|
||||||
|
# todo regex
|
||||||
"""Function to convert all lower level headings to p tags"""
|
"""Function to convert all lower level headings to p tags"""
|
||||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||||
header_tags = self.body_tag.find_all(re.compile(pattern))
|
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||||
@@ -584,6 +472,7 @@ class HTMLDocxPreprocessor:
|
|||||||
self.top_level_headers[i]['should_be_numbered'] = True
|
self.top_level_headers[i]['should_be_numbered'] = True
|
||||||
|
|
||||||
def _process_headings(self):
|
def _process_headings(self):
|
||||||
|
# todo regex
|
||||||
"""
|
"""
|
||||||
Function to process tags <h>.
|
Function to process tags <h>.
|
||||||
Steps
|
Steps
|
||||||
@@ -660,6 +549,7 @@ class HTMLDocxPreprocessor:
|
|||||||
content[0], self.clean_tag_from_numbering)
|
content[0], self.clean_tag_from_numbering)
|
||||||
|
|
||||||
def _process_lists(self):
|
def _process_lists(self):
|
||||||
|
# todo regex
|
||||||
"""
|
"""
|
||||||
Function
|
Function
|
||||||
- process tags <li>.
|
- process tags <li>.
|
||||||
@@ -678,6 +568,13 @@ class HTMLDocxPreprocessor:
|
|||||||
li_tag.attrs.update(li_tag.p.attrs)
|
li_tag.attrs.update(li_tag.p.attrs)
|
||||||
li_tag.p.unwrap()
|
li_tag.p.unwrap()
|
||||||
|
|
||||||
|
def delete_content_before_toc(self):
    """Drop everything in ``self.content`` that precedes the <TOC> marker.

    Only ``self.content`` is trimmed; ``self.body_tag`` is deliberately
    left untouched.
    """
    marker = self.html_soup.new_tag('TOC')
    if marker not in self.content:
        return
    start = self.content.index(marker) + 1
    self.content = self.content[start:]
|
||||||
|
|
||||||
def process_html(self, access=None, html_path='', book_id='local'):
|
def process_html(self, access=None, html_path='', book_id='local'):
|
||||||
"""Process html code to satisfy LiveCarta formatting."""
|
"""Process html code to satisfy LiveCarta formatting."""
|
||||||
self.logger_object.log('Beginning of processing .html file.')
|
self.logger_object.log('Beginning of processing .html file.')
|
||||||
@@ -705,13 +602,12 @@ class HTMLDocxPreprocessor:
|
|||||||
self._process_hrefs()
|
self._process_hrefs()
|
||||||
|
|
||||||
self.logger_object.log('Footnotes processing.')
|
self.logger_object.log('Footnotes processing.')
|
||||||
self._process_footnotes()
|
self.footnotes = process_footnotes(self.body_tag)
|
||||||
self.logger_object.log(
|
self.logger_object.log(
|
||||||
f'{len(self.footnotes)} footnotes have been processed.')
|
f'{len(self.footnotes)} footnotes have been processed.')
|
||||||
|
|
||||||
self.logger_object.log('Image processing.')
|
self.logger_object.log('Image processing.')
|
||||||
self._process_images(
|
self.images = process_images(self.body_tag, access=access, html_path=html_path, book_id=book_id)
|
||||||
access=access, html_path=html_path, book_id=book_id)
|
|
||||||
self.logger_object.log(
|
self.logger_object.log(
|
||||||
f'{len(self.images)} images have been processed.')
|
f'{len(self.images)} images have been processed.')
|
||||||
|
|
||||||
|
|||||||
39
src/docx_converter/image_processing.py
Normal file
39
src/docx_converter/image_processing.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import pathlib
|
||||||
|
from shutil import copyfile
|
||||||
|
|
||||||
|
|
||||||
|
def process_images(body_tag, access, html_path, book_id):
    """Process every <img> tag found under *body_tag*.

    When *access* is provided, each image is uploaded via
    ``access.send_image`` and the tag's ``src`` is rewritten to the returned
    link.  Otherwise images are copied into a local ``json/img_<book_id>/``
    folder next to this package and ``src`` is pointed at the copy.

    Parameters
    ----------
    body_tag : bs4.Tag
        Parsed HTML body; its <img> tags are modified in place.
    access : object or None
        Uploader exposing ``send_image(path, doc_id=...)``, or None for
        local processing.
    html_path : pathlib.Path
        Path of the source HTML file; image paths resolve relative to its
        parent directory.
    book_id : str
        Book identifier used for the upload / local folder name.

    Returns
    -------
    list[bs4.Tag]
        The (possibly rewritten) <img> tags; empty if the document has none.
    """
    img_tags = body_tag.find_all('img')
    if not img_tags:
        return img_tags

    if access is None:
        package_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        local_dir = pathlib.Path(package_dir, f'json/img_{book_id}')
        # parents=True: the json/ parent may not exist yet — the original
        # mkdir(exist_ok=True) failed in that case.
        local_dir.mkdir(parents=True, exist_ok=True)

    for img in img_tags:
        img_name = img.attrs.get('src')
        # quick fix for bad relative links produced by the converter
        if img_name.startswith('../'):
            img_name = img_name[3:]

        img_path = pathlib.Path(f'{html_path.parent}', img_name)

        if access is not None:
            img.attrs['src'] = access.send_image(img_path, doc_id=book_id)
        else:
            # Dead `img_size = os.path.getsize(img_path)` removed: its value
            # was only consumed by logging that no longer exists here, and
            # copyfile raises the same FileNotFoundError for missing files.
            target = local_dir / img_name
            # Guard nested image names like "media/x.png".
            target.parent.mkdir(parents=True, exist_ok=True)
            copyfile(img_path, target)
            img.attrs['src'] = str(target)

    return img_tags
|
||||||
Reference in New Issue
Block a user