Make structure of docx c as epub s

This commit is contained in:
Kiryl
2022-06-24 17:10:43 +03:00
parent 66e03c98e3
commit 9f067bb93d
3 changed files with 134 additions and 126 deletions

View File

@@ -0,0 +1,73 @@
import re
from bs4 import BeautifulSoup, NavigableString, Tag
def _clean_footnote_content(content):
    """
    Return footnote markup without leading and trailing whitespace.
    Parameters
    ----------
    content: str
        serialized HTML of one footnote
    Returns
    -------
    str
        ``content`` with surrounding whitespace removed
    """
    # Plain module-level function: a @staticmethod wrapper outside of a class
    # makes the name non-callable on Python versions before 3.10.
    return content.strip()
def process_footnotes(body_tag):
    """
    Collect footnotes from ``body_tag`` and delete their markup from it.

    Every ``<a class="sdfootnoteanc">`` anchor is paired, in document order,
    with the ``<div id="sdfootnoteN">`` that holds its text. The anchor is
    replaced by ``<sup class="footnote-element">*</sup>`` and the div is
    removed from the tree once its inner HTML has been collected.
    Parameters
    ----------
    body_tag: Tag
        tree to search, it is modified in place
    Returns
    -------
    list of str
        HTML content of the processed footnotes, in document order
    Raises
    ------
    AssertionError
        if the number of anchors differs from the number of content blocks,
        or an anchor name does not match the back-reference of its block
    """
    error_message = 'Something went wrong with footnotes after libre conversion'
    footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc')
    footnote_content = body_tag.find_all(
        'div', id=re.compile(r'^sdfootnote\d+$'))
    # Raised explicitly (instead of `assert`) so the check survives `python -O`.
    if len(footnote_anchors) != len(footnote_content):
        raise AssertionError(error_message)

    footnotes = []
    if not footnote_anchors:
        return footnotes

    # Loop invariants: one tag factory and the compiled patterns are enough.
    tag_factory = BeautifulSoup(features='lxml')
    back_link_class = re.compile(r'^sdfootnote.+$')
    font_size_style = re.compile('font-size')

    for number, (anc_tag, cont_tag) in enumerate(
            zip(footnote_anchors, footnote_content), start=1):
        true_a_tag = cont_tag.find_all('a', class_=back_link_class)[0]
        if true_a_tag.attrs.get('href') is None:
            # No back-reference: only the first link of the block is dropped,
            # the anchor/content pair itself is left in the tree.
            cont_tag.a.decompose()
            continue
        # href is "#<anchor name>", hence the [1:].
        if anc_tag['name'] != true_a_tag['href'][1:]:
            raise AssertionError(error_message)

        new_tag = tag_factory.new_tag('sup')
        new_tag['class'] = 'footnote-element'
        new_tag['data-id'] = number
        new_tag['id'] = f'footnote-{number}'
        new_tag.string = '*'
        anc_tag.replace_with(new_tag)

        # extra digits in footnotes from documents downloaded from livecarta
        a_text = true_a_tag.text
        first_paragraph = cont_tag.find('p')
        if first_paragraph is not None:
            sup = first_paragraph.find('sup')
            if sup and sup.text == a_text:
                sup.decompose()

        for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
            tag_a.decompose()

        # remove font-size
        for span in cont_tag.find_all('span', {'style': font_size_style}):
            style = re.sub(r"font-size: \d+px", "", span.get('style'))
            if style == '':
                del span.attrs['style']
            else:
                span.attrs['style'] = style

        parts = []
        for child in cont_tag.children:
            # isinstance (not an exact type check) also skips string
            # subclasses such as Comment, which are not tags and have no
            # decode_contents().
            if isinstance(child, NavigableString):
                continue
            if child.name == 'blockquote':
                parts.append(str(child))
            else:
                parts.append(child.decode_contents())

        content = _clean_footnote_content(''.join(parts))
        cont_tag.decompose()
        footnotes.append(content)
    return footnotes

View File

@@ -1,14 +1,13 @@
import os
import re import re
import logging import logging
import pathlib
from typing import List from typing import List
from shutil import copyfile
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger, BookStatusWrapper from src.util.helpers import BookLogger, BookStatusWrapper
from src.docx_converter.footnotes_processing import process_footnotes
from src.docx_converter.image_processing import process_images
class HTMLDocxPreprocessor: class HTMLDocxPreprocessor:
@@ -22,6 +21,7 @@ class HTMLDocxPreprocessor:
self.content = list() self.content = list()
def _clean_tag(self, tag: str, attr_name: str, attr_value: re): def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
# todo regex
""" """
Function to clean tags by its name and attribute value. Function to clean tags by its name and attribute value.
Parameters Parameters
@@ -44,6 +44,7 @@ class HTMLDocxPreprocessor:
tag.unwrap() tag.unwrap()
def _clean_underline_links(self): def _clean_underline_links(self):
# todo regex
"""Function cleans meaningless <u> tags before links.""" """Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u") underlines = self.body_tag.find_all("u")
for u in underlines: for u in underlines:
@@ -99,12 +100,10 @@ class HTMLDocxPreprocessor:
""" """
fonts = self.body_tag.find_all("font") fonts = self.body_tag.find_all("font")
for font in fonts: for font in fonts:
face = font.get("face") face, style, color =\
style = font.get("style") font.get("face"), font.get("style"), font.get("color")
color = font.get("color")
font.attrs = {} font.attrs, font.name = {}, "span"
font.name = "span"
if style: if style:
style = self.convert_font_pt_to_px(style) style = self.convert_font_pt_to_px(style)
if style != "": if style != "":
@@ -127,14 +126,8 @@ class HTMLDocxPreprocessor:
# on this step there should be no more <font> tags # on this step there should be no more <font> tags
assert len(self.body_tag.find_all("font")) == 0 assert len(self.body_tag.find_all("font")) == 0
def delete_content_before_toc(self):
    """
    Drop everything up to and including the first <TOC> tag from self.content.

    Only ``self.content`` is trimmed; the body tag is not updated. When no
    element equal to a fresh ``<TOC>`` tag is present, nothing changes.
    """
    toc_tag = self.html_soup.new_tag('TOC')
    try:
        # Single scan: index() both finds the tag and tells us where it is.
        ind = self.content.index(toc_tag) + 1
    except ValueError:
        return
    self.content = self.content[ind:]
def clean_trash(self): def clean_trash(self):
# todo make it regex dict
"""Function to remove all styles and tags we don't need.""" """Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile( self._clean_tag('span', 'style', re.compile(
r'^background: #[\da-fA-F]{6}$')) r'^background: #[\da-fA-F]{6}$'))
@@ -308,115 +301,8 @@ class HTMLDocxPreprocessor:
tag.string = tag.text.replace('\u200b', '') # zero-width-space tag.string = tag.text.replace('\u200b', '') # zero-width-space
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
@staticmethod
def _clean_footnote_content(content):
content = content.strip()
return content.strip()
def _process_footnotes(self):
    """
    Collect footnotes from self.body_tag and delete their markup from it.

    Every ``<a class="sdfootnoteanc">`` anchor is paired, in document order,
    with the ``<div id="sdfootnoteN">`` that holds its text. The anchor is
    replaced by ``<sup class="footnote-element">*</sup>`` and the div is
    removed from the tree once its inner HTML has been collected. The list
    of collected HTML strings is stored in ``self.footnotes``.
    Raises
    ------
    AssertionError
        if the number of anchors differs from the number of content blocks,
        or an anchor name does not match the back-reference of its block
    """
    error_message = 'Something went wrong with footnotes after libre conversion'
    footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
    footnote_content = self.body_tag.find_all(
        'div', id=re.compile(r'^sdfootnote\d+$'))
    # Raised explicitly (instead of `assert`) so the check survives `python -O`.
    if len(footnote_anchors) != len(footnote_content):
        raise AssertionError(error_message)

    footnotes = []
    if footnote_anchors:
        # Loop invariants: one tag factory and the compiled patterns are enough.
        tag_factory = BeautifulSoup(features='lxml')
        back_link_class = re.compile(r'^sdfootnote.+$')
        font_size_style = re.compile('font-size')

    for number, (anc_tag, cont_tag) in enumerate(
            zip(footnote_anchors, footnote_content), start=1):
        true_a_tag = cont_tag.find_all('a', class_=back_link_class)[0]
        if true_a_tag.attrs.get('href') is None:
            # No back-reference: only the first link of the block is dropped,
            # the anchor/content pair itself is left in the tree.
            cont_tag.a.decompose()
            continue
        # href is "#<anchor name>", hence the [1:].
        if anc_tag['name'] != true_a_tag['href'][1:]:
            raise AssertionError(error_message)

        new_tag = tag_factory.new_tag('sup')
        new_tag['class'] = 'footnote-element'
        new_tag['data-id'] = number
        new_tag['id'] = f'footnote-{number}'
        new_tag.string = '*'
        anc_tag.replace_with(new_tag)

        # extra digits in footnotes from documents downloaded from livecarta
        a_text = true_a_tag.text
        first_paragraph = cont_tag.find('p')
        if first_paragraph is not None:
            sup = first_paragraph.find('sup')
            if sup and sup.text == a_text:
                sup.decompose()

        for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}):
            tag_a.decompose()

        # remove font-size
        for span in cont_tag.find_all('span', {'style': font_size_style}):
            style = re.sub(r"font-size: \d+px", "", span.get('style'))
            if style == '':
                del span.attrs['style']
            else:
                span.attrs['style'] = style

        parts = []
        for child in cont_tag.children:
            # isinstance (not an exact type check) also skips string
            # subclasses such as Comment, which are not tags and have no
            # decode_contents().
            if isinstance(child, NavigableString):
                continue
            if child.name == 'blockquote':
                parts.append(str(child))
            else:
                parts.append(child.decode_contents())

        content = self._clean_footnote_content(''.join(parts))
        cont_tag.decompose()
        footnotes.append(content)

    self.footnotes = footnotes
def _process_images(self, access, html_path, book_id):
    """
    Function to process <img> tags of self.body_tag.

    With ``access`` given, every image is passed to ``access.send_image``
    and its ``src`` is set to the returned link. Without it, images are
    copied into ``json/img_<book_id>/`` located in the parent of this
    module's directory and ``src`` is set to the path of the copy.
    The found tags are stored in ``self.images``.
    Parameters
    ----------
    access:
        object providing ``send_image(path, doc_id=...)``, or None to copy
        the images locally
    html_path: pathlib.Path or str
        path of the html file; image links are resolved against its folder
    book_id:
        identifier used for the upload and for the local folder name
    """
    img_tags = self.body_tag.find_all('img')
    new_path = None
    if img_tags and access is None:
        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        new_path = pathlib.Path(os.path.join(
            folder_path, f'json/img_{book_id}/'))
        # parents=True: the enclosing json/ folder may not exist yet.
        new_path.mkdir(parents=True, exist_ok=True)

    # Accept both str and Path for html_path.
    source_dir = pathlib.Path(html_path).parent

    for img in img_tags:
        img_name = img.attrs.get('src')
        if not img_name:
            # An <img> without src has nothing to upload or copy.
            continue
        # quick fix for bad links
        if img_name.startswith('../'):
            img_name = img_name[3:]
        img_path = source_dir / img_name

        if access is not None:
            link = access.send_image(img_path, doc_id=book_id)
            img.attrs['src'] = link
            self.logger_object.log(
                f'{img_name} successfully uploaded.')
        else:
            img_size = os.path.getsize(img_path)
            self.logger_object.log(
                f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
            new_img_path = new_path / img_name
            copyfile(img_path, new_img_path)
            img.attrs["src"] = str(new_img_path)

    self.images = img_tags
def _process_footer(self): def _process_footer(self):
# todo regex
""" """
Function to process <div title="footer"> tags. Function to process <div title="footer"> tags.
All the tags will be deleted from file. All the tags will be deleted from file.
@@ -426,6 +312,7 @@ class HTMLDocxPreprocessor:
div.decompose() div.decompose()
def _process_div(self): def _process_div(self):
# todo regex
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.""" """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div") divs = self.body_tag.find_all("div")
@@ -505,6 +392,7 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child(children[0], func) self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self): def _preprocessing_headings(self):
# todo regex
"""Function to convert all lower level headings to p tags""" """Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern)) header_tags = self.body_tag.find_all(re.compile(pattern))
@@ -584,6 +472,7 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self): def _process_headings(self):
# todo regex
""" """
Function to process tags <h>. Function to process tags <h>.
Steps Steps
@@ -660,6 +549,7 @@ class HTMLDocxPreprocessor:
content[0], self.clean_tag_from_numbering) content[0], self.clean_tag_from_numbering)
def _process_lists(self): def _process_lists(self):
# todo regex
""" """
Function Function
- process tags <li>. - process tags <li>.
@@ -678,6 +568,13 @@ class HTMLDocxPreprocessor:
li_tag.attrs.update(li_tag.p.attrs) li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap() li_tag.p.unwrap()
def delete_content_before_toc(self):
    """
    Drop everything up to and including the first <TOC> tag from self.content.

    Only ``self.content`` is trimmed; the body tag is not updated. When no
    element equal to a fresh ``<TOC>`` tag is present, nothing changes.
    """
    toc_tag = self.html_soup.new_tag('TOC')
    try:
        # Single scan: index() both finds the tag and tells us where it is.
        ind = self.content.index(toc_tag) + 1
    except ValueError:
        return
    self.content = self.content[ind:]
def process_html(self, access=None, html_path='', book_id='local'): def process_html(self, access=None, html_path='', book_id='local'):
"""Process html code to satisfy LiveCarta formatting.""" """Process html code to satisfy LiveCarta formatting."""
self.logger_object.log('Beginning of processing .html file.') self.logger_object.log('Beginning of processing .html file.')
@@ -705,13 +602,12 @@ class HTMLDocxPreprocessor:
self._process_hrefs() self._process_hrefs()
self.logger_object.log('Footnotes processing.') self.logger_object.log('Footnotes processing.')
self._process_footnotes() self.footnotes = process_footnotes(self.body_tag)
self.logger_object.log( self.logger_object.log(
f'{len(self.footnotes)} footnotes have been processed.') f'{len(self.footnotes)} footnotes have been processed.')
self.logger_object.log('Image processing.') self.logger_object.log('Image processing.')
self._process_images( self.images = process_images(self.body_tag, access=access, html_path=html_path, book_id=book_id)
access=access, html_path=html_path, book_id=book_id)
self.logger_object.log( self.logger_object.log(
f'{len(self.images)} images have been processed.') f'{len(self.images)} images have been processed.')

View File

@@ -0,0 +1,39 @@
import os
import logging
import pathlib
from shutil import copyfile
def process_images(body_tag, access, html_path, book_id):
    """
    Function to process <img> tags of ``body_tag``.

    With ``access`` given, every image is passed to ``access.send_image``
    and its ``src`` is set to the returned link. Without it, images are
    copied into ``json/img_<book_id>/`` located in the parent of this
    module's directory and ``src`` is set to the path of the copy.
    Parameters
    ----------
    body_tag: Tag
        tree to search, the found <img> tags are modified in place
    access:
        object providing ``send_image(path, doc_id=...)``, or None to copy
        the images locally
    html_path: pathlib.Path or str
        path of the html file; image links are resolved against its folder
    book_id:
        identifier used for the upload and for the local folder name
    Returns
    -------
    list
        the <img> tags found in ``body_tag``
    """
    img_tags = body_tag.find_all('img')
    if not img_tags:
        return img_tags

    new_path = None
    if access is None:
        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        new_path = pathlib.Path(os.path.join(
            folder_path, f'json/img_{book_id}/'))
        # parents=True: the enclosing json/ folder may not exist yet.
        new_path.mkdir(parents=True, exist_ok=True)

    # Accept both str and Path for html_path.
    source_dir = pathlib.Path(html_path).parent

    for img in img_tags:
        img_name = img.attrs.get('src')
        if not img_name:
            # An <img> without src has nothing to upload or copy.
            continue
        # quick fix for bad links
        if img_name.startswith('../'):
            img_name = img_name[3:]
        img_path = source_dir / img_name

        if access is not None:
            img.attrs['src'] = access.send_image(img_path, doc_id=book_id)
        else:
            new_img_path = new_path / img_name
            copyfile(img_path, new_img_path)
            img.attrs["src"] = str(new_img_path)
    return img_tags