forked from LiveCarta/BookConverter
epub converter: add table processing, class removal
This commit is contained in:
@@ -51,8 +51,44 @@ def preprocess_figure():
|
||||
pass
|
||||
|
||||
|
||||
def preprocess_table():
|
||||
pass
|
||||
def preprocess_table(body_tag: BeautifulSoup):
    """Normalise every ``<table>`` found under *body_tag*, in place.

    For each table:

    * every ``<td>`` has all of its attributes dropped except ``width``.
      An explicit ``width`` attribute wins; otherwise a ``width`` declared
      in the cell's inline ``style`` is carried over as ``"<number>px"``;
    * border widths found in the cells' inline styles are averaged and the
      result is written to the table's ``border`` attribute (two decimals).

    Args:
        body_tag: parsed markup to search; it is modified in place.

    Returns:
        None.
    """
    # Border declarations are probed in this order; the first pattern that
    # matches the style string wins, regardless of where it appears.
    border_patterns = [
        re.compile(r"border: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-top-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-left-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-right-width: ?(\d+\.?\d*)(p[tx])"),
        re.compile(r"border-bottom-width: ?(\d+\.?\d*)(p[tx])"),
    ]
    # The lookbehind rejects "...-width" properties (border-top-width,
    # min-width, ...) while still matching a style that *starts* with
    # "width:", which a consuming "[^-]" prefix cannot do.
    width_pattern = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")

    for table in body_tag.find_all("table"):
        border_sizes = []

        for td in table.find_all("td"):
            style = td.get('style')
            width = ''

            if style:
                # NOTE(review): the unit (pt/px) is matched but not used; the
                # numeric part is taken as-is for both border and width.
                border_match = next(
                    (found
                     for found in (pattern.search(style) for pattern in border_patterns)
                     if found),
                    None,
                )
                if border_match:
                    border_sizes.append(float(border_match.group(1)))

                width_match = width_pattern.search(style)
                if width_match:
                    width = width_match.group(1) + 'px'

            # An explicit width attribute takes precedence over the style value.
            width = td.get('width') or width

            # NOTE(review): this wipes every other cell attribute as well
            # (including colspan/rowspan, if present) - confirm that is intended.
            td.attrs = {}
            if width:
                td.attrs['width'] = width

        if border_sizes:
            border_size = sum(border_sizes) / len(border_sizes)
            # ".2f" keeps fixed-point output; a bare ".2" means two significant
            # digits and switches to exponent notation for values >= 10.
            table.attrs['border'] = f'{border_size:.2f}'
|
||||
|
||||
|
||||
def _process_lists(body_tag):
|
||||
@@ -275,6 +311,11 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
|
||||
clean_headings_content(content_tag, title_str)
|
||||
_process_lists(content_tag)
|
||||
_preprocessing_headings(content_tag)
|
||||
preprocess_table(content_tag)
|
||||
# 2. class removal
|
||||
for tag in content_tag.find_all(recursive=True):
|
||||
if hasattr(tag, 'attrs') and tag.attrs.get('class'):
|
||||
del tag.attrs['class']
|
||||
|
||||
# content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
|
||||
title_str = clean_title_from_numbering(title_str)
|
||||
|
||||
Reference in New Issue
Block a user