From d21b11f99a1fe3cc20dfb04238ec05392229ece4 Mon Sep 17 00:00:00 2001 From: shirshasa Date: Thu, 29 Apr 2021 16:23:46 +0300 Subject: [PATCH] epub converter: add table processing, class removal --- src/css_reader.py | 14 +++++++++-- src/html_epub_preprocessor.py | 45 +++++++++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/css_reader.py b/src/css_reader.py index 8bca842..8bdc884 100644 --- a/src/css_reader.py +++ b/src/css_reader.py @@ -76,6 +76,12 @@ LIVECARTA_STYLE_ATTRS = { 'vertical-align': ['super'], # 'color': [], 'background-color': [], + 'width': [], + 'border-top-width': [], + 'border-right-width': [], + 'border-left-width': [], + 'border-bottom-width': [], + 'border': [] } """ @@ -85,7 +91,7 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou to suit livecarta style convention. """ LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': lambda x: LawCartaConfig.INDENT, + 'text-indent': lambda x: LawCartaConfig.INDENT if x != '0' else '', 'font-variant': lambda x: x, 'text-align': lambda x: x, 'font': lambda x: '', @@ -93,6 +99,10 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { 'font-size': convert_font_size, 'color': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''), 'background-color': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''), + 'border-top-width': lambda x: x if x != '0' else '', + 'border-right-width': lambda x: x if x != '0' else '', + 'border-left-width': lambda x: x if x != '0' else '', + 'border-bottom-width': lambda x: x if x != '0' else '', } """ @@ -162,7 +172,7 @@ def clean_css(css): def add_inline_style_to_html_soup(soup1, css_text): livecarta_tmp_ids = [] h_regex = f'(^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$)' - for i, x in enumerate(soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)' + h_regex))): + for i, x in enumerate(soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|' + h_regex))): 
x.attrs['livecarta_id'] = i livecarta_tmp_ids.append(i)
diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py
index 6e6f638..1687835 100644
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -51,8 +51,58 @@ def preprocess_figure():
     pass


-def preprocess_table():
-    pass
+def preprocess_table(body_tag: BeautifulSoup):
+    """Reduce table markup to ``width`` on cells and ``border`` on tables.
+
+    For every ``<td>`` of each ``<table>`` in *body_tag* the inline ``style``
+    is scanned for a border size and a width given in ``px`` or ``pt``. All
+    attributes of the cell are then dropped and only ``width`` is restored;
+    an explicit ``width`` attribute wins over the value found in ``style``.
+    The table gets ``border`` set to the mean border size of its cells, in
+    pixels with two decimals. The tree is modified in place.
+    """
+    # Checked in this order; the first property found in a style wins.
+    border_props = ('border', 'border-top-width', 'border-left-width',
+                    'border-right-width', 'border-bottom-width')
+    size_re = r": ?(\d+\.?\d*)(p[tx])"
+    # CSS absolute units: 1pt = 1/72in and 1px = 1/96in.
+    to_px = {'px': 1.0, 'pt': 96 / 72}
+
+    for table in body_tag.find_all("table"):
+        border_sizes = []
+        for td in table.find_all("td"):
+            style = td.get('style')
+            width = ''
+            if style:
+                border_match = next(
+                    filter(None, (re.search(prop + size_re, style)
+                                  for prop in border_props)),
+                    None)
+                if border_match:
+                    size, units = border_match.groups()
+                    border_sizes.append(float(size) * to_px[units])
+
+                # The lookbehind rejects ``border-top-width``/``max-width``
+                # yet still matches ``width`` at the very start of style.
+                width_match = re.search(r"(?<![-\w])width" + size_re, style)
+                if width_match:
+                    size, units = width_match.groups()
+                    if units == 'pt':
+                        size = f'{float(size) * to_px[units]:.2f}'
+                    width = size + 'px'
+
+            width = td.get('width') or width
+
+            # NOTE(review): this also drops colspan/rowspan - confirm intended.
+            td.attrs = {}
+            if width:
+                td.attrs['width'] = width
+
+        if border_sizes:
+            border_size = sum(border_sizes) / len(border_sizes)
+            # ``.2f``, not ``.2``: the latter means two significant digits and
+            # renders sizes >= 10 in scientific notation (``1.2e+01``).
+            table.attrs['border'] = f'{border_size:.2f}'


 def _process_lists(body_tag):
@@ -275,6 +325,11 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
     clean_headings_content(content_tag, title_str)
     _process_lists(content_tag)
     _preprocessing_headings(content_tag)
+    preprocess_table(content_tag)
+    # 2. class removal
+    for tag in content_tag.find_all(recursive=True):
+        if hasattr(tag, 'attrs') and tag.attrs.get('class'):
+            del tag.attrs['class']
     # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
     title_str = clean_title_from_numbering(title_str)