epub converter: add table processing, class removal

2021-04-29 16:23:46 +03:00
parent 0a9124c046
commit d21b11f99a
2 changed files with 55 additions and 4 deletions
--- a/src/css_reader.py
+++ b/src/css_reader.py
@@ -76,6 +76,12 @@ LIVECARTA_STYLE_ATTRS = {
    'vertical-align': ['super'],  # <sup>
    'color': [],
    'background-color': [],
+    'width': [],
+    'border-top-width': [],
+    'border-right-width': [],
+    'border-left-width': [],
+    'border-bottom-width': [],
+    'border': []
 }

 """
@@ -85,7 +91,7 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou
 to suit livecarta style convention.
 """
 LIVECARTA_STYLE_ATTRS_MAPPING = {
-    'text-indent': lambda x: LawCartaConfig.INDENT,
+    'text-indent': lambda x: LawCartaConfig.INDENT if x != '0' else '',
    'font-variant': lambda x: x,
    'text-align': lambda x: x,
    'font': lambda x: '',
@@ -93,6 +99,10 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
    'font-size': convert_font_size,
    'color': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''),
    'background-color': lambda x: LawCartaConfig.HTML42LIVECARTA_COLORS.get(str2color_name(x), ''),
+    'border-top-width': lambda x: x if x != '0' else '',
+    'border-right-width': lambda x: x if x != '0' else '',
+    'border-left-width': lambda x: x if x != '0' else '',
+    'border-bottom-width': lambda x: x if x != '0' else '',
 }

 """
@@ -162,7 +172,7 @@ def clean_css(css):
 def add_inline_style_to_html_soup(soup1, css_text):
    livecarta_tmp_ids = []
    h_regex = f'(^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$)'
-    for i, x in enumerate(soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)' + h_regex))):
+    for i, x in enumerate(soup1.find_all(re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|' + h_regex))):
        x.attrs['livecarta_id'] = i
        livecarta_tmp_ids.append(i)

--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -51,8 +51,44 @@ def preprocess_figure():
    pass


-def preprocess_table():
-    pass
+def preprocess_table(body_tag: BeautifulSoup):
+    """Strip ``td`` attributes down to ``width`` and set each table's ``border``.
+
+    Widths and border sizes are read from inline ``style``; edits happen in place.
+    :param body_tag: parsed tag whose descendant ``table`` tags are processed.
+    """
+    size = r": ?(\d+\.?\d*)p[tx]"
+    # Checked in this order; the first property found supplies the cell's border.
+    border_props = ("border", "border-top-width", "border-left-width",
+                    "border-right-width", "border-bottom-width")
+
+    for table in body_tag.find_all("table"):
+        border_sizes = []
+        for td in table.find_all("td"):
+            style = td.get('style') or ''
+            width = ''
+
+            for prop in border_props:
+                border_match = re.search(prop + size, style)
+                if border_match:
+                    border_sizes.append(float(border_match.group(1)))
+                    break
+
+            # The lookbehind skips `border-*-width` yet still matches `width`
+            # at the very start of the style string (`[^-]width` could not).
+            width_match = re.search(r"(?<![-\w])width" + size, style)
+            if width_match:
+                # NOTE: pt values are emitted as px unconverted, as before.
+                width = width_match.group(1) + 'px'
+
+            # An explicit `width` attribute wins over the inline style.
+            width = td.get('width') or width
+            td.attrs = {'width': width} if width else {}
+
+        if border_sizes:
+            border_size = sum(border_sizes) / len(border_sizes)
+            # `.2f`, not `.2`: the latter renders 10.0 as '1e+01'.
+            table.attrs['border'] = f'{border_size:.2f}'


 def _process_lists(body_tag):
@@ -275,6 +311,11 @@ def prepare_title_and_content(title, content_tag: BeautifulSoup):
    clean_headings_content(content_tag, title_str)
    _process_lists(content_tag)
    _preprocessing_headings(content_tag)
+    preprocess_table(content_tag)
+    # 2. class removal
+    for tag in content_tag.find_all(recursive=True):
+        if hasattr(tag, 'attrs') and tag.attrs.get('class'):
+            del tag.attrs['class']

    # content_str = re.sub(r'([\n\t\xa0])', ' ', str(content_tag))
    title_str = clean_title_from_numbering(title_str)