From f690412f5ccd4d663b2550e12931fcd545e7e3d3 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 24 Jun 2022 17:12:21 +0300 Subject: [PATCH] Create preset for wrapping tags with tables --- src/epub_converter/html_epub_preprocessor.py | 161 ++++++++----------- src/livecarta_config.py | 7 +- 2 files changed, 71 insertions(+), 97 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 9f776c3..fbc45a2 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -103,6 +103,61 @@ def _wrap_strings_with_p(chapter_tag): node.replace_with(p_tag) +def _wrap_tags_with_table(chapter_tag): + """Function wraps with """ + def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_be_wrapped.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + + def process_tag_using_table(tag_to_wrap): + _wrap_tag_with_table( + chapter_tag, + tag_to_be_wrapped=tag_to_wrap, + width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", + border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, + bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) + _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) + tag_to_wrap.unwrap() + + for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items(): + if isinstance(attrs, tuple): + attr, val = attrs[0], attrs[1] + for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}): + process_tag_using_table(tag_to_wrap) + else: + for tag_to_wrap in chapter_tag.find_all(tags_to_wrap): + if any(attr_name in attrs for attr_name in tag_to_wrap.attrs): + process_tag_using_table(tag_to_wrap) + + +def _tags_to_correspond_livecarta_tag(chapter_tag): + """Function to replace all tags to correspond livecarta tags""" + for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): + for key in reg_key: + tags = chapter_tag.find_all(re.compile(key)) + for tag in tags: + # todo can cause appearance of \n

...

->

\n

...

\n

(section) + tag.name = to_replace_value + + +def _unwrap_tags(chapter_tag): + """Function unwrap tags and move id to span""" + for tag in LiveCartaConfig.TAGS_TO_UNWRAP: + for s in chapter_tag.find_all(tag): + _add_span_to_save_ids_for_links(s, chapter_tag) + s.unwrap() + + def _remove_headings_content(content_tag, title_of_chapter: str): """ Function @@ -138,23 +193,6 @@ def _remove_headings_content(content_tag, title_of_chapter: str): break -def _tags_to_correspond_livecarta_tag(chapter_tag): - """Function to replace all tags to correspond livecarta tags""" - for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_key: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - # todo can cause appearance of \n

...

->

\n

...

\n

(section) - tag.name = to_replace_value - -def _unwrap_tags(chapter_tag): - """Function unwrap tags and move id to span""" - for tag in LiveCartaConfig. TAGS_TO_UNWRAP: - for s in chapter_tag.find_all(tag): - _add_span_to_save_ids_for_links(s, chapter_tag) - s.unwrap() - - # todo remove def _process_lists(chapter_tag: BeautifulSoup): """ @@ -181,13 +219,11 @@ def _preprocess_table(chapter_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" tables = chapter_tag.find_all("table") for table in tables: - t_tags = table.find_all(re.compile("td|th|tr")) - for t_tag in t_tags: - style = t_tag.get("style") + for t_tag in table.find_all(re.compile("td|th|tr")): width = "" - if style: + if t_tag.get("style"): width_match = re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) + r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) if width_match: size = width_match.group(1) width = size + "px" @@ -197,9 +233,8 @@ def _preprocess_table(chapter_tag: BeautifulSoup): if t_tag.attrs.get("style"): t_tag.attrs["style"] = t_tag.attrs["style"].replace( "border:0;", "") - - elif t_tag.attrs.get("style") == "": - del t_tag.attrs["style"] + if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": + del t_tag.attrs["style"] if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: table.attrs["border"] = "1" @@ -254,72 +289,6 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup): pre.append(code) -def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): - """Function wraps with
""" - table = chapter_tag.new_tag("table") - table.attrs["border"], table.attrs["align"], table.attrs["style"] \ - = border, "center", f"width:{width}%;" - tbody, tr, td = \ - chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") - td.attrs["bgcolor"] = bg_color - tag_to_be_wrapped.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - return table - - -def _preprocess_div_tags(chapter_tag): - """ - Function replace
with
: - """ - for div in chapter_tag.find_all("div"): - if any(attr in ["width", "border", "bgcolor"] for attr in div.attrs): - _wrap_tag_with_table( - chapter_tag, - tag_to_be_wrapped=div, - width=div.attrs["width"] if div.attrs.get("width") else "100", - border=div.attrs["border"] if div.attrs.get("border") else None, - bg_color=div.attrs["bgcolor"] if div.attrs.get("bgcolor") else None) - else: - div.name = "p" - continue - _add_span_to_save_ids_for_links(div, chapter_tag) - div.unwrap() - - -def _clean_wiley_block(block): - hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) - for hr in hrs: - hr.extract() - h = block.find(re.compile("h[1-9]")) - if h: - h.name = "p" - h.insert_before(BeautifulSoup(features="lxml").new_tag("br")) - - -def _preprocess_block_tags(chapter_tag: Tag): - """Function preprocessing tags""" - for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(block) - color = "#DDDDDD" if block.attrs.get( - "class") == "feature1" else None - color = "#EEEEEE" if block.attrs.get( - "class") == "feature2" else color - _wrap_tag_with_table(chapter_tag, block, bg_color=color) - block.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - block.unwrap() - - for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(future_block) - color = "#DDDDDD" if future_block.attrs.get( - "class") == "feature1" else None - color = "#EEEEEE" if future_block.attrs.get( - "class") == "feature2" else color - _wrap_tag_with_table(chapter_tag, future_block, bg_color=color) - - def _class_removing(chapter_tag): for tag in chapter_tag.find_all(recursive=True): if tag.attrs.get("class") \ @@ -356,6 +325,8 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro # 2. wrap NavigableString with tag

_wrap_strings_with_p(content_tag) + _wrap_tags_with_table(content_tag) + _tags_to_correspond_livecarta_tag(content_tag) _unwrap_tags(content_tag) @@ -365,12 +336,10 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro _remove_headings_content(content_tag, title_str) # 4. processing tags (

  • ,
  • , ,
    , 
    , ) - _process_lists(content_tag) + _process_lists(content_tag) # todo regex _preprocess_table(content_tag) - _preprocess_code_tags(content_tag) - _preprocess_pre_tags(content_tag) - _preprocess_div_tags(content_tag) - _preprocess_block_tags(content_tag) + _preprocess_code_tags(content_tag) # todo regex + _preprocess_pre_tags(content_tag) # todo regex # 5. remove classes that weren't created by converter _class_removing(content_tag) diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 9fc8e2e..a81ffca 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -115,8 +115,13 @@ class LiveCartaConfig: r"(^h[1-9]$)": ["list-style-type"] } + WRAP_TAGS_WITH_TABLE = { + ("div",) :["width", "border", "bgcolor"], + ("section", "blockquote",) : ("class", r"feature[1234]"), + } + REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = { - (r"^h[6-9]$", "figure$", "section$"): "p", + (r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p", ("^aside$",): "blockquote", ("^header$", "^footer$"): "span", ("^b$",): "strong",