Merge pull request #308 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
This commit is contained in:
bivis
2022-10-20 17:08:28 +03:00
committed by GitHub
6 changed files with 240 additions and 102 deletions

View File

@@ -1,5 +1,5 @@
[
{
{
"preset_name": "wrapper",
"rules": [
{
@@ -34,7 +34,17 @@
{
"name": "title",
"value": "footer"
},
}
],
"text": null
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "id",
"value": "^Table of Contents\\d+"
@@ -104,15 +114,44 @@
"condition": {
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "style",
"value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)"
},
}
],
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "lang",
"value": "^ru-RU$"
},
}
],
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "face",
"value": "^Times New Roman[\\w, ]+$"
@@ -148,6 +187,15 @@
"tags": ["^u$"],
"condition": {
"parent_tags": ":is(a)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^u$"],
"condition": {
"parent_tags": null,
"child_tags": ":is(a)",
"attrs": null,
"text": null

View File

@@ -11,15 +11,42 @@
{
"name": "width",
"value": ".*"
},
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "border",
"value": ".*"
},
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "style",
"value": "border.*"
},
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "bgcolor",
"value": ".*"
@@ -69,7 +96,7 @@
{
"tags": ["^code$", "^kbd$", "^var$"],
"condition": {
"parent_tags": ":not(pre)",
"parent_tags": ":not(pre, span)",
"child_tags": null,
"attrs": null
},
@@ -99,6 +126,15 @@
}
}
]
},
{
"preset_name": "attrs_remover",
"rules": [
{
"tags": ["^sup$"],
"condition": null
}
]
},
{
"preset_name": "attr_replacer",
@@ -171,4 +207,4 @@
}
]
}
]
]

View File

@@ -1,5 +1,6 @@
import json
import codecs
import logging
from src.book_solver import BookSolver
from src.util.helpers import BookLogger
@@ -30,11 +31,19 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor)
# Parses and cleans html, gets list of tags, gets footnotes
try:
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
html_processor = HtmlEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor)
except Exception as exc:
self.logger_object.log(
"Error has occurred while processing .html", logging.ERROR)
self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error()
raise exc
json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object,
style_processor=style_preprocessor, html_processor=html_processor)

View File

@@ -107,12 +107,10 @@ class HtmlEpubProcessor:
len(text_preparing(tag)) != 0 and
re.findall(r"^h[1-5]$", tag.name or chapter_tag.name))
if title_in_text:
self.html_preprocessor._add_span_to_save_ids_for_links(
title_in_text[-1], chapter_tag)
self.html_preprocessor.add_span_to_save_ids_for_links(title_in_text[-1], chapter_tag)
title_in_text[-1].extract()
elif text_in_title:
[self.html_preprocessor._add_span_to_save_ids_for_links(
tag, chapter_tag) for tag in text_in_title]
[self.html_preprocessor.add_span_to_save_ids_for_links(tag, chapter_tag) for tag in text_in_title]
[tag.extract() for tag in text_in_title]
@staticmethod
@@ -135,12 +133,12 @@ class HtmlEpubProcessor:
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"]
def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
def prepare_content(self, title: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
"""
Function finalise processing/cleaning content
Parameters
----------
title_str: str
title: str
chapter_tag: BeautifulSoup, soup object
@@ -170,7 +168,7 @@ class HtmlEpubProcessor:
self._wrap_strings_with_p(chapter_tag)
# 3.
if remove_title_from_chapter:
self._remove_headings_content(chapter_tag, title_str)
self._remove_headings_content(chapter_tag, title)
# 4.
_process_presets(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)

View File

@@ -2,7 +2,7 @@ import re
import json
from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement
from typing import List, Dict, Union
from typing import List, Set, Dict, Union
from src.util.helpers import BookLogger
@@ -16,15 +16,60 @@ class HtmlPresetsProcessor:
"table_wrapper": self._process_tag_using_table,
"decomposer": self._decompose_tag,
"replacer": self._replace_tag,
"attrs_remover": self._remove_attrs,
"attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag,
"inserter": self._insert_tag,
"text_replacer": self._replace_text
}
self.conditions = {
"parent_tags": self._tags_with_parent_condition,
"child_tags": self._tags_with_child_condition,
"attrs": self._tags_with_attrs_condition,
"text": self._tags_with_text_condition
}
@staticmethod
def _tags_with_parent_condition(**kwargs):
    """
    Function finds tags located inside elements matched by a CSS selector
    Parameters
    ----------
    kwargs["body_tag"]: soup object that is searched with ``select``
    kwargs["tags"]: list of regex strings matched against tag names
    kwargs["family_condition"]: CSS selector describing the enclosing elements

    Returns
    -------
    (was_found, found_tags): bool flag and list of matched tags,
    each element listed once, in the order it was first met

    """
    name_patterns = [re.compile(tag) for tag in kwargs["tags"]]
    # Deduplicate by identity, not by value: bs4 compares/hashes tags by their
    # markup, so a set would merge distinct elements that merely look the same
    # and all but one of them would never be processed.
    # Nested selector matches can still reach one element twice, hence the dict.
    unique_tags = {}
    for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]):
        for tag in parent_tag.find_all(name_patterns):
            unique_tags.setdefault(id(tag), tag)
    found_tags = list(unique_tags.values())
    return bool(found_tags), found_tags
@staticmethod
def _tags_with_child_condition(**kwargs):
    """
    Function finds tags that contain at least one element matched by a CSS selector
    Parameters
    ----------
    kwargs["body_tag"]: soup object that is searched with ``find_all``
    kwargs["tags"]: list of regex strings matched against tag names
    kwargs["family_condition"]: CSS selector that must match inside the tag

    Returns
    -------
    (was_found, found_tags): bool flag and list of matched tags in search order

    """
    name_patterns = [re.compile(tag) for tag in kwargs["tags"]]
    # A plain list is used on purpose: bs4 compares/hashes tags by their markup,
    # so a set would merge distinct elements that merely look the same.
    # find_all already yields every element only once.
    found_tags = [
        tag
        for tag in kwargs["body_tag"].find_all(name_patterns)
        if tag.select(kwargs["family_condition"])
    ]
    return bool(found_tags), found_tags
@staticmethod
def _tags_with_attrs_condition(**kwargs):
    """
    Function finds tags whose attributes match the rule's attribute patterns
    Parameters
    ----------
    kwargs["body_tag"]: soup object that is searched with ``find_all``
    kwargs["tags"]: list of regex strings matched against tag names
    kwargs["rule"]: preset rule; rule["condition"]["attrs"] is a list of
        {"name": ..., "value": ...} items where value is a regex string

    Returns
    -------
    (was_found, found_tags): bool flag and list of matched tags in search order

    """
    name_patterns = [re.compile(tag) for tag in kwargs["tags"]]
    # attribute name -> compiled pattern; a repeated name keeps its last pattern
    attr_conditions = {
        attr["name"]: re.compile(attr["value"])
        for attr in kwargs["rule"]["condition"]["attrs"]
    }
    # A plain list is used on purpose: bs4 compares/hashes tags by their markup,
    # so a set would merge distinct elements that merely look the same.
    # find_all already yields every element only once.
    found_tags = list(kwargs["body_tag"].find_all(name_patterns, attr_conditions))
    return bool(found_tags), found_tags
@staticmethod
def _tags_with_text_condition(**kwargs):
    """
    Function finds tags whose name matches one of the rule tags
    and whose text matches the rule's text pattern
    Parameters
    ----------
    kwargs["body_tag"]: soup object that is searched with ``find_all``
    kwargs["tags"]: list of regex strings matched against tag names
    kwargs["rule"]: preset rule; rule["condition"]["text"] is a regex string

    Returns
    -------
    (was_found, found_tags): bool flag and list of matched tags in search order

    """
    # both patterns are the same for every visited tag, so compile them once
    name_pattern = re.compile(r"(?=(" + '|'.join(kwargs["tags"]) + r"))")
    text_pattern = re.compile(kwargs["rule"]["condition"]["text"])

    def _matches(candidate):
        # tag name fits one of the alternatives and the tag text contains the pattern
        return name_pattern.search(candidate.name) and text_pattern.search(candidate.text)

    # A plain list is used on purpose: bs4 compares/hashes tags by their markup,
    # so a set would merge distinct elements that merely look the same.
    # find_all already yields every element only once.
    found_tags = list(kwargs["body_tag"].find_all(_matches))
    return bool(found_tags), found_tags
@staticmethod
def _wrap_tag(**kwargs):
kwargs["tag"].wrap(kwargs["body_tag"].new_tag(
kwargs["found_tag"].wrap(kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_wrap"]["name"]))
@staticmethod
@@ -34,13 +79,13 @@ class HtmlPresetsProcessor:
parent_tag.attrs[key] = tag.attrs[key]
def _decompose_tag(self, **kwargs):
if kwargs["tag"].parent:
self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent)
kwargs["tag"].decompose()
if kwargs["found_tag"].parent:
self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
kwargs["found_tag"].decompose()
@staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup):
def add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup):
"""
Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract)
@@ -82,29 +127,33 @@ class HtmlPresetsProcessor:
kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag(
"tr"), kwargs["body_tag"].new_tag("td")
td.attrs["bgcolor"] = bg_color
kwargs["tag"].wrap(td)
kwargs["found_tag"].wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table
_wrap_tag_with_table(
width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
width=kwargs["found_tag"].attrs["width"] if kwargs["found_tag"].attrs.get(
"width") else "100",
border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
border=kwargs["found_tag"].attrs["border"] if kwargs["found_tag"].attrs.get(
"border") else None,
bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["body_tag"])
kwargs["tag"].unwrap()
bg_color=kwargs["found_tag"].attrs["bgcolor"] if kwargs["found_tag"].attrs.get("bgcolor") else None)
self.add_span_to_save_ids_for_links(kwargs["found_tag"], kwargs["body_tag"])
kwargs["found_tag"].unwrap()
@staticmethod
def _replace_tag(**kwargs):
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"]
kwargs["tag"].name = tag_to_replace
kwargs["found_tag"].name = tag_to_replace
if kwargs["rule"]["tag_to_replace"].get("attrs"):
dict_attributes = {attr["name"]: attr["value"]
for attr in kwargs["rule"]["tag_to_replace"]["attrs"]}
kwargs["tag"].attrs = dict_attributes
kwargs["found_tag"].attrs = dict_attributes
@staticmethod
def _remove_attrs(**kwargs):
    """
    Function drops every attribute of the matched tag
    Parameters
    ----------
    kwargs["found_tag"]: tag whose ``attrs`` mapping is replaced with an empty dict

    """
    target_tag = kwargs["found_tag"]
    target_tag.attrs = {}
@staticmethod
def _replace_attr(**kwargs):
@@ -114,21 +163,21 @@ class HtmlPresetsProcessor:
attr_to_replace, attr_value_to_replace =\
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
if attr_to_replace:
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \
if kwargs["tag"].get(attr_name)\
kwargs["found_tag"][attr_to_replace] = kwargs["found_tag"][attr_name] \
if kwargs["found_tag"].get(attr_name)\
else ""
if attr_value_to_replace:
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["tag"][attr_name]
kwargs["found_tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["found_tag"][attr_name]
elif attr_value_to_replace:
kwargs["tag"].attrs[attr_name] = attr_value_to_replace
kwargs["found_tag"].attrs[attr_name] = attr_value_to_replace
elif attr_name:
del kwargs["tag"][attr_name]
del kwargs["found_tag"][attr_name]
def _unwrap_tag(self, **kwargs):
if kwargs["tag"].parent:
self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent)
kwargs["tag"].unwrap()
if kwargs["found_tag"].parent:
self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
kwargs["found_tag"].unwrap()
@staticmethod
def _insert_tag(**kwargs):
@@ -138,29 +187,29 @@ class HtmlPresetsProcessor:
kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes)
# insert all items that was in tag to subtag and remove from tag
for content in reversed(kwargs["tag"].contents):
for content in reversed(kwargs["found_tag"].contents):
tag_to_insert.insert(0, content.extract())
# wrap subtag with items
kwargs["tag"].append(tag_to_insert)
kwargs["found_tag"].append(tag_to_insert)
@staticmethod
def _replace_text(**kwargs):
if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string):
if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["found_tag"].string):
new_text = re.sub(re.compile(
kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string)
kwargs["tag"].string.replace_with(new_text)
kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["found_tag"].string)
kwargs["found_tag"].string.replace_with(new_text)
@staticmethod
def _process_tags(body_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action):
def process_tags(self,
body_tag: BeautifulSoup,
preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action):
"""
Function does action with tags
Parameters
----------
body_tag: BeautifulSoup
Tag & contents of the body tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
list of conditions when fire function
action: function
action what to do with tag
@@ -170,39 +219,34 @@ class HtmlPresetsProcessor:
Body Tag with processed certain tags
"""
for rule in rules:
tags: List[str] = rule["tags"] if rule.get(
"tags") else rule["condition"]["tags"]
if rule["condition"]:
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == "parent_tags":
for parent_tag in body_tag.select(condition_on_tag[1]):
for tag in parent_tag.find_all([re.compile(tag) for tag in tags]):
# parent_tag != tag.parent
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
if tag.select(condition_on_tag[1]):
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "attrs":
for attr in rule["condition"]["attrs"]:
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "text":
# find all tags that are in List of tags and tags that contains required text
for tag in body_tag.find_all(
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))",
t.name) and re.search(re.compile(rule["condition"]["text"]),
t.text)):
action(body_tag=body_tag, tag=tag, rule=rule)
for preset_rule in preset_rules:
tags: List[str] = preset_rule["tags"] if preset_rule.get(
"tags") else preset_rule["condition"]["tags"]
found_tags: List[Tag] = []
if preset_rule["condition"]:
conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v)
for condition_on_tag in conditions_on_tag:
condition_func = self.conditions[condition_on_tag[0]]
was_found, f_tags = condition_func(body_tag=body_tag,
tags=tags,
rule=preset_rule,
family_condition=condition_on_tag[1])
found_tags = found_tags + f_tags if was_found else []
if not was_found:
break
# if there are several conditions on tags and found_tags isn't empty
if len(conditions_on_tag) > 1 and found_tags:
# tags satisfying all conditions(>1)
found_tags = [tag for tag in found_tags if found_tags.count(tag) > 1]
for found_tag in found_tags:
action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
else:
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
action(body_tag=body_tag, tag=tag, rule=rule)
for found_tag in body_tag.find_all([re.compile(tag) for tag in tags]):
action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup):
for rule in html_preprocessor.preset:
for preset in html_preprocessor.preset:
# html_preprocessor.logger.log(rule["preset_name"].title() + " process.")
action = html_preprocessor.name2action[rule["preset_name"]]
html_preprocessor._process_tags(html_soup, rule["rules"], action)
action = html_preprocessor.name2action[preset["preset_name"]]
html_preprocessor.process_tags(html_soup, preset["rules"], action)

View File

@@ -109,24 +109,27 @@ class StyleReader:
return constraints_on_value, value_not_in_possible_values_list
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
for i, style in reversed(list(enumerate(split_style))):
style_name, style_value = style.split(":")
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style)
continue
try:
for i, style in reversed(list(enumerate(split_style))):
style_name, style_value = style.split(":")
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
# property not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style)
continue
cleaned_value = self.clean_value(style_value, style_name)
if all(self.style_conditions(cleaned_value, style_name)):
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style)
continue
else:
if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data
func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
style_value = func(cleaned_value)
split_style[i] = style_name + ":" + style_value
cleaned_value = self.clean_value(style_value, style_name)
if all(self.style_conditions(cleaned_value, style_name)):
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style)
continue
else:
if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data
func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
style_value = func(cleaned_value)
split_style[i] = style_name + ":" + style_value
except ValueError as ve:
print(f"Style value isn't correct.")
return split_style
def build_inline_style_content(self, style: str) -> str: