Merge pull request #308 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
This commit is contained in:
bivis
2022-10-20 17:08:28 +03:00
committed by GitHub
6 changed files with 240 additions and 102 deletions

View File

@@ -1,5 +1,5 @@
[ [
{ {
"preset_name": "wrapper", "preset_name": "wrapper",
"rules": [ "rules": [
{ {
@@ -34,7 +34,17 @@
{ {
"name": "title", "name": "title",
"value": "footer" "value": "footer"
}, }
],
"text": null
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{ {
"name": "id", "name": "id",
"value": "^Table of Contents\\d+" "value": "^Table of Contents\\d+"
@@ -104,15 +114,44 @@
"condition": { "condition": {
"parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)", "parent_tags": ":is(h1, h2, h3, h4, h5, h6, h7, h8, h9)",
"child_tags": null, "child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [ "attrs": [
{ {
"name": "style", "name": "style",
"value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)" "value": "(^background: #[\\da-fA-F]{6}$)|(^letter-spacing: -?[\\d.]+pt$)"
}, }
],
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{ {
"name": "lang", "name": "lang",
"value": "^ru-RU$" "value": "^ru-RU$"
}, }
],
"text": null
}
},
{
"tags": ["^span$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{ {
"name": "face", "name": "face",
"value": "^Times New Roman[\\w, ]+$" "value": "^Times New Roman[\\w, ]+$"
@@ -148,6 +187,15 @@
"tags": ["^u$"], "tags": ["^u$"],
"condition": { "condition": {
"parent_tags": ":is(a)", "parent_tags": ":is(a)",
"child_tags": null,
"attrs": null,
"text": null
}
},
{
"tags": ["^u$"],
"condition": {
"parent_tags": null,
"child_tags": ":is(a)", "child_tags": ":is(a)",
"attrs": null, "attrs": null,
"text": null "text": null

View File

@@ -11,15 +11,42 @@
{ {
"name": "width", "name": "width",
"value": ".*" "value": ".*"
}, }
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{ {
"name": "border", "name": "border",
"value": ".*" "value": ".*"
}, }
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{ {
"name": "style", "name": "style",
"value": "border.*" "value": "border.*"
}, }
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{ {
"name": "bgcolor", "name": "bgcolor",
"value": ".*" "value": ".*"
@@ -69,7 +96,7 @@
{ {
"tags": ["^code$", "^kbd$", "^var$"], "tags": ["^code$", "^kbd$", "^var$"],
"condition": { "condition": {
"parent_tags": ":not(pre)", "parent_tags": ":not(pre, span)",
"child_tags": null, "child_tags": null,
"attrs": null "attrs": null
}, },
@@ -99,6 +126,15 @@
} }
} }
] ]
},
{
"preset_name": "attrs_remover",
"rules": [
{
"tags": ["^sup$"],
"condition": null
}
]
}, },
{ {
"preset_name": "attr_replacer", "preset_name": "attr_replacer",
@@ -171,4 +207,4 @@
} }
] ]
} }
] ]

View File

@@ -1,5 +1,6 @@
import json import json
import codecs import codecs
import logging
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
@@ -30,11 +31,19 @@ class EpubBook(BookSolver):
json for LiveCarta platform json for LiveCarta platform
""" """
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
style_preprocessor = StyleReader() style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=self.logger_object, # Parses and cleans html, gets list of tags, gets footnotes
html_preprocessor=html_preprocessor) try:
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
html_processor = HtmlEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor)
except Exception as exc:
self.logger_object.log(
"Error has occurred while processing .html", logging.ERROR)
self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error()
raise exc
json_converter = EpubConverter( json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object, self.book_path, access=self.access, logger=self.logger_object,
style_processor=style_preprocessor, html_processor=html_processor) style_processor=style_preprocessor, html_processor=html_processor)

View File

@@ -107,12 +107,10 @@ class HtmlEpubProcessor:
len(text_preparing(tag)) != 0 and len(text_preparing(tag)) != 0 and
re.findall(r"^h[1-5]$", tag.name or chapter_tag.name)) re.findall(r"^h[1-5]$", tag.name or chapter_tag.name))
if title_in_text: if title_in_text:
self.html_preprocessor._add_span_to_save_ids_for_links( self.html_preprocessor.add_span_to_save_ids_for_links(title_in_text[-1], chapter_tag)
title_in_text[-1], chapter_tag)
title_in_text[-1].extract() title_in_text[-1].extract()
elif text_in_title: elif text_in_title:
[self.html_preprocessor._add_span_to_save_ids_for_links( [self.html_preprocessor.add_span_to_save_ids_for_links(tag, chapter_tag) for tag in text_in_title]
tag, chapter_tag) for tag in text_in_title]
[tag.extract() for tag in text_in_title] [tag.extract() for tag in text_in_title]
@staticmethod @staticmethod
@@ -135,12 +133,12 @@ class HtmlEpubProcessor:
and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]):
del tag.attrs["class"] del tag.attrs["class"]
def prepare_content(self, title_str: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: def prepare_content(self, title: str, chapter_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag:
""" """
Function finalise processing/cleaning content Function finalise processing/cleaning content
Parameters Parameters
---------- ----------
title_str: str title: str
chapter_tag: BeautifulSoup, soup object chapter_tag: BeautifulSoup, soup object
@@ -170,7 +168,7 @@ class HtmlEpubProcessor:
self._wrap_strings_with_p(chapter_tag) self._wrap_strings_with_p(chapter_tag)
# 3. # 3.
if remove_title_from_chapter: if remove_title_from_chapter:
self._remove_headings_content(chapter_tag, title_str) self._remove_headings_content(chapter_tag, title)
# 4. # 4.
_process_presets( _process_presets(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag) html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)

View File

@@ -2,7 +2,7 @@ import re
import json import json
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement from bs4.element import PageElement
from typing import List, Dict, Union from typing import List, Set, Dict, Union
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
@@ -16,15 +16,60 @@ class HtmlPresetsProcessor:
"table_wrapper": self._process_tag_using_table, "table_wrapper": self._process_tag_using_table,
"decomposer": self._decompose_tag, "decomposer": self._decompose_tag,
"replacer": self._replace_tag, "replacer": self._replace_tag,
"attrs_remover": self._remove_attrs,
"attr_replacer": self._replace_attr, "attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag, "unwrapper": self._unwrap_tag,
"inserter": self._insert_tag, "inserter": self._insert_tag,
"text_replacer": self._replace_text "text_replacer": self._replace_text
} }
self.conditions = {
"parent_tags": self._tags_with_parent_condition,
"child_tags": self._tags_with_child_condition,
"attrs": self._tags_with_attrs_condition,
"text": self._tags_with_text_condition
}
@staticmethod
def _tags_with_parent_condition(**kwargs):
found_tags: Set[Tag] = set()
for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]):
for tag in parent_tag.find_all([re.compile(tag) for tag in kwargs["tags"]]):
found_tags.add(tag)
return len(found_tags) != 0, list(found_tags)
@staticmethod
def _tags_with_child_condition(**kwargs):
found_tags: Set[Tag] = set()
for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]]):
if tag.select(kwargs["family_condition"]):
found_tags.add(tag)
return len(found_tags) != 0, list(found_tags)
@staticmethod
def _tags_with_attrs_condition(**kwargs):
found_tags: Set[Tag] = set()
names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]]
values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]]
attr_conditions: Dict[str, str] = dict(zip(names, values))
for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]],
attr_conditions):
found_tags.add(tag)
return len(found_tags) != 0, list(found_tags)
@staticmethod
def _tags_with_text_condition(**kwargs):
# find all tags that are in List of tags and tags that contains required text
found_tags: Set[Tag] = set()
for tag in kwargs["body_tag"].find_all(
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))",
t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]),
t.text)):
found_tags.add(tag)
return len(found_tags) != 0, list(found_tags)
@staticmethod @staticmethod
def _wrap_tag(**kwargs): def _wrap_tag(**kwargs):
kwargs["tag"].wrap(kwargs["body_tag"].new_tag( kwargs["found_tag"].wrap(kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_wrap"]["name"])) kwargs["rule"]["tag_to_wrap"]["name"]))
@staticmethod @staticmethod
@@ -34,13 +79,13 @@ class HtmlPresetsProcessor:
parent_tag.attrs[key] = tag.attrs[key] parent_tag.attrs[key] = tag.attrs[key]
def _decompose_tag(self, **kwargs): def _decompose_tag(self, **kwargs):
if kwargs["tag"].parent: if kwargs["found_tag"].parent:
self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent) self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
kwargs["tag"].decompose() kwargs["found_tag"].decompose()
@staticmethod @staticmethod
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup], def add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
chapter_tag: BeautifulSoup): chapter_tag: BeautifulSoup):
""" """
Function adds span with id from tag_to_be_removed Function adds span with id from tag_to_be_removed
because this tag will be removed(unwrapped/extract) because this tag will be removed(unwrapped/extract)
@@ -82,29 +127,33 @@ class HtmlPresetsProcessor:
kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag( kwargs["body_tag"].new_tag("tbody"), kwargs["body_tag"].new_tag(
"tr"), kwargs["body_tag"].new_tag("td") "tr"), kwargs["body_tag"].new_tag("td")
td.attrs["bgcolor"] = bg_color td.attrs["bgcolor"] = bg_color
kwargs["tag"].wrap(td) kwargs["found_tag"].wrap(td)
td.wrap(tr) td.wrap(tr)
tr.wrap(tbody) tr.wrap(tbody)
tbody.wrap(table) tbody.wrap(table)
table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
return table return table
_wrap_tag_with_table( _wrap_tag_with_table(
width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get( width=kwargs["found_tag"].attrs["width"] if kwargs["found_tag"].attrs.get(
"width") else "100", "width") else "100",
border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get( border=kwargs["found_tag"].attrs["border"] if kwargs["found_tag"].attrs.get(
"border") else None, "border") else None,
bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None) bg_color=kwargs["found_tag"].attrs["bgcolor"] if kwargs["found_tag"].attrs.get("bgcolor") else None)
self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["body_tag"]) self.add_span_to_save_ids_for_links(kwargs["found_tag"], kwargs["body_tag"])
kwargs["tag"].unwrap() kwargs["found_tag"].unwrap()
@staticmethod @staticmethod
def _replace_tag(**kwargs): def _replace_tag(**kwargs):
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"] tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"]
kwargs["tag"].name = tag_to_replace kwargs["found_tag"].name = tag_to_replace
if kwargs["rule"]["tag_to_replace"].get("attrs"): if kwargs["rule"]["tag_to_replace"].get("attrs"):
dict_attributes = {attr["name"]: attr["value"] dict_attributes = {attr["name"]: attr["value"]
for attr in kwargs["rule"]["tag_to_replace"]["attrs"]} for attr in kwargs["rule"]["tag_to_replace"]["attrs"]}
kwargs["tag"].attrs = dict_attributes kwargs["found_tag"].attrs = dict_attributes
@staticmethod
def _remove_attrs(**kwargs):
kwargs["found_tag"].attrs = {}
@staticmethod @staticmethod
def _replace_attr(**kwargs): def _replace_attr(**kwargs):
@@ -114,21 +163,21 @@ class HtmlPresetsProcessor:
attr_to_replace, attr_value_to_replace =\ attr_to_replace, attr_value_to_replace =\
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"] kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
if attr_to_replace: if attr_to_replace:
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \ kwargs["found_tag"][attr_to_replace] = kwargs["found_tag"][attr_name] \
if kwargs["tag"].get(attr_name)\ if kwargs["found_tag"].get(attr_name)\
else "" else ""
if attr_value_to_replace: if attr_value_to_replace:
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace kwargs["found_tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["tag"][attr_name] del kwargs["found_tag"][attr_name]
elif attr_value_to_replace: elif attr_value_to_replace:
kwargs["tag"].attrs[attr_name] = attr_value_to_replace kwargs["found_tag"].attrs[attr_name] = attr_value_to_replace
elif attr_name: elif attr_name:
del kwargs["tag"][attr_name] del kwargs["found_tag"][attr_name]
def _unwrap_tag(self, **kwargs): def _unwrap_tag(self, **kwargs):
if kwargs["tag"].parent: if kwargs["found_tag"].parent:
self.set_attrs_to_parent(kwargs["tag"], kwargs["tag"].parent) self.set_attrs_to_parent(kwargs["found_tag"], kwargs["found_tag"].parent)
kwargs["tag"].unwrap() kwargs["found_tag"].unwrap()
@staticmethod @staticmethod
def _insert_tag(**kwargs): def _insert_tag(**kwargs):
@@ -138,29 +187,29 @@ class HtmlPresetsProcessor:
kwargs["body_tag"].new_tag( kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes) kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes)
# insert all items that was in tag to subtag and remove from tag # insert all items that was in tag to subtag and remove from tag
for content in reversed(kwargs["tag"].contents): for content in reversed(kwargs["found_tag"].contents):
tag_to_insert.insert(0, content.extract()) tag_to_insert.insert(0, content.extract())
# wrap subtag with items # wrap subtag with items
kwargs["tag"].append(tag_to_insert) kwargs["found_tag"].append(tag_to_insert)
@staticmethod @staticmethod
def _replace_text(**kwargs): def _replace_text(**kwargs):
if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["tag"].string): if re.search(re.compile(kwargs["rule"]["condition"]["text"]), kwargs["found_tag"].string):
new_text = re.sub(re.compile( new_text = re.sub(re.compile(
kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["tag"].string) kwargs["rule"]["condition"]["text"]), kwargs["rule"]["text_to_replace"], kwargs["found_tag"].string)
kwargs["tag"].string.replace_with(new_text) kwargs["found_tag"].string.replace_with(new_text)
@staticmethod def process_tags(self,
def _process_tags(body_tag: BeautifulSoup, body_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action): action):
""" """
Function does action with tags Function does action with tags
Parameters Parameters
---------- ----------
body_tag: BeautifulSoup body_tag: BeautifulSoup
Tag & contents of the body tag Tag & contents of the body tag
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]] preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
list of conditions when fire function list of conditions when fire function
action: function action: function
action what to do with tag action what to do with tag
@@ -170,39 +219,34 @@ class HtmlPresetsProcessor:
Body Tag with processed certain tags Body Tag with processed certain tags
""" """
for rule in rules: for preset_rule in preset_rules:
tags: List[str] = rule["tags"] if rule.get( tags: List[str] = preset_rule["tags"] if preset_rule.get(
"tags") else rule["condition"]["tags"] "tags") else preset_rule["condition"]["tags"]
if rule["condition"]: found_tags: List[Tag] = []
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): if preset_rule["condition"]:
if condition_on_tag[0] == "parent_tags": conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v)
for parent_tag in body_tag.select(condition_on_tag[1]): for condition_on_tag in conditions_on_tag:
for tag in parent_tag.find_all([re.compile(tag) for tag in tags]): condition_func = self.conditions[condition_on_tag[0]]
# parent_tag != tag.parent was_found, f_tags = condition_func(body_tag=body_tag,
action(body_tag=body_tag, tag=tag, rule=rule) tags=tags,
elif condition_on_tag[0] == "child_tags": rule=preset_rule,
for tag in body_tag.find_all([re.compile(tag) for tag in tags]): family_condition=condition_on_tag[1])
if tag.select(condition_on_tag[1]): found_tags = found_tags + f_tags if was_found else []
action(body_tag=body_tag, tag=tag, rule=rule) if not was_found:
elif condition_on_tag[0] == "attrs": break
for attr in rule["condition"]["attrs"]: # if there are several conditions on tags and found_tags isn't empty
for tag in body_tag.find_all([re.compile(tag) for tag in tags], if len(conditions_on_tag) > 1 and found_tags:
{attr["name"]: re.compile(fr"{attr['value']}")}): # tags satisfying all conditions(>1)
action(body_tag=body_tag, tag=tag, rule=rule) found_tags = [tag for tag in found_tags if found_tags.count(tag) > 1]
elif condition_on_tag[0] == "text": for found_tag in found_tags:
# find all tags that are in List of tags and tags that contains required text action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
for tag in body_tag.find_all(
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))",
t.name) and re.search(re.compile(rule["condition"]["text"]),
t.text)):
action(body_tag=body_tag, tag=tag, rule=rule)
else: else:
for tag in body_tag.find_all([re.compile(tag) for tag in tags]): for found_tag in body_tag.find_all([re.compile(tag) for tag in tags]):
action(body_tag=body_tag, tag=tag, rule=rule) action(body_tag=body_tag, found_tag=found_tag, rule=preset_rule)
def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup): def _process_presets(html_preprocessor: HtmlPresetsProcessor, html_soup: BeautifulSoup):
for rule in html_preprocessor.preset: for preset in html_preprocessor.preset:
# html_preprocessor.logger.log(rule["preset_name"].title() + " process.") # html_preprocessor.logger.log(rule["preset_name"].title() + " process.")
action = html_preprocessor.name2action[rule["preset_name"]] action = html_preprocessor.name2action[preset["preset_name"]]
html_preprocessor._process_tags(html_soup, rule["rules"], action) html_preprocessor.process_tags(html_soup, preset["rules"], action)

View File

@@ -109,24 +109,27 @@ class StyleReader:
return constraints_on_value, value_not_in_possible_values_list return constraints_on_value, value_not_in_possible_values_list
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
for i, style in reversed(list(enumerate(split_style))): try:
style_name, style_value = style.split(":") for i, style in reversed(list(enumerate(split_style))):
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: style_name, style_value = style.split(":")
# property not in LIVECARTA_STYLE_ATTRS, remove if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
split_style.remove(style) # property not in LIVECARTA_STYLE_ATTRS, remove
continue split_style.remove(style)
continue
cleaned_value = self.clean_value(style_value, style_name) cleaned_value = self.clean_value(style_value, style_name)
if all(self.style_conditions(cleaned_value, style_name)): if all(self.style_conditions(cleaned_value, style_name)):
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
split_style.remove(style) split_style.remove(style)
continue continue
else: else:
if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
# function that converts our data # function that converts our data
func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name] func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name]
style_value = func(cleaned_value) style_value = func(cleaned_value)
split_style[i] = style_name + ":" + style_value split_style[i] = style_name + ":" + style_value
except ValueError as ve:
print(f"Style value isn't correct.")
return split_style return split_style
def build_inline_style_content(self, style: str) -> str: def build_inline_style_content(self, style: str) -> str: