Merge pull request #324 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
This commit is contained in:
bivis
2022-12-12 13:21:55 +03:00
committed by GitHub
5 changed files with 31 additions and 69 deletions

View File

@@ -2,58 +2,6 @@
{ {
"preset_name": "table_wrapper", "preset_name": "table_wrapper",
"rules": [ "rules": [
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "width",
"value": ".*"
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "border",
"value": ".*"
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "style",
"value": "border.*"
}
]
}
},
{
"tags": ["^div$"],
"condition": {
"parent_tags": null,
"child_tags": null,
"attrs": [
{
"name": "bgcolor",
"value": ".*"
}
]
}
},
{ {
"tags": ["^section$", "^blockquote$"], "tags": ["^section$", "^blockquote$"],
"condition": { "condition": {
@@ -73,7 +21,7 @@
"preset_name": "replacer", "preset_name": "replacer",
"rules": [ "rules": [
{ {
"tags": ["^h[6-9]$", "^figure$", "^section$", "^div$", "blockquote"], "tags": ["^h[6-9]$", "^figure$", "^section$", "blockquote"],
"condition": null, "condition": null,
"tag_to_replace": { "tag_to_replace": {
"name": "p" "name": "p"
@@ -127,7 +75,7 @@
} }
] ]
}, },
{ {
"preset_name": "attrs_remover", "preset_name": "attrs_remover",
"rules": [ "rules": [
{ {

View File

@@ -52,12 +52,13 @@ class EpubBook(BookSolver):
if __name__ == "__main__": if __name__ == "__main__":
epub_file_path = f"../../books/epub/9781614382264.epub" epub_file_path = f"../../books/epub/Deep_Learning_with_Python_Second_Editio.epub"
logger_object = BookLogger(name="epub") logger_object = BookLogger(name="epub")
logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1]) logger_object.configure_book_logger(book_id=epub_file_path.split("/")[-1])
html_preprocessor = HtmlPresetsProcessor( html_preprocessor = HtmlPresetsProcessor(
logger=logger_object, preset_path="../../preset/epub_presets.json") logger=logger_object, preset_path="../../preset/epub_presets.json")
style_preprocessor = StyleReader() style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=logger_object, html_processor = HtmlEpubProcessor(logger=logger_object,

View File

@@ -2,7 +2,7 @@ import re
import json import json
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement from bs4.element import PageElement
from typing import List, Set, Dict, Union from typing import Union
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
@@ -29,42 +29,49 @@ class HtmlPresetsProcessor:
"text": self._tags_with_text_condition "text": self._tags_with_text_condition
} }
@staticmethod @staticmethod
def _tags_with_parent_condition(**kwargs): def _tags_with_parent_condition(**kwargs):
found_tags: Set[Tag] = set() found_tags: list[Tag] = list()
# add unique id in order not to add duplicates to the
# found_tags(because tag with subtag could duplicate found_tag)
u_id = 0
for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]): for parent_tag in kwargs["body_tag"].select(kwargs["family_condition"]):
for tag in parent_tag.find_all([re.compile(tag) for tag in kwargs["tags"]]): for tag in parent_tag.find_all([re.compile(tag) for tag in kwargs["tags"]]):
found_tags.add(tag) if not tag.attrs.get("unique_id"):
tag.attrs["unique_id"] = u_id
u_id += 1
found_tags.append(tag)
return len(found_tags) != 0, list(found_tags) return len(found_tags) != 0, list(found_tags)
@staticmethod @staticmethod
def _tags_with_child_condition(**kwargs): def _tags_with_child_condition(**kwargs):
found_tags: Set[Tag] = set() found_tags: list[Tag] = list()
for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]]): for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]]):
if tag.select(kwargs["family_condition"]): if tag.select(kwargs["family_condition"]):
found_tags.add(tag) found_tags.append(tag)
return len(found_tags) != 0, list(found_tags) return len(found_tags) != 0, list(found_tags)
@staticmethod @staticmethod
def _tags_with_attrs_condition(**kwargs): def _tags_with_attrs_condition(**kwargs):
found_tags: Set[Tag] = set() found_tags: list[Tag] = list()
names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]] names = [attr["name"] for attr in kwargs["rule"]["condition"]["attrs"]]
values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]] values = [re.compile(attr["value"]) for attr in kwargs["rule"]["condition"]["attrs"]]
attr_conditions: Dict[str, str] = dict(zip(names, values)) attr_conditions: dict[str, re] = dict(zip(names, values))
for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]], for tag in kwargs["body_tag"].find_all([re.compile(tag) for tag in kwargs["tags"]],
attr_conditions): attr_conditions):
found_tags.add(tag) found_tags.append(tag)
return len(found_tags) != 0, list(found_tags) return len(found_tags) != 0, list(found_tags)
@staticmethod @staticmethod
def _tags_with_text_condition(**kwargs): def _tags_with_text_condition(**kwargs):
# find all tags that are in List of tags and tags that contains required text # find all tags that are in List of tags and tags that contains required text
found_tags: Set[Tag] = set() found_tags: list[Tag] = list()
for tag in kwargs["body_tag"].find_all( for tag in kwargs["body_tag"].find_all(
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))", lambda t: re.search(r"(?=(" + '|'.join([tag for tag in kwargs["tags"]]) + r"))",
t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]), t.name) and re.search(re.compile(kwargs["rule"]["condition"]["text"]),
t.text)): t.text)):
found_tags.add(tag) found_tags.append(tag)
return len(found_tags) != 0, list(found_tags) return len(found_tags) != 0, list(found_tags)
@staticmethod @staticmethod
@@ -104,7 +111,7 @@ class HtmlPresetsProcessor:
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
tag_to_be_removed: Tag, tag_to_be_removed: Tag,
id_: str, id_: str,
class_: Union[List[str], str]): class_: Union[list[str], str]):
"""Function inserts span before tag aren't supported by LiveCarta""" """Function inserts span before tag aren't supported by LiveCarta"""
new_tag: Tag = chapter_tag.new_tag("span") new_tag: Tag = chapter_tag.new_tag("span")
new_tag.attrs["id"] = id_ or "" new_tag.attrs["id"] = id_ or ""
@@ -201,7 +208,7 @@ class HtmlPresetsProcessor:
def process_tags(self, def process_tags(self,
body_tag: BeautifulSoup, body_tag: BeautifulSoup,
preset_rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]], preset_rules: list[dict[str, Union[list[str], str, dict[str, Union[list[dict[str, str]], int, str]]]]],
action): action):
""" """
Function does action with tags Function does action with tags
@@ -220,9 +227,9 @@ class HtmlPresetsProcessor:
""" """
for preset_rule in preset_rules: for preset_rule in preset_rules:
tags: List[str] = preset_rule["tags"] if preset_rule.get( tags: list[str] = preset_rule["tags"] if preset_rule.get(
"tags") else preset_rule["condition"]["tags"] "tags") else preset_rule["condition"]["tags"]
found_tags: List[Tag] = [] found_tags: list[Tag] = []
if preset_rule["condition"]: if preset_rule["condition"]:
conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v) conditions_on_tag = tuple((k, v) for k, v in preset_rule["condition"].items() if v)
for condition_on_tag in conditions_on_tag: for condition_on_tag in conditions_on_tag:

View File

@@ -71,6 +71,8 @@ class LiveCartaConfig:
"border-left-width": [], "border-left-width": [],
"border-bottom-width": [], "border-bottom-width": [],
"border-top": [], "border-top": [],
"border-right": [],
"border-left": [],
"border-bottom": [], "border-bottom": [],
"list-style-type": [], "list-style-type": [],
"list-style-image": [], "list-style-image": [],

View File

@@ -32,6 +32,8 @@ class StyleReader:
"border-left-width": self.convert_tag_style_values, "border-left-width": self.convert_tag_style_values,
"border-bottom-width": self.convert_tag_style_values, "border-bottom-width": self.convert_tag_style_values,
"border-top": self.convert_tag_style_values, "border-top": self.convert_tag_style_values,
"border-right": self.convert_tag_style_values,
"border-left": self.convert_tag_style_values,
"border-bottom": self.convert_tag_style_values, "border-bottom": self.convert_tag_style_values,
"list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
"list-style-image": lambda x: "disc", "list-style-image": lambda x: "disc",
@@ -88,6 +90,8 @@ class StyleReader:
values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3) values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3)
elif has_size.group(2) == "in": elif has_size.group(2) == "in":
values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96) values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96)
elif has_size.group(2) == "rem":
values[size_number_idx] = convert_size_number(values[size_number_idx], "rem", 16)
size_value = " ".join(values) size_value = " ".join(values)
return size_value return size_value