Merge pull request #307 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
This commit is contained in:
bivis
2022-10-14 10:27:39 +03:00
committed by GitHub
5 changed files with 144 additions and 49 deletions

View File

@@ -80,7 +80,8 @@ class DocxBook(BookSolver):
if __name__ == "__main__":
docx_file_path = "../../books/docx/AmericanGovernment3e-WEB.docx"
docx_file_path = "../../books/docx/output.docx"
logger_object = BookLogger(
name="docx", book_id=docx_file_path.split("/")[-1])
locker = Event()
@@ -88,7 +89,6 @@ if __name__ == "__main__":
html_converter = Docx2LibreHtml(file_path=docx_file_path,
logger=logger_object, libre_locker=locker)
html_preprocessor = HtmlPresetsProcessor(
logger=logger_object, preset_path="../../presets/docx_presets.json")
style_preprocessor = StyleReader()

View File

@@ -43,7 +43,7 @@ class EpubBook(BookSolver):
if __name__ == "__main__":
epub_file_path = "../../books/epub/9780763774134.epub"
epub_file_path = "../../books/epub/9781634259804.epub"
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])

View File

@@ -18,13 +18,14 @@ class HtmlPresetsProcessor:
"replacer": self._replace_tag,
"attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag,
"inserter": self._insert_tag
"inserter": self._insert_tag,
"text_replacer": self._replace_text
}
@staticmethod
def _wrap_tag(**kwargs):
kwargs["tag"].wrap(kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_wrap"]))
kwargs["rule"]["tag_to_wrap"]["name"]))
@staticmethod
def set_attrs_to_parent(tag, parent_tag):
@@ -98,24 +99,31 @@ class HtmlPresetsProcessor:
@staticmethod
def _replace_tag(**kwargs):
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]["name"]
kwargs["tag"].name = tag_to_replace
if kwargs["rule"]["tag_to_replace"].get("attrs"):
dict_attributes = {attr["name"]: attr["value"]
for attr in kwargs["rule"]["tag_to_replace"]["attrs"]}
kwargs["tag"].attrs = dict_attributes
@staticmethod
def _replace_attr(**kwargs):
attr, attr_value =\
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
attr = kwargs["rule"]["condition"]["attrs"][0]
attr_name, attr_value =\
attr["name"], attr["value"]
attr_to_replace, attr_value_to_replace =\
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
if attr_to_replace:
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr_name] \
if kwargs["tag"].get(attr_name)\
else ""
if attr_value_to_replace:
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
del kwargs["tag"][attr]
del kwargs["tag"][attr_name]
elif attr_value_to_replace:
kwargs["tag"].attrs[attr] = attr_value_to_replace
elif attr:
del kwargs["tag"][attr]
kwargs["tag"].attrs[attr_name] = attr_value_to_replace
elif attr_name:
del kwargs["tag"][attr_name]
def _unwrap_tag(self, **kwargs):
if kwargs["tag"].parent:
@@ -124,14 +132,24 @@ class HtmlPresetsProcessor:
@staticmethod
def _insert_tag(**kwargs):
dict_attributes = {attr["name"]: attr["value"]
for attr in kwargs["rule"]["tag_to_insert"]["attrs"]}
tag_to_insert = \
kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
kwargs["body_tag"].new_tag(
kwargs["rule"]["tag_to_insert"]["name"], attrs=dict_attributes)
# insert all items that was in tag to subtag and remove from tag
for content in reversed(kwargs["tag"].contents):
tag_to_insert.insert(0, content.extract())
# wrap subtag with items
kwargs["tag"].append(tag_to_insert)
@staticmethod
def _replace_text(**kwargs):
    """Apply a regex text substitution to the tag's single string child.

    Keyword arguments:
        tag: the tag whose ``.string`` is rewritten in place.
        rule: preset rule; ``rule["condition"]["text"]`` is the regex to
            look for and ``rule["text_to_replace"]`` is the replacement.

    Nothing happens when the tag has no single string child or when the
    pattern does not match that string.
    """
    tag_string = kwargs["tag"].string
    # `.string` is None when the tag does not wrap exactly one string
    # (e.g. text mixed with nested tags). Tags are selected by their
    # concatenated `.text`, so such tags can reach this point; passing
    # None to the regex functions would raise TypeError.
    if tag_string is None:
        return
    pattern = re.compile(kwargs["rule"]["condition"]["text"])
    # subn reports how many substitutions happened, so one pass both
    # tests for a match and builds the replacement text.
    new_text, substitutions = pattern.subn(
        kwargs["rule"]["text_to_replace"], tag_string)
    if substitutions:
        tag_string.replace_with(new_text)
@staticmethod
def _process_tags(body_tag: BeautifulSoup,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
@@ -171,11 +189,12 @@ class HtmlPresetsProcessor:
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
action(body_tag=body_tag, tag=tag, rule=rule)
# attr replacer
elif condition_on_tag[0] == "tags":
attr = rule["attr"]
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr['name']: re.compile(fr"{attr['value']}")}):
elif condition_on_tag[0] == "text":
# find all tags that are in List of tags and tags that contains required text
for tag in body_tag.find_all(
lambda t: re.search(r"(?=(" + '|'.join([tag for tag in tags]) + r"))",
t.name) and re.search(re.compile(rule["condition"]["text"]),
t.text)):
action(body_tag=body_tag, tag=tag, rule=rule)
else:
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):