forked from LiveCarta/BookConverter
Merge pull request #302 from Teqniksoft/kiryl/converter_fix
Kiryl/converter fix
This commit is contained in:
@@ -13,8 +13,7 @@ from src.inline_style_processor import modify_html_soup_with_css_styles
|
||||
class HtmlDocxProcessor:
|
||||
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
|
||||
self.logger = logger
|
||||
self.html_soup = html_soup
|
||||
self.body_tag = self.html_soup.body
|
||||
self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body))
|
||||
self.html_preprocessor = html_preprocessor
|
||||
self.style_preprocessor = style_preprocessor
|
||||
self.content: List[Tag] = []
|
||||
@@ -23,7 +22,6 @@ class HtmlDocxProcessor:
|
||||
for font in self.body_tag.find_all("font"):
|
||||
font.name = "span"
|
||||
|
||||
|
||||
def _process_hrefs(self):
|
||||
a_tags_with_href = self.body_tag.find_all(
|
||||
"a", {"href": re.compile("^.*http.+")})
|
||||
@@ -206,10 +204,9 @@ class HtmlDocxProcessor:
|
||||
else:
|
||||
h_tag.unwrap()
|
||||
|
||||
|
||||
def delete_content_before_toc(self):
|
||||
# remove every tag above the <TOC> marker - only self.content is trimmed, body_tag itself is not updated
|
||||
toc_tag = self.html_soup.new_tag("TOC")
|
||||
toc_tag = self.body_tag.new_tag("TOC")
|
||||
if toc_tag in self.content:
|
||||
ind = self.content.index(toc_tag) + 1
|
||||
self.content = self.content[ind:]
|
||||
@@ -229,7 +226,7 @@ class HtmlDocxProcessor:
|
||||
self.body_tag)
|
||||
|
||||
self.logger.log("Inline style processing.")
|
||||
modify_html_soup_with_css_styles(self.body_tag)
|
||||
self.body_tag = modify_html_soup_with_css_styles(self.body_tag)
|
||||
|
||||
self.logger.log("Image processing.")
|
||||
images = process_images(access, path_to_html=html_path,
|
||||
@@ -256,9 +253,9 @@ class HtmlDocxProcessor:
|
||||
|
||||
self.logger.log(f".html using presets processing.")
|
||||
_process_presets(html_preprocessor=self.html_preprocessor,
|
||||
html_soup=self.html_soup)
|
||||
html_soup=self.body_tag)
|
||||
|
||||
self.content = self.body_tag.find_all(recursive=False)
|
||||
self.content = self.body_tag.body.find_all(recursive=False)
|
||||
# delete text before table of content if exists
|
||||
self.delete_content_before_toc()
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import re
|
||||
from typing import Union
|
||||
from typing import List, Union
|
||||
from bs4.element import PageElement
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||
|
||||
@@ -92,26 +92,26 @@ class HtmlEpubProcessor:
|
||||
clean/remove headings & add span with id
|
||||
|
||||
"""
|
||||
title_of_chapter = title_of_chapter.lower()
|
||||
for tag in chapter_tag.contents:
|
||||
tag: PageElement
|
||||
def text_preparing(tag: PageElement):
|
||||
text: str = tag if isinstance(tag, NavigableString) else tag.text
|
||||
if re.sub(r"[\s\xa0]", "", text):
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
if not isinstance(tag, NavigableString):
|
||||
if title_of_chapter == text or \
|
||||
(title_of_chapter in text and
|
||||
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
|
||||
self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||
tag, chapter_tag)
|
||||
tag.extract()
|
||||
return
|
||||
elif not self._remove_headings_content(tag, title_of_chapter):
|
||||
break
|
||||
else:
|
||||
tag.extract()
|
||||
return
|
||||
text = re.sub(r"[\s\xa0]", " ", text).lower()
|
||||
text = text.strip() # delete extra spaces
|
||||
return text
|
||||
|
||||
title_of_chapter: str = title_of_chapter.lower()
|
||||
title_in_text: List[Tag] = chapter_tag.find_all(lambda tag: title_of_chapter == text_preparing(tag) or \
|
||||
(title_of_chapter in text_preparing(tag) and
|
||||
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)))
|
||||
|
||||
text_in_title: List[Tag] = chapter_tag.find_all(lambda tag: (text_preparing(tag) in title_of_chapter))
|
||||
if title_in_text:
|
||||
self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||
title_in_text[-1], chapter_tag)
|
||||
title_in_text[-1].extract()
|
||||
elif text_in_title:
|
||||
[self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||
tag, chapter_tag) for tag in text_in_title]
|
||||
[tag.extract() for tag in text_in_title]
|
||||
|
||||
@staticmethod
|
||||
def _class_removing(chapter_tag: BeautifulSoup):
|
||||
|
||||
@@ -28,6 +28,7 @@ class HtmlPresetsProcessor:
|
||||
|
||||
@staticmethod
|
||||
def _decompose_tag(**kwargs):
|
||||
kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
|
||||
kwargs["tag"].decompose()
|
||||
|
||||
@staticmethod
|
||||
@@ -112,6 +113,7 @@ class HtmlPresetsProcessor:
|
||||
|
||||
@staticmethod
|
||||
def _unwrap_tag(**kwargs):
|
||||
kwargs["tag"].parent.attrs.update(kwargs["tag"].attrs)
|
||||
kwargs["tag"].unwrap()
|
||||
|
||||
@staticmethod
|
||||
@@ -153,7 +155,6 @@ class HtmlPresetsProcessor:
|
||||
for parent_tag in body_tag.select(condition_on_tag[1]):
|
||||
for tag in parent_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
# parent_tag != tag.parent
|
||||
tag.parent.attrs.update(tag.attrs)
|
||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
||||
elif condition_on_tag[0] == "child_tags":
|
||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
|
||||
|
||||
@@ -14,7 +14,7 @@ class InlineStyleProcessor:
|
||||
def __init__(self, tag_inline_style: Tag):
|
||||
# tag with inline style + style parsed from css file
|
||||
self.tag_inline_style = tag_inline_style
|
||||
self.tag_inline_style.attrs['style']: str = self.process_inline_style()
|
||||
self.tag_inline_style.attrs["style"]: str = self.process_inline_style()
|
||||
|
||||
@staticmethod
|
||||
def remove_white_if_no_bgcolor(style_: str, tag: Tag) -> str:
|
||||
@@ -80,19 +80,19 @@ class InlineStyleProcessor:
|
||||
processed_style = ";".join(split_style)+';'
|
||||
|
||||
margin_left_regexp = re.compile(
|
||||
r"((margin-left|margin): *(-*\w+);*)")
|
||||
r"((margin-left|margin): *-*((\d*)\.*\d+)\w+;*)")
|
||||
text_indent_regexp = re.compile(
|
||||
r"(text-indent: *(-*\w+);*)")
|
||||
r"(text-indent: *-*((\d*)\.*\d+)\w+;*)")
|
||||
|
||||
has_margin = re.search(margin_left_regexp, processed_style)
|
||||
has_text_indent = re.search(text_indent_regexp, processed_style)
|
||||
if has_margin:
|
||||
num_m = abs(int("0" + "".join(
|
||||
filter(str.isdigit, str(has_margin.group(3))))))
|
||||
filter(str.isdigit, str(has_margin.group(4))))))
|
||||
|
||||
if has_text_indent:
|
||||
num_ti = abs(int("0" + "".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(2))))))
|
||||
filter(str.isdigit, str(has_text_indent.group(3))))))
|
||||
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
|
||||
str(abs(num_m - num_ti)) + "px; ")
|
||||
processed_style = processed_style.replace(
|
||||
@@ -106,7 +106,7 @@ class InlineStyleProcessor:
|
||||
elif has_text_indent:
|
||||
processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " +
|
||||
str(abs(int("0" + "".join(
|
||||
filter(str.isdigit, str(has_text_indent.group(2)))))))
|
||||
filter(str.isdigit, str(has_text_indent.group(3)))))))
|
||||
+ "px; ")
|
||||
return processed_style
|
||||
return processed_style
|
||||
@@ -127,22 +127,25 @@ class InlineStyleProcessor:
|
||||
processed inline style
|
||||
|
||||
"""
|
||||
inline_style = self.tag_inline_style.attrs.get("style") + ";"
|
||||
# 1. Remove white color if tag doesn"t have background color in style
|
||||
inline_style = self.remove_white_if_no_bgcolor(
|
||||
inline_style, self.tag_inline_style)
|
||||
inline_style = inline_style.replace(
|
||||
"list-style-image", "list-style-type")
|
||||
# 2. Create list of styles from inline style
|
||||
# replace all spaces between "; & letter" to ";"
|
||||
style = re.sub(r"; *", ";", inline_style)
|
||||
# when we split style by ";", last element of the list is "" - None (remove it)
|
||||
split_inline_style: list = list(filter(None, style.split(";")))
|
||||
# 3. Duplicate styles check - if the tag had duplicate styles
|
||||
# split_inline_style = self.duplicate_styles_check(split_inline_style)
|
||||
# 4. Processing indents
|
||||
inline_style: str = self.indents_processing(split_inline_style)
|
||||
return inline_style
|
||||
if self.tag_inline_style.attrs.get("style"):
|
||||
inline_style = self.tag_inline_style.attrs.get("style") + ";"
|
||||
# 1. Remove white color if tag doesn't have background color in style
|
||||
inline_style = self.remove_white_if_no_bgcolor(
|
||||
inline_style, self.tag_inline_style)
|
||||
inline_style = inline_style.replace(
|
||||
"list-style-image", "list-style-type")
|
||||
# 2. Create list of styles from inline style
|
||||
# collapse any spaces that follow a ";" so that each declaration starts right after the separator
|
||||
style = re.sub(r"; *", ";", inline_style)
|
||||
# splitting the style by ";" leaves an empty string as the last element; filter(None, ...) drops it
|
||||
split_inline_style: list = list(filter(None, style.split(";")))
|
||||
# 3. Duplicate styles check - if the tag had duplicate styles
|
||||
# split_inline_style = self.duplicate_styles_check(split_inline_style)
|
||||
# 4. Processing indents
|
||||
inline_style: str = self.indents_processing(split_inline_style)
|
||||
return inline_style
|
||||
else:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def check_style_to_be_tag(style: str) -> List[tuple]:
|
||||
|
||||
@@ -59,6 +59,7 @@ class LiveCartaConfig:
|
||||
"font-style": ["italic"], # <i>
|
||||
"text-decoration": ["underline", "line-through"], # <u> , <s>
|
||||
"text-decoration-line": ["underline", "line-through"], # <u> , <s>
|
||||
"text-transform": [],
|
||||
"vertical-align": ["super"], # <sup>
|
||||
"color": [],
|
||||
"background-color": [],
|
||||
@@ -76,4 +77,5 @@ class LiveCartaConfig:
|
||||
"margin-left": [],
|
||||
"margin-top": [],
|
||||
"margin": [],
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import re
|
||||
import cssutils
|
||||
from typing import Tuple
|
||||
from typing import List, Tuple, Union
|
||||
from os.path import dirname, normpath, join
|
||||
|
||||
from src.util.color_reader import str2hex
|
||||
@@ -16,28 +16,29 @@ class StyleReader:
|
||||
to suit LiveCarta style convention.
|
||||
"""
|
||||
self.LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
"text-indent": self.convert_indents_tag_values,
|
||||
"text-indent": lambda x: self.convert_tag_style_values(x, is_indent=True),
|
||||
"font-variant": lambda x: x,
|
||||
"text-align": lambda x: x,
|
||||
"font": lambda x: "",
|
||||
"font-family": lambda x: x,
|
||||
"font-size": self.convert_tag_style_values,
|
||||
"text-transform": lambda x: x,
|
||||
"color": self.get_text_color,
|
||||
"background-color": self.get_bg_color,
|
||||
"background": self.get_bg_color,
|
||||
"border": lambda x: x if x != "0" else "",
|
||||
"border-top-width": lambda x: x if x != "0" else "",
|
||||
"border-right-width": lambda x: x if x != "0" else "",
|
||||
"border-left-width": lambda x: x if x != "0" else "",
|
||||
"border-bottom-width": lambda x: x if x != "0" else "",
|
||||
"border-top": lambda x: x if x != "0" else "",
|
||||
"border-bottom": lambda x: x if x != "0" else "",
|
||||
"border": self.convert_tag_style_values,
|
||||
"border-top-width": self.convert_tag_style_values,
|
||||
"border-right-width": self.convert_tag_style_values,
|
||||
"border-left-width": self.convert_tag_style_values,
|
||||
"border-bottom-width": self.convert_tag_style_values,
|
||||
"border-top": self.convert_tag_style_values,
|
||||
"border-bottom": self.convert_tag_style_values,
|
||||
"list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc",
|
||||
"list-style-image": lambda x: "disc",
|
||||
"margin-left": self.convert_indents_tag_values,
|
||||
"margin-top": self.convert_tag_style_values,
|
||||
"margin": self.convert_indents_tag_values,
|
||||
"width": self.convert_tag_style_values,
|
||||
"margin-left": lambda x: self.convert_tag_style_values(x, is_indent=True),
|
||||
"margin-top": lambda x: self.convert_tag_style_values(x, is_indent=True),
|
||||
"margin": lambda x: self.convert_tag_style_values(x, is_indent=True),
|
||||
"width": lambda x: self.convert_tag_style_values(x) if "%" not in x else x
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -68,43 +69,26 @@ class StyleReader:
|
||||
-------
|
||||
size_value: str
|
||||
converted value size
|
||||
|
||||
"""
|
||||
size_regexp = re.compile(
|
||||
r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)|(^-*(\d*\.*\d+)in$)")
|
||||
has_style_attrs = re.search(size_regexp, size_value)
|
||||
if has_style_attrs:
|
||||
if has_style_attrs.group(1):
|
||||
def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
|
||||
size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
|
||||
return str(size_number) + "px"
|
||||
has_size = re.search(r"(\d+)([\w%]+)", size_value)
|
||||
values: List = size_value.split(" ")
|
||||
if has_size:
|
||||
size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
|
||||
if has_size.group(2) == "%":
|
||||
multiplier = 5.76 if is_indent else 0.16
|
||||
size_value = float(size_value.replace("%", "")) * multiplier
|
||||
return str(size_value) + "px"
|
||||
elif has_style_attrs.group(3):
|
||||
values[size_number_idx] = convert_size_number(values[size_number_idx], "%", multiplier)
|
||||
elif has_size.group(2) == "em":
|
||||
multiplier = 18 if is_indent else 16
|
||||
size_value = float(size_value.replace("em", "")) * multiplier
|
||||
return str(size_value) + "px"
|
||||
elif has_style_attrs.group(5):
|
||||
size_value = float(size_value.replace("pt", "")) * 4/3
|
||||
return str(size_value) + "px"
|
||||
elif has_style_attrs.group(7):
|
||||
size_value = float(size_value.replace("in", "")) * 96
|
||||
return str(size_value) + "px"
|
||||
else:
|
||||
return ""
|
||||
return size_value
|
||||
|
||||
def convert_indents_tag_values(self, size_value: str) -> str:
|
||||
"""
|
||||
Function converts values of ["text-indent", "margin-left", "margin"]
|
||||
Parameters
|
||||
----------
|
||||
size_value: str
|
||||
|
||||
Returns
|
||||
-------
|
||||
size_value: str
|
||||
|
||||
"""
|
||||
size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\
|
||||
else self.convert_tag_style_values(size_value.split(" ")[-1], True)
|
||||
values[size_number_idx] = convert_size_number(values[size_number_idx], "em", multiplier)
|
||||
elif has_size.group(2) == "pt":
|
||||
values[size_number_idx] = convert_size_number(values[size_number_idx], "pt", 4 / 3)
|
||||
elif has_size.group(2) == "in":
|
||||
values[size_number_idx] = convert_size_number(values[size_number_idx], "in", 96)
|
||||
size_value = " ".join(values)
|
||||
return size_value
|
||||
|
||||
@staticmethod
|
||||
@@ -125,17 +109,18 @@ class StyleReader:
|
||||
return constraints_on_value, value_not_in_possible_values_list
|
||||
|
||||
def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list:
|
||||
for i, style in enumerate(split_style):
|
||||
for i, style in reversed(list(enumerate(split_style))):
|
||||
style_name, style_value = style.split(":")
|
||||
if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS:
|
||||
# property not in LIVECARTA_STYLE_ATTRS, remove from css file
|
||||
split_style[i] = ""
|
||||
return split_style
|
||||
# property not in LIVECARTA_STYLE_ATTRS, remove
|
||||
split_style.remove(style)
|
||||
continue
|
||||
|
||||
cleaned_value = self.clean_value(style_value, style_name)
|
||||
if all(self.style_conditions(cleaned_value, style_name)):
|
||||
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file
|
||||
split_style[i] = ""
|
||||
# there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove
|
||||
split_style.remove(style)
|
||||
continue
|
||||
else:
|
||||
if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING:
|
||||
# function that converts our data
|
||||
@@ -156,7 +141,7 @@ class StyleReader:
|
||||
|
||||
split_style = self.update_inline_styles_to_livecarta_convention(
|
||||
split_style)
|
||||
style = "; ".join(split_style)
|
||||
style = "; ".join(split_style) if split_style else ""
|
||||
return style
|
||||
|
||||
def process_inline_styles_in_html_soup(self, html_content):
|
||||
|
||||
@@ -103,7 +103,7 @@ def str2hex(s: str) -> str:
|
||||
return rgb_percent_to_hex((r, g, b))
|
||||
|
||||
if "rgb" in s.lower():
|
||||
rgba = re.findall("([0-9] *\.?[0-9]+)", s)
|
||||
rgba = re.findall("(\d+(?:\.\d+)?)", s)
|
||||
r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2])
|
||||
if len(rgba) == 4:
|
||||
alpha = float(rgba[3])
|
||||
|
||||
Reference in New Issue
Block a user