Add decomposer & unwrapper processes

This commit is contained in:
Kiryl
2022-09-02 14:48:01 +03:00
parent dfdf6bc7e9
commit 142c9bbe69

View File

@@ -11,7 +11,7 @@ from src.docx_converter.footnotes_processing import process_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles from src.tag_inline_style_processor import modify_html_soup_with_css_styles
class HTMLDocxPreprocessor: class HTMLDocxProcessor:
def __init__(self, html_soup: BeautifulSoup, logger: BookLogger, def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
style_processor, preset_path: str = "presets/docx_presets.json"): style_processor, preset_path: str = "presets/docx_presets.json"):
@@ -28,6 +28,7 @@ class HTMLDocxPreprocessor:
self.content = list() self.content = list()
def _process_toc_links(self): def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
def _check_parent_link_exist_in_toc(tag_with_link: Tag) -> bool: def _check_parent_link_exist_in_toc(tag_with_link: Tag) -> bool:
toc_links = [] toc_links = []
for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}): for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}):
@@ -36,65 +37,76 @@ class HTMLDocxPreprocessor:
if toc_item: if toc_item:
toc_links.append(toc_item) toc_links.append(toc_item)
return len(toc_links) > 0 return len(toc_links) > 0
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all( toc_links = self.body_tag.find_all(
"a", {"name": re.compile(r"^_Toc\d+")}) "a", {"name": re.compile(r"^_Toc\d+")})
headers = [link.parent for link in toc_links] headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1> outline_level = "1" # All the unknown outlines will be predicted as <h1>
for h_tag in headers: for tag in headers:
if re.search(r"^h\d$", h_tag.name): if re.search(r"^h\d$", tag.name):
h_tag.a.unwrap() tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level elif tag.name == "p":
elif h_tag.name == "p": exist_in_toc = _check_parent_link_exist_in_toc(tag)
exist_in_toc = _check_parent_link_exist_in_toc(h_tag) if tag in self.body_tag.find_all("p") and exist_in_toc:
if h_tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup( new_tag = BeautifulSoup(
features="lxml").new_tag("h" + outline_level) features="lxml").new_tag("h" + outline_level)
text = h_tag.text text = tag.text
h_tag.replaceWith(new_tag) tag.replaceWith(new_tag)
new_tag.string = text new_tag.string = text
else: else:
# rethink document structure when you have toc_links, other cases? # rethink document structure when you have toc_links, other cases?
self.logger_object.log(f"Something went wrong in processing toc_links." self.logger.log(f"Something went wrong in processing toc_links."
f" Check the structure of the file. " f"Check the structure of the file."
f"Tag name: {h_tag.name}") f"Tag name: {tag.name}")
@staticmethod
def _decompose_tag(tag):
    """Action for ``_process_tags``: remove ``tag`` together with its contents."""
    tag.decompose()

@staticmethod
def _unwrap_tag(tag):
    """Action for ``_process_tags``: remove ``tag`` itself but keep its children in place."""
    tag.unwrap()

@staticmethod
def _process_tags(body_tag: BeautifulSoup,
                  rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                  action):
    """
    Apply ``action`` to every tag of ``body_tag`` that is selected by ``rules``.

    Each rule is a dict with two keys:

    * ``"tags"`` - list of regex patterns for tag names. For CSS selection
      the ``^`` and ``$`` anchors are stripped and the rest is used as a
      plain tag name.
    * ``"condition"`` - either a falsy value (every tag matching ``"tags"``
      is processed) or a dict whose non-empty entries are handled
      independently, one after another:

      - ``"parent_tags"``: selector of the direct parent; tags matching
        ``<parent_tags> > <tag>`` are processed, and their attributes are
        copied onto the parent (overriding same-named ones) first.
      - ``"child_tags"``: selector suffix appended directly to the tag name,
        i.e. tags matching ``<tag><child_tags>`` are processed
        (presumably a pseudo-class such as ``:has(...)`` - TODO confirm
        against the preset file).
      - ``"attrs"``: list of ``{"name": ..., "value": ...}`` dicts; tags
        whose attribute ``name`` matches the regex ``value`` are processed.

    Parameters
    ----------
    body_tag: BeautifulSoup
        Tag & contents of the chapter tag
    rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
        list of rules describing which tags the action is applied to
    action: function
        callable taking a single tag, e.g. ``_decompose_tag`` or ``_unwrap_tag``

    Returns
    -------
    None
        ``body_tag`` is modified in place

    """
    for rule in rules:
        tag_patterns: List[str] = rule["tags"]
        conditions = rule["condition"]

        if not conditions:
            # Unconditional rule: every tag whose name matches a pattern.
            for found_tag in body_tag.find_all([re.compile(pattern) for pattern in tag_patterns]):
                action(found_tag)
            continue

        for condition_name, condition_value in conditions.items():
            if not condition_value:
                # Unset (None / empty) conditions are ignored.
                continue

            if condition_name == "parent_tags":
                # CSS selectors need plain tag names, not anchored regexes.
                css_names = [re.sub(r"[\^$]", "", pattern) for pattern in tag_patterns]
                selector = ', '.join([condition_value + " > " + name for name in css_names])
                for found_tag in body_tag.select(selector):
                    # Keep the tag's attributes on the parent before the tag goes away.
                    found_tag.parent.attrs.update(found_tag.attrs)
                    action(found_tag)
            elif condition_name == "child_tags":
                css_names = [re.sub(r"[\^$]", "", pattern) for pattern in tag_patterns]
                selector = ', '.join([name + condition_value for name in css_names])
                for found_tag in body_tag.select(selector):
                    action(found_tag)
            elif condition_name == "attrs":
                # Compile the tag-name patterns once instead of once per attribute.
                name_regexes = [re.compile(pattern) for pattern in tag_patterns]
                for attr in condition_value:
                    # str() keeps the coercion the original f-string performed.
                    value_regex = re.compile(str(attr["value"]))
                    # The tree is searched again for every attribute, so tags
                    # already removed by a previous iteration are not revisited.
                    for found_tag in body_tag.find_all(name_regexes, {attr["name"]: value_regex}):
                        action(found_tag)
@classmethod @classmethod
def convert_pt_to_px(cls, value: float) -> float: def convert_pt_to_px(cls, value: float) -> float:
@@ -155,21 +167,8 @@ class HTMLDocxPreprocessor:
assert len(self.body_tag.find_all("font")) == 0 assert len(self.body_tag.find_all("font")) == 0
def clean_trash(self): def clean_trash(self):
# todo make it regex dict
"""Function to remove all styles and tags we don"t need.""" """Function to remove all styles and tags we don"t need."""
self._clean_tag("span", "style", re.compile( # todo replacer
r"^background: #[\da-fA-F]{6}$"))
# todo: check for another languages
self._clean_tag("span", "lang", re.compile(r"^ru-RU$"))
self._clean_tag("span", "style", re.compile(
"^letter-spacing: -?[\d.]+pt$"))
self._clean_tag("font", "face", re.compile(
r"^Times New Roman[\w, ]+$"))
self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links()
self._font_to_span() self._font_to_span()
# replace toc with empty <TOC> tag # replace toc with empty <TOC> tag
@@ -180,7 +179,7 @@ class HTMLDocxPreprocessor:
table.decompose() table.decompose()
def _preprocessing_headings(self): def _preprocessing_headings(self):
# todo regex # todo replacer
"""Function to convert all lower level headings to p tags""" """Function to convert all lower level headings to p tags"""
pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$" pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$"
header_tags = self.body_tag.find_all(re.compile(pattern)) header_tags = self.body_tag.find_all(re.compile(pattern))
@@ -189,6 +188,7 @@ class HTMLDocxPreprocessor:
def _process_paragraph(self): def _process_paragraph(self):
"""Function to process <p> tags (text-align and text-indent value).""" """Function to process <p> tags (text-align and text-indent value)."""
# todo debug and remove if inline is enough
paragraphs = self.body_tag.find_all("p") paragraphs = self.body_tag.find_all("p")
for p in paragraphs: for p in paragraphs:
@@ -239,6 +239,7 @@ class HTMLDocxPreprocessor:
def _process_two_columns(self): def _process_two_columns(self):
"""Function to process paragraphs which has two columns layout.""" """Function to process paragraphs which has two columns layout."""
# todo replacer
two_columns = self.body_tag.find_all("div", style="column-count: 2") two_columns = self.body_tag.find_all("div", style="column-count: 2")
for div in two_columns: for div in two_columns:
for child in div.children: for child in div.children:
@@ -289,14 +290,11 @@ class HTMLDocxPreprocessor:
tables = self.body_tag.find_all("table") tables = self.body_tag.find_all("table")
for table in tables: for table in tables:
tds = table.find_all("td") tds = table.find_all("td")
sizes = [] sizes = []
for td in tds: for td in tds:
style = td.get("style") style = td.get("style")
if style: if style:
match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style)
if match: if match:
size = match.group(1) size = match.group(1)
units = match.group(2) units = match.group(2)
@@ -305,13 +303,10 @@ class HTMLDocxPreprocessor:
size = self.convert_pt_to_px(size) size = self.convert_pt_to_px(size)
sizes.append(float(size)) sizes.append(float(size))
width = td.get("width") width = td.get("width")
td.attrs = {} td.attrs = {}
if width: if width:
td.attrs["width"] = width td.attrs["width"] = width
if sizes: if sizes:
border_size = sum(sizes) / len(sizes) border_size = sum(sizes) / len(sizes)
table.attrs["border"] = f"{border_size:.2}" table.attrs["border"] = f"{border_size:.2}"
@@ -334,18 +329,8 @@ class HTMLDocxPreprocessor:
tag.string = tag.text.replace("\u200b", "") # zero-width-space tag.string = tag.text.replace("\u200b", "") # zero-width-space
tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "")
def _process_footer(self):
# todo regex
"""
Function to process <div title="footer"> tags.
All the tags will be deleted from file.
"""
divs = self.body_tag.find_all("div", {"title": "footer"})
for div in divs:
div.decompose()
def _process_div(self): def _process_div(self):
# todo regex # todo unwrapper
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.""" """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div") divs = self.body_tag.find_all("div")
for div in divs: for div in divs:
@@ -394,7 +379,7 @@ class HTMLDocxPreprocessor:
def _mark_introduction_headers(self): def _mark_introduction_headers(self):
""" """
Function to find out: Function to find out:
what header shouldn"t be numbered and can be treated as introduction chapter what header shouldn't be numbered and can be treated as introduction chapter
Assume header(s) to be introduction if: Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header 1. one header not numbered, before 1 numbered header
2. it is first header from the top level list, and it equals to "introduction" 2. it is first header from the top level list, and it equals to "introduction"
@@ -442,13 +427,9 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child(children[0], func) self.apply_func_to_last_child(children[0], func)
def _process_headings(self): def _process_headings(self):
# todo regex
""" """
Function to process tags <h>. Function to process tags <h>.
Steps Clean header from attrs and text in header from numbering and \n
----------
1. remove <b>, <span>
2. clean text in header from numbering and \n
Returns Returns
------- -------
@@ -458,34 +439,22 @@ class HTMLDocxPreprocessor:
""" """
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span> # clean header from attrs and text in header from numbering and \n
for tag in header_tags: for h_tag in header_tags:
b_tags = tag.find_all("b") h_tag.attrs = {}
[tag.unwrap() for tag in b_tags] if h_tag.parent.name == "li":
h_tag.parent.unwrap()
while h_tag.parent.name == "ol":
h_tag.parent.unwrap()
spans = tag.find_all("span") cleaned_title = re.sub(r"[\s\xa0]", " ", h_tag.text)
if spans:
[span.unwrap() for span in spans]
tag.attrs = {}
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 2. clean text in header from numbering and \n
for tag in header_tags:
if tag.parent.name == "li":
tag.parent.unwrap()
while tag.parent.name == "ol":
tag.parent.unwrap()
cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text)
if cleaned_title == "": if cleaned_title == "":
tag.unwrap() h_tag.unwrap()
else: else:
assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ assert h_tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \
f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings." f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings."
content = list(tag.children) content = list(h_tag.children)
# do not take into account rubbish empty tags like <a>, but don"t remove them # do not take into account rubbish empty tags like <a>, but don"t remove them
content = [item for item in content if content = [item for item in content if
@@ -506,24 +475,6 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child( self.apply_func_to_last_child(
content[i], self.clean_title_from_tabs) content[i], self.clean_title_from_tabs)
def _process_lists(self):
# todo regex
"""
Function
- process tags <li>.
- unwrap <p> tags.
Returns
-------
None
uwrap <p> tag with li
"""
li_tags = self.body_tag.find_all("li")
for li_tag in li_tags:
li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap()
def delete_content_before_toc(self): def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated # remove all tag upper the <TOC> only in content !!! body tag is not updated
toc_tag = self.html_soup.new_tag("TOC") toc_tag = self.html_soup.new_tag("TOC")
@@ -553,7 +504,7 @@ class HTMLDocxPreprocessor:
self.clean_trash() self.clean_trash()
# process main elements of the .html doc # process main elements of the .html doc
self.logger_object.log(f"Processing main elements of html.") self.logger.log(f"Processing main elements of html.")
self._preprocessing_headings() self._preprocessing_headings()
self._process_paragraph() self._process_paragraph()
self._process_two_columns() self._process_two_columns()
@@ -587,7 +538,6 @@ class HTMLDocxPreprocessor:
self._process_headings() self._process_headings()
self._process_lists()
# delete text before table of content if exists # delete text before table of content if exists
self.delete_content_before_toc() self.delete_content_before_toc()