From d7a9c52acdb1c1dcd7762d74ad1cfd4ff307b8e1 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 19 Sep 2022 19:25:17 +0300 Subject: [PATCH] Change working process of docx body --- src/docx_converter/html_docx_processor.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py index 05c413e..c92e997 100644 --- a/src/docx_converter/html_docx_processor.py +++ b/src/docx_converter/html_docx_processor.py @@ -13,16 +13,15 @@ from src.inline_style_processor import modify_html_soup_with_css_styles class HtmlDocxProcessor: def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor): self.logger = logger - self.html_soup = html_soup + self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body)) self.html_preprocessor = html_preprocessor self.style_preprocessor = style_preprocessor self.content: List[Tag] = [] def _font_to_span(self): - for font in self.html_soup.find_all("font"): + for font in self.body_tag.find_all("font"): font.name = "span" - def _process_hrefs(self): a_tags_with_href = self.body_tag.find_all( "a", {"href": re.compile("^.*http.+")}) @@ -205,10 +204,9 @@ class HtmlDocxProcessor: else: h_tag.unwrap() - def delete_content_before_toc(self): # remove all tag upper the only in content !!! 
body tag is not updated - toc_tag = self.html_soup.new_tag("TOC") + toc_tag = self.body_tag.new_tag("TOC") if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 self.content = self.content[ind:] @@ -225,12 +223,10 @@ class HtmlDocxProcessor: self.logger.log("Inline style reading.") self.style_preprocessor.process_inline_styles_in_html_soup( - self.html_soup) + self.body_tag) self.logger.log("Inline style processing.") - self.html_soup = modify_html_soup_with_css_styles(self.html_soup) - - self.body_tag = self.html_soup.body + self.body_tag = modify_html_soup_with_css_styles(self.body_tag) self.logger.log("Image processing.") images = process_images(access, path_to_html=html_path, @@ -257,9 +253,9 @@ class HtmlDocxProcessor: self.logger.log(f".html using presets processing.") _process_presets(html_preprocessor=self.html_preprocessor, - html_soup=self.html_soup) + html_soup=self.body_tag) - self.content = self.body_tag.find_all(recursive=False) + self.content = self.body_tag.body.find_all(recursive=False) # delete text before table of content if exists self.delete_content_before_toc()