diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py
index 05c413e..c92e997 100644
--- a/src/docx_converter/html_docx_processor.py
+++ b/src/docx_converter/html_docx_processor.py
@@ -13,16 +13,15 @@ from src.inline_style_processor import modify_html_soup_with_css_styles
class HtmlDocxProcessor:
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
self.logger = logger
- self.html_soup = html_soup
+ self.body_tag: BeautifulSoup = BeautifulSoup(str(html_soup.body))
self.html_preprocessor = html_preprocessor
self.style_preprocessor = style_preprocessor
self.content: List[Tag] = []
def _font_to_span(self):
- for font in self.html_soup.find_all("font"):
+ for font in self.body_tag.find_all("font"):
font.name = "span"
-
def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all(
"a", {"href": re.compile("^.*http.+")})
@@ -205,10 +204,9 @@ class HtmlDocxProcessor:
else:
h_tag.unwrap()
-
def delete_content_before_toc(self):
# remove all tag upper the only in content !!! body tag is not updated
- toc_tag = self.html_soup.new_tag("TOC")
+ toc_tag = self.body_tag.new_tag("TOC")
if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1
self.content = self.content[ind:]
@@ -225,12 +223,10 @@ class HtmlDocxProcessor:
self.logger.log("Inline style reading.")
self.style_preprocessor.process_inline_styles_in_html_soup(
- self.html_soup)
+ self.body_tag)
self.logger.log("Inline style processing.")
- self.html_soup = modify_html_soup_with_css_styles(self.html_soup)
-
- self.body_tag = self.html_soup.body
+ self.body_tag = modify_html_soup_with_css_styles(self.body_tag)
self.logger.log("Image processing.")
images = process_images(access, path_to_html=html_path,
@@ -257,9 +253,9 @@ class HtmlDocxProcessor:
self.logger.log(f".html using presets processing.")
_process_presets(html_preprocessor=self.html_preprocessor,
- html_soup=self.html_soup)
+ html_soup=self.body_tag)
- self.content = self.body_tag.find_all(recursive=False)
+ self.content = self.body_tag.body.find_all(recursive=False)
# delete text before table of content if exists
self.delete_content_before_toc()