diff --git a/src/docx_converter/html_docx_processor.py b/src/docx_converter/html_docx_processor.py index 7868f02..b515a37 100644 --- a/src/docx_converter/html_docx_processor.py +++ b/src/docx_converter/html_docx_processor.py @@ -137,57 +137,6 @@ class HTMLDocxProcessor: for tag in body_tag.find_all([re.compile(tag) for tag in tags]): action(body_tag=body_tag, tag=tag, rule=rule) - def _process_paragraph(self): - """Function to process

<p> tags (text-align and text-indent value).""" - # todo debug and remove if inline is enough - paragraphs = self.body_tag.find_all("p") - - for p in paragraphs: - # libre converts some \n into

<p> with 2 <br>
- # there we remove 1 unnecessary <br>
- brs = p.find_all("br") - text = p.text - - if brs and text == "\n\n" and len(brs) == 2: - brs[0].decompose() - - indent_should_be_added = False - if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")): - indent_should_be_added = True - - align = p.get("align") - style = p.get("style") - - if style: - indent = re.search(r"text-indent: ([\d.]{1,4})in", style) - margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style) - margin_right = re.search( - r"margin-right: ([\d.]{1,4})in", style) - margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style) - margin_bottom = re.search( - r"margin-bottom: ([\d.]{1,4})in", style) - else: - indent = margin_left = margin_right = \ - margin_top = margin_bottom = None - - if margin_left and margin_right and margin_top and margin_bottom and \ - margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \ - margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11": - p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote")) - - p.attrs = {} - style = "" - - if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: - style += f"text-align: {align};" - - if indent is not None or indent_should_be_added: - # indent = indent.group(1) - style += f"text-indent: {LiveCartaConfig.INDENT};" - - if style: - p.attrs["style"] = style - def _process_quotes(self): """ Function to process block quotes. 
@@ -247,10 +196,8 @@ class HTMLDocxProcessor: if match: size = match.group(1) units = match.group(2) - if units == "pt": size = self.convert_pt_to_px(size) - sizes.append(float(size)) width = td.get("width") td.attrs = {} @@ -259,7 +206,6 @@ class HTMLDocxProcessor: if sizes: border_size = sum(sizes) / len(sizes) table.attrs["border"] = f"{border_size:.2}" - self.tables_amount = len(tables) def _process_hrefs(self): @@ -278,13 +224,6 @@ class HTMLDocxProcessor: tag.string = tag.text.replace("\u200b", "") # zero-width-space tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") - def _process_div(self): - # todo unwrapper - """Function to process <div>

tags. All the tags will be deleted from file, all content of the tags will stay.""" - divs = self.body_tag.find_all("div") - for div in divs: - div.unwrap() - def _get_top_level_headers(self) -> List[Dict[str, Union[str, bool]]]: """ Function for gathering info about top-level chapters. @@ -439,35 +378,6 @@ class HTMLDocxProcessor: """Process html code to satisfy LiveCarta formatting.""" self.logger.log("Beginning of processing .html file.") - self.logger.log(f"Processing TOC and headers.") - self._process_toc_links() - - for rule in self.preset: - self.logger.log(rule["preset_name"] + " process.") - action = self.name2action[rule["preset_name"]] - self._process_tags(self.body_tag, rule["rules"], action) - - self.logger.log("CSS inline style preprocessing.") - self.style_processor.process_inline_styles_in_html_soup(self.html_soup) - - self.logger.log("CSS inline style processing.") - modify_html_soup_with_css_styles(self.html_soup) - - # process main elements of the .html doc - self.logger.log(f"Processing main elements of html.") - self._process_paragraph() - - self.logger.log("Block quotes processing.") - self._process_quotes() - - self.logger.log("Tables processing.") - self._process_tables() - self.logger.log( - f"{self.tables_amount} tables have been processed.") - - self.logger.log("Hrefs processing.") - self._process_hrefs() - self.logger.log("Image processing.") self.images = process_images(access, path_to_html=html_path, book_id=book_id, body_tag=self.body_tag) @@ -479,7 +389,34 @@ class HTMLDocxProcessor: self.logger.log( f"{len(self.footnotes)} footnotes have been processed.") - self._process_div() + self.logger.log(f"Processing TOC and headers.") + self._process_toc_links() + + for rule in self.preset: + self.logger.log(rule["preset_name"].title() + " process.") + action = self.name2action[rule["preset_name"]] + self._process_tags(self.body_tag, rule["rules"], action) + + # CSS after html processing cause of that aren't supported by html + 
self.logger.log("CSS inline style preprocessing.") + self.style_processor.process_inline_styles_in_html_soup(self.body_tag) + + self.logger.log("CSS inline style processing.") + modify_html_soup_with_css_styles(self.body_tag) + + # process main elements of the .html doc + self.logger.log(f"Processing main elements of html.") + + self.logger.log("Block quotes processing.") + self._process_quotes() + + self.logger.log("Tables processing.") + self._process_tables() + self.logger.log( + f"{self.tables_amount} tables have been processed.") + + self.logger.log("Hrefs processing.") + self._process_hrefs() self.top_level_headers: List[Dict[str, Union[str, bool]]]\ = self._get_top_level_headers()