From dfdf6bc7e998b3676a293a796609bab4ea248aab Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 2 Sep 2022 14:47:06 +0300 Subject: [PATCH] Add inline style processor [Docx] --- src/book_solver.py | 2 +- src/docx_converter/docx_solver.py | 18 +++--- ...preprocessor.py => html_docx_processor.py} | 57 ++++++++++++------- 3 files changed, 50 insertions(+), 27 deletions(-) rename src/docx_converter/{html_docx_preprocessor.py => html_docx_processor.py} (93%) diff --git a/src/book_solver.py b/src/book_solver.py index 3479d6b..4c42f3f 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -78,7 +78,7 @@ class BookSolver: """Method for getting and saving preset from server""" try: pass - self.preset_path = "presets/presets.json" + self.preset_path = "presets/docx_presets.json" # self.logger_object.log(f"Start receiving preset file from server. URL:" # f" {self.access.url}/doc-convert/{self.book_id}/presets") # content = self.access.get_file( diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 209077a..f0b7826 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -5,19 +5,20 @@ from threading import Event from src.book_solver import BookSolver from src.util.helpers import BookLogger +from src.style_preprocessor import StylePreprocessor from src.docx_converter.docx2libre_html import Docx2LibreHTML -from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor +from src.docx_converter.html_docx_processor import HTMLDocxProcessor from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter class DocxBook(BookSolver): """Class of .docx type book - child of BookSolver""" - def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker=None): + def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker: Event = None): super().__init__(book_id, access, main_logger) self.book_type = "docx" # critical section for occupying libreoffice by one thread - self.libre_locker: Event() = libre_locker + self.libre_locker = libre_locker def get_converted_book(self): """ @@ -47,8 +48,9 @@ class DocxBook(BookSolver): # 2. Parses and cleans html, gets list of tags, gets footnotes try: - parser = HTMLDocxPreprocessor( - html_converter.html_soup, self.logger_object) + style_processor = StylePreprocessor() + parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, + logger=self.logger_object, style_processor=style_processor) bs_tags, footnotes, top_level_headers = parser.process_html( self.access, html_converter.html_path, self.book_id) except Exception as exc: @@ -73,7 +75,7 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = "../../books/docx/music_inquiry.docx" + docx_file_path = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx" logger_object = BookLogger( name="docx", book_id=docx_file_path.split("/")[-1]) locker = Event() @@ -82,7 +84,9 @@ if __name__ == "__main__": html_converter = Docx2LibreHTML(file_path=docx_file_path, logger=logger_object, libre_locker=locker) - parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) + css_processor = StylePreprocessor() + parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object, + style_processor=css_processor, preset_path="../../presets/docx_presets.json") content, footnotes, top_level_headers = parser.process_html( html_path=html_converter.html_path, book_id=html_converter.book_id) diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_processor.py similarity index 93% rename from src/docx_converter/html_docx_preprocessor.py rename to src/docx_converter/html_docx_processor.py index fcf468c..959ef55 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_processor.py @@ -1,20 +1,29 @@ import re +import json import pathlib from typing import List, Dict, Union from bs4 import BeautifulSoup, Tag, NavigableString -from src.livecarta_config import LiveCartaConfig from src.util.helpers import BookLogger -from src.docx_converter.footnotes_processing import process_footnotes +from src.livecarta_config import LiveCartaConfig from src.docx_converter.image_processing import process_images +from src.docx_converter.footnotes_processing import process_footnotes +from src.tag_inline_style_processor import modify_html_soup_with_css_styles class HTMLDocxPreprocessor: - def __init__(self, html_soup: BeautifulSoup, logger_object: BookLogger): + def __init__(self, html_soup: BeautifulSoup, logger: BookLogger, + style_processor, preset_path: str = "presets/docx_presets.json"): self.body_tag = html_soup.body self.html_soup = html_soup - self.logger_object = logger_object + self.logger = logger + self.preset = json.load(open(preset_path)) + self.style_processor = style_processor + self.name2action = { + "decomposer": self._decompose_tag, + "unwrapper": self._unwrap_tag + } self.top_level_headers = None self.content = list() @@ -525,11 +534,22 @@ class HTMLDocxPreprocessor: def process_html(self, access=None, html_path: pathlib.Path = "", book_id: int = 0): """Process html code to satisfy LiveCarta formatting.""" - self.logger_object.log("Beginning of processing .html file.") + self.logger.log("Beginning of processing .html file.") - self.logger_object.log(f"Processing TOC and headers.") + self.logger.log(f"Processing TOC and headers.") self._process_toc_links() + self.logger.log("CSS inline style preprocessing.") + self.style_processor.process_inline_styles_in_html_soup(self.html_soup) + + self.logger.log("CSS inline style processing.") + modify_html_soup_with_css_styles(self.html_soup) + + for rule in self.preset: + self.logger.log(rule["preset_name"] + " process.") + action = self.name2action[rule["preset_name"]] + self._process_tags(self.body_tag, rule["rules"], action) + self.clean_trash() # process main elements of the .html doc @@ -538,29 +558,28 @@ class HTMLDocxPreprocessor: self._process_paragraph() self._process_two_columns() - self.logger_object.log("Block quotes processing.") + self.logger.log("Block quotes processing.") self._process_quotes() - self.logger_object.log("Tables processing.") + self.logger.log("Tables processing.") self._process_tables() - self.logger_object.log( + self.logger.log( f"{self.tables_amount} tables have been processed.") - self.logger_object.log("Hrefs processing.") + self.logger.log("Hrefs processing.") self._process_hrefs() - self.logger_object.log("Footnotes processing.") - self.footnotes = process_footnotes(self.body_tag) - self.logger_object.log( - f"{len(self.footnotes)} footnotes have been processed.") - - self.logger_object.log("Image processing.") + self.logger.log("Image processing.") self.images = process_images(access, path_to_html=html_path, book_id=book_id, body_tag=self.body_tag) - self.logger_object.log( + self.logger.log( f"{len(self.images)} images have been processed.") - self._process_footer() + self.logger.log("Footnotes processing.") + self.footnotes = process_footnotes(self.body_tag) + self.logger.log( + f"{len(self.footnotes)} footnotes have been processed.") + self._process_div() self.top_level_headers = self._get_top_level_headers() @@ -572,6 +591,6 @@ class HTMLDocxPreprocessor: # delete text before table of content if exists self.delete_content_before_toc() - self.logger_object.log("End of processing .html file.") + self.logger.log("End of processing .html file.") return self.content, self.footnotes, self.top_level_headers