Add inline style processor [Docx]

This commit is contained in:
Kiryl
2022-09-02 14:47:06 +03:00
parent b97c5d8371
commit dfdf6bc7e9
3 changed files with 50 additions and 27 deletions

View File

@@ -78,7 +78,7 @@ class BookSolver:
"""Method for getting and saving preset from server""" """Method for getting and saving preset from server"""
try: try:
pass pass
self.preset_path = "presets/presets.json" self.preset_path = "presets/docx_presets.json"
# self.logger_object.log(f"Start receiving preset file from server. URL:" # self.logger_object.log(f"Start receiving preset file from server. URL:"
# f" {self.access.url}/doc-convert/{self.book_id}/presets") # f" {self.access.url}/doc-convert/{self.book_id}/presets")
# content = self.access.get_file( # content = self.access.get_file(

View File

@@ -5,19 +5,20 @@ from threading import Event
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.style_preprocessor import StylePreprocessor
from src.docx_converter.docx2libre_html import Docx2LibreHTML from src.docx_converter.docx2libre_html import Docx2LibreHTML
from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor from src.docx_converter.html_docx_processor import HTMLDocxProcessor
from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter
class DocxBook(BookSolver): class DocxBook(BookSolver):
"""Class of .docx type book - child of BookSolver""" """Class of .docx type book - child of BookSolver"""
def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker=None): def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker: Event = None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)
self.book_type = "docx" self.book_type = "docx"
# critical section for occupying libreoffice by one thread # critical section for occupying libreoffice by one thread
self.libre_locker: Event() = libre_locker self.libre_locker = libre_locker
def get_converted_book(self): def get_converted_book(self):
""" """
@@ -47,8 +48,9 @@ class DocxBook(BookSolver):
# 2. Parses and cleans html, gets list of tags, gets footnotes # 2. Parses and cleans html, gets list of tags, gets footnotes
try: try:
parser = HTMLDocxPreprocessor( style_processor = StylePreprocessor()
html_converter.html_soup, self.logger_object) parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
logger=self.logger_object, style_processor=style_processor)
bs_tags, footnotes, top_level_headers = parser.process_html( bs_tags, footnotes, top_level_headers = parser.process_html(
self.access, html_converter.html_path, self.book_id) self.access, html_converter.html_path, self.book_id)
except Exception as exc: except Exception as exc:
@@ -73,7 +75,7 @@ class DocxBook(BookSolver):
if __name__ == "__main__": if __name__ == "__main__":
docx_file_path = "../../books/docx/music_inquiry.docx" docx_file_path = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx"
logger_object = BookLogger( logger_object = BookLogger(
name="docx", book_id=docx_file_path.split("/")[-1]) name="docx", book_id=docx_file_path.split("/")[-1])
locker = Event() locker = Event()
@@ -82,7 +84,9 @@ if __name__ == "__main__":
html_converter = Docx2LibreHTML(file_path=docx_file_path, html_converter = Docx2LibreHTML(file_path=docx_file_path,
logger=logger_object, libre_locker=locker) logger=logger_object, libre_locker=locker)
parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) css_processor = StylePreprocessor()
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
style_processor=css_processor, preset_path="../../presets/docx_presets.json")
content, footnotes, top_level_headers = parser.process_html( content, footnotes, top_level_headers = parser.process_html(
html_path=html_converter.html_path, book_id=html_converter.book_id) html_path=html_converter.html_path, book_id=html_converter.book_id)

View File

@@ -1,20 +1,29 @@
import re import re
import json
import pathlib import pathlib
from typing import List, Dict, Union from typing import List, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString from bs4 import BeautifulSoup, Tag, NavigableString
from src.livecarta_config import LiveCartaConfig
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.docx_converter.footnotes_processing import process_footnotes from src.livecarta_config import LiveCartaConfig
from src.docx_converter.image_processing import process_images from src.docx_converter.image_processing import process_images
from src.docx_converter.footnotes_processing import process_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
class HTMLDocxPreprocessor: class HTMLDocxPreprocessor:
def __init__(self, html_soup: BeautifulSoup, logger_object: BookLogger): def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
style_processor, preset_path: str = "presets/docx_presets.json"):
self.body_tag = html_soup.body self.body_tag = html_soup.body
self.html_soup = html_soup self.html_soup = html_soup
self.logger_object = logger_object self.logger = logger
self.preset = json.load(open(preset_path))
self.style_processor = style_processor
self.name2action = {
"decomposer": self._decompose_tag,
"unwrapper": self._unwrap_tag
}
self.top_level_headers = None self.top_level_headers = None
self.content = list() self.content = list()
@@ -525,11 +534,22 @@ class HTMLDocxPreprocessor:
def process_html(self, access=None, html_path: pathlib.Path = "", book_id: int = 0): def process_html(self, access=None, html_path: pathlib.Path = "", book_id: int = 0):
"""Process html code to satisfy LiveCarta formatting.""" """Process html code to satisfy LiveCarta formatting."""
self.logger_object.log("Beginning of processing .html file.") self.logger.log("Beginning of processing .html file.")
self.logger_object.log(f"Processing TOC and headers.") self.logger.log(f"Processing TOC and headers.")
self._process_toc_links() self._process_toc_links()
self.logger.log("CSS inline style preprocessing.")
self.style_processor.process_inline_styles_in_html_soup(self.html_soup)
self.logger.log("CSS inline style processing.")
modify_html_soup_with_css_styles(self.html_soup)
for rule in self.preset:
self.logger.log(rule["preset_name"] + " process.")
action = self.name2action[rule["preset_name"]]
self._process_tags(self.body_tag, rule["rules"], action)
self.clean_trash() self.clean_trash()
# process main elements of the .html doc # process main elements of the .html doc
@@ -538,29 +558,28 @@ class HTMLDocxPreprocessor:
self._process_paragraph() self._process_paragraph()
self._process_two_columns() self._process_two_columns()
self.logger_object.log("Block quotes processing.") self.logger.log("Block quotes processing.")
self._process_quotes() self._process_quotes()
self.logger_object.log("Tables processing.") self.logger.log("Tables processing.")
self._process_tables() self._process_tables()
self.logger_object.log( self.logger.log(
f"{self.tables_amount} tables have been processed.") f"{self.tables_amount} tables have been processed.")
self.logger_object.log("Hrefs processing.") self.logger.log("Hrefs processing.")
self._process_hrefs() self._process_hrefs()
self.logger_object.log("Footnotes processing.") self.logger.log("Image processing.")
self.footnotes = process_footnotes(self.body_tag)
self.logger_object.log(
f"{len(self.footnotes)} footnotes have been processed.")
self.logger_object.log("Image processing.")
self.images = process_images(access, path_to_html=html_path, self.images = process_images(access, path_to_html=html_path,
book_id=book_id, body_tag=self.body_tag) book_id=book_id, body_tag=self.body_tag)
self.logger_object.log( self.logger.log(
f"{len(self.images)} images have been processed.") f"{len(self.images)} images have been processed.")
self._process_footer() self.logger.log("Footnotes processing.")
self.footnotes = process_footnotes(self.body_tag)
self.logger.log(
f"{len(self.footnotes)} footnotes have been processed.")
self._process_div() self._process_div()
self.top_level_headers = self._get_top_level_headers() self.top_level_headers = self._get_top_level_headers()
@@ -572,6 +591,6 @@ class HTMLDocxPreprocessor:
# delete text before table of content if exists # delete text before table of content if exists
self.delete_content_before_toc() self.delete_content_before_toc()
self.logger_object.log("End of processing .html file.") self.logger.log("End of processing .html file.")
return self.content, self.footnotes, self.top_level_headers return self.content, self.footnotes, self.top_level_headers