forked from LiveCarta/BookConverter
Add inline style processor [Docx]
This commit is contained in:
@@ -78,7 +78,7 @@ class BookSolver:
|
||||
"""Method for getting and saving preset from server"""
|
||||
try:
|
||||
pass
|
||||
self.preset_path = "presets/presets.json"
|
||||
self.preset_path = "presets/docx_presets.json"
|
||||
# self.logger_object.log(f"Start receiving preset file from server. URL:"
|
||||
# f" {self.access.url}/doc-convert/{self.book_id}/presets")
|
||||
# content = self.access.get_file(
|
||||
|
||||
@@ -5,19 +5,20 @@ from threading import Event
|
||||
|
||||
from src.book_solver import BookSolver
|
||||
from src.util.helpers import BookLogger
|
||||
from src.style_preprocessor import StylePreprocessor
|
||||
from src.docx_converter.docx2libre_html import Docx2LibreHTML
|
||||
from src.docx_converter.html_docx_preprocessor import HTMLDocxPreprocessor
|
||||
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
|
||||
from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter
|
||||
|
||||
|
||||
class DocxBook(BookSolver):
|
||||
"""Class of .docx type book - child of BookSolver"""
|
||||
|
||||
def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker=None):
|
||||
def __init__(self, book_id: int = 0, access=None, main_logger=None, libre_locker: Event = None):
|
||||
super().__init__(book_id, access, main_logger)
|
||||
self.book_type = "docx"
|
||||
# critical section for occupying libreoffice by one thread
|
||||
self.libre_locker: Event() = libre_locker
|
||||
self.libre_locker = libre_locker
|
||||
|
||||
def get_converted_book(self):
|
||||
"""
|
||||
@@ -47,8 +48,9 @@ class DocxBook(BookSolver):
|
||||
|
||||
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
||||
try:
|
||||
parser = HTMLDocxPreprocessor(
|
||||
html_converter.html_soup, self.logger_object)
|
||||
style_processor = StylePreprocessor()
|
||||
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
|
||||
logger=self.logger_object, style_processor=style_processor)
|
||||
bs_tags, footnotes, top_level_headers = parser.process_html(
|
||||
self.access, html_converter.html_path, self.book_id)
|
||||
except Exception as exc:
|
||||
@@ -73,7 +75,7 @@ class DocxBook(BookSolver):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
docx_file_path = "../../books/docx/music_inquiry.docx"
|
||||
docx_file_path = "../../books/docx/Bar_Exam_MPT_2e_prepared.docx"
|
||||
logger_object = BookLogger(
|
||||
name="docx", book_id=docx_file_path.split("/")[-1])
|
||||
locker = Event()
|
||||
@@ -82,7 +84,9 @@ if __name__ == "__main__":
|
||||
html_converter = Docx2LibreHTML(file_path=docx_file_path,
|
||||
logger=logger_object, libre_locker=locker)
|
||||
|
||||
parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object)
|
||||
css_processor = StylePreprocessor()
|
||||
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
||||
style_processor=css_processor, preset_path="../../presets/docx_presets.json")
|
||||
content, footnotes, top_level_headers = parser.process_html(
|
||||
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
||||
|
||||
|
||||
@@ -1,20 +1,29 @@
|
||||
import re
|
||||
import json
|
||||
import pathlib
|
||||
from typing import List, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.util.helpers import BookLogger
|
||||
from src.docx_converter.footnotes_processing import process_footnotes
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.docx_converter.image_processing import process_images
|
||||
from src.docx_converter.footnotes_processing import process_footnotes
|
||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||
|
||||
|
||||
class HTMLDocxPreprocessor:
|
||||
|
||||
def __init__(self, html_soup: BeautifulSoup, logger_object: BookLogger):
|
||||
def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
|
||||
style_processor, preset_path: str = "presets/docx_presets.json"):
|
||||
self.body_tag = html_soup.body
|
||||
self.html_soup = html_soup
|
||||
self.logger_object = logger_object
|
||||
self.logger = logger
|
||||
self.preset = json.load(open(preset_path))
|
||||
self.style_processor = style_processor
|
||||
self.name2action = {
|
||||
"decomposer": self._decompose_tag,
|
||||
"unwrapper": self._unwrap_tag
|
||||
}
|
||||
self.top_level_headers = None
|
||||
self.content = list()
|
||||
|
||||
@@ -525,11 +534,22 @@ class HTMLDocxPreprocessor:
|
||||
|
||||
def process_html(self, access=None, html_path: pathlib.Path = "", book_id: int = 0):
|
||||
"""Process html code to satisfy LiveCarta formatting."""
|
||||
self.logger_object.log("Beginning of processing .html file.")
|
||||
self.logger.log("Beginning of processing .html file.")
|
||||
|
||||
self.logger_object.log(f"Processing TOC and headers.")
|
||||
self.logger.log(f"Processing TOC and headers.")
|
||||
self._process_toc_links()
|
||||
|
||||
self.logger.log("CSS inline style preprocessing.")
|
||||
self.style_processor.process_inline_styles_in_html_soup(self.html_soup)
|
||||
|
||||
self.logger.log("CSS inline style processing.")
|
||||
modify_html_soup_with_css_styles(self.html_soup)
|
||||
|
||||
for rule in self.preset:
|
||||
self.logger.log(rule["preset_name"] + " process.")
|
||||
action = self.name2action[rule["preset_name"]]
|
||||
self._process_tags(self.body_tag, rule["rules"], action)
|
||||
|
||||
self.clean_trash()
|
||||
|
||||
# process main elements of the .html doc
|
||||
@@ -538,29 +558,28 @@ class HTMLDocxPreprocessor:
|
||||
self._process_paragraph()
|
||||
self._process_two_columns()
|
||||
|
||||
self.logger_object.log("Block quotes processing.")
|
||||
self.logger.log("Block quotes processing.")
|
||||
self._process_quotes()
|
||||
|
||||
self.logger_object.log("Tables processing.")
|
||||
self.logger.log("Tables processing.")
|
||||
self._process_tables()
|
||||
self.logger_object.log(
|
||||
self.logger.log(
|
||||
f"{self.tables_amount} tables have been processed.")
|
||||
|
||||
self.logger_object.log("Hrefs processing.")
|
||||
self.logger.log("Hrefs processing.")
|
||||
self._process_hrefs()
|
||||
|
||||
self.logger_object.log("Footnotes processing.")
|
||||
self.footnotes = process_footnotes(self.body_tag)
|
||||
self.logger_object.log(
|
||||
f"{len(self.footnotes)} footnotes have been processed.")
|
||||
|
||||
self.logger_object.log("Image processing.")
|
||||
self.logger.log("Image processing.")
|
||||
self.images = process_images(access, path_to_html=html_path,
|
||||
book_id=book_id, body_tag=self.body_tag)
|
||||
self.logger_object.log(
|
||||
self.logger.log(
|
||||
f"{len(self.images)} images have been processed.")
|
||||
|
||||
self._process_footer()
|
||||
self.logger.log("Footnotes processing.")
|
||||
self.footnotes = process_footnotes(self.body_tag)
|
||||
self.logger.log(
|
||||
f"{len(self.footnotes)} footnotes have been processed.")
|
||||
|
||||
self._process_div()
|
||||
|
||||
self.top_level_headers = self._get_top_level_headers()
|
||||
@@ -572,6 +591,6 @@ class HTMLDocxPreprocessor:
|
||||
# delete text before table of content if exists
|
||||
self.delete_content_before_toc()
|
||||
|
||||
self.logger_object.log("End of processing .html file.")
|
||||
self.logger.log("End of processing .html file.")
|
||||
|
||||
return self.content, self.footnotes, self.top_level_headers
|
||||
Reference in New Issue
Block a user