forked from LiveCarta/BookConverter
Merge processing tags[Docx, Epub]
This commit is contained in:
@@ -5,6 +5,7 @@ from threading import Event
|
||||
|
||||
from src.book_solver import BookSolver
|
||||
from src.util.helpers import BookLogger
|
||||
from src.html_preprocessor import HtmlPreprocessor
|
||||
from src.style_preprocessor import StylePreprocessor
|
||||
from src.docx_converter.docx2libre_html import Docx2LibreHTML
|
||||
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
|
||||
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):
|
||||
|
||||
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
||||
try:
|
||||
style_processor = StylePreprocessor()
|
||||
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
|
||||
logger=self.logger_object, style_processor=style_processor)
|
||||
bs_tags, footnotes, top_level_headers = parser.process_html(
|
||||
html_preprocessor = HtmlPreprocessor(
|
||||
logger=self.logger_object, preset_path="presets/docx_presets.json")
|
||||
style_preprocessor = StylePreprocessor()
|
||||
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
|
||||
logger=self.logger_object,
|
||||
html_preprocessor=html_preprocessor,
|
||||
style_preprocessor=style_preprocessor)
|
||||
bs_tags, footnotes, top_level_headers = html_processor.process_html(
|
||||
self.access, html_converter.html_path, self.book_id)
|
||||
except Exception as exc:
|
||||
self.logger_object.log(
|
||||
@@ -84,10 +89,12 @@ if __name__ == "__main__":
|
||||
html_converter = Docx2LibreHTML(file_path=docx_file_path,
|
||||
logger=logger_object, libre_locker=locker)
|
||||
|
||||
css_processor = StylePreprocessor()
|
||||
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
||||
style_processor=css_processor, preset_path="../../presets/docx_presets.json")
|
||||
content, footnotes, top_level_headers = parser.process_html(
|
||||
html_preprocessor = HtmlPreprocessor(
|
||||
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
||||
style_preprocessor = StylePreprocessor()
|
||||
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
||||
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
|
||||
content, footnotes, top_level_headers = html_processor.process_html(
|
||||
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
||||
|
||||
json_converter = LibreHTML2JSONConverter(
|
||||
|
||||
@@ -1,32 +1,23 @@
|
||||
import re
|
||||
import json
|
||||
import pathlib
|
||||
from typing import List, Tuple, Dict, Union
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.html_preprocessor import _preprocess_html
|
||||
from src.docx_converter.image_processing import process_images
|
||||
from src.docx_converter.footnotes_processing import process_footnotes
|
||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||
|
||||
|
||||
class HTMLDocxProcessor:
|
||||
|
||||
def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
|
||||
style_processor, preset_path: str = "presets/docx_presets.json"):
|
||||
self.html_soup = html_soup
|
||||
self.body_tag = html_soup.body
|
||||
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
|
||||
self.logger = logger
|
||||
self.preset = json.load(open(preset_path))
|
||||
self.style_processor = style_processor
|
||||
self.name2action = {
|
||||
"wrapper": self._wrap_tag,
|
||||
"decomposer": self._decompose_tag,
|
||||
"replacer": self._replace_tag,
|
||||
"attr_replacer": self._replace_attr,
|
||||
"unwrapper": self._unwrap_tag
|
||||
}
|
||||
self.html_soup = html_soup
|
||||
self.body_tag = self.html_soup.body
|
||||
self.html_preprocessor = html_preprocessor
|
||||
self.style_preprocessor = style_preprocessor
|
||||
|
||||
def _process_toc_links(self):
|
||||
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
|
||||
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
|
||||
f"Check the structure of the file."
|
||||
f"Tag name: {tag.name}")
|
||||
|
||||
def _wrap_tag(self, **kwargs):
|
||||
kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"]))
|
||||
|
||||
@staticmethod
|
||||
def _decompose_tag(**kwargs):
|
||||
kwargs["tag"].decompose()
|
||||
|
||||
@staticmethod
|
||||
def _replace_tag(**kwargs):
|
||||
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
|
||||
kwargs["tag"].name = tag_to_replace
|
||||
|
||||
@staticmethod
|
||||
def _replace_attr(**kwargs):
|
||||
attr, attr_value =\
|
||||
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
|
||||
attr_to_replace, attr_value_to_replace =\
|
||||
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
|
||||
if attr_to_replace:
|
||||
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
|
||||
if attr_value_to_replace:
|
||||
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
|
||||
del kwargs["tag"][attr]
|
||||
elif attr_value_to_replace:
|
||||
kwargs["tag"].attrs[attr] = attr_value_to_replace
|
||||
|
||||
@staticmethod
|
||||
def _unwrap_tag(**kwargs):
|
||||
kwargs["tag"].unwrap()
|
||||
|
||||
@staticmethod
def _process_tags(body_tag: Tag,
                  rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                  action):
    """
    Apply an action to every tag that matches a list of rules.

    Parameters
    ----------
    body_tag: Tag
        Tag that is searched with ``select`` / ``find_all``
    rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
        rules describing which tags to pick; every rule has a "condition"
        entry and tag-name patterns either under "tags" or under
        condition["tags"]
    action: function
        callable invoked as ``action(body_tag=..., tag=..., rule=...)``
        for every matching tag

    Returns
    -------
    None
        Nothing is returned; whatever ``action`` does to the tags is the
        only effect, apart from the attribute copy in the "parent_tags" case.

    """
    # NOTE(review): the source indentation was lost; the unconditional branch
    # is assumed to be the `else` of `if rule["condition"]` - confirm.
    for rule in rules:
        tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]

        if not rule["condition"]:
            # No condition at all: every tag whose name matches is processed.
            for found in body_tag.find_all([re.compile(name) for name in tags]):
                action(body_tag=body_tag, tag=found, rule=rule)
            continue

        for kind, value in rule["condition"].items():
            if not value:
                # Empty conditions are skipped.
                continue

            if kind == "parent_tags":
                # The regex anchors are stripped so the pattern can be used
                # as a plain tag name inside a CSS selector.
                selector = ', '.join(value + " > " + re.sub(r"[\^$]", "", name)
                                     for name in tags)
                for found in body_tag.select(selector):
                    # The parent receives the child's attributes first.
                    found.parent.attrs.update(found.attrs)
                    action(body_tag=body_tag, tag=found, rule=rule)
            elif kind == "child_tags":
                selector = ', '.join(re.sub(r"[\^$]", "", name) + value
                                     for name in tags)
                for found in body_tag.select(selector):
                    action(body_tag=body_tag, tag=found, rule=rule)
            elif kind == "attrs":
                for attr in rule["condition"]["attrs"]:
                    matches = body_tag.find_all(
                        [re.compile(name) for name in tags],
                        {attr["name"]: re.compile(fr"{attr['value']}")})
                    for found in matches:
                        action(body_tag=body_tag, tag=found, rule=rule)
            # attr replacer
            elif kind == "tags":
                attr = rule["attr"]
                matches = body_tag.find_all(
                    [re.compile(name) for name in tags],
                    {attr['name']: re.compile(fr"{attr['value']}")})
                for found in matches:
                    action(body_tag=body_tag, tag=found, rule=rule)
|
||||
|
||||
def _process_quotes(self):
|
||||
"""
|
||||
Function to process block quotes.
|
||||
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:
|
||||
|
||||
table.replaceWith(new_div)
|
||||
|
||||
@staticmethod
def convert_pt_to_px(value: float) -> float:
    """
    Map the Word default font size to the LiveCarta default font size.

    The input is converted with ``float()``. If the result equals
    ``LiveCartaConfig.WORD_DEFAULT_FONT_SIZE``, the value of
    ``LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE`` is returned; every other
    size is returned unchanged (as a float). Despite the name, no unit
    scaling is performed.
    """
    numeric = float(value)
    if numeric != LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
        return numeric
    return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
|
||||
|
||||
def _process_tables(self):
|
||||
"""Function to process tables. Set "border" attribute."""
|
||||
tables = self.body_tag.find_all("table")
|
||||
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
|
||||
size = match.group(1)
|
||||
units = match.group(2)
|
||||
if units == "pt":
|
||||
size = self.convert_pt_to_px(size)
|
||||
value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
|
||||
if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
|
||||
else float(size)
|
||||
size = value
|
||||
sizes.append(float(size))
|
||||
width = td.get("width")
|
||||
td.attrs = {}
|
||||
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
|
||||
self.logger.log(f"Processing TOC and headers.")
|
||||
self._process_toc_links()
|
||||
|
||||
for rule in self.preset:
|
||||
self.logger.log(rule["preset_name"].title() + " process.")
|
||||
action = self.name2action[rule["preset_name"]]
|
||||
self._process_tags(self.body_tag, rule["rules"], action)
|
||||
_preprocess_html(html_preprocessor=self.html_preprocessor,
|
||||
html_soup=self.html_soup)
|
||||
|
||||
# CSS is processed after the HTML processing because of <font> tags, which aren't supported by HTML
|
||||
self.logger.log("CSS inline style preprocessing.")
|
||||
self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
|
||||
self.style_preprocessor.process_inline_styles_in_html_soup(
|
||||
self.body_tag)
|
||||
|
||||
self.logger.log("CSS inline style processing.")
|
||||
modify_html_soup_with_css_styles(self.body_tag)
|
||||
|
||||
Reference in New Issue
Block a user