Merge processing tags[Docx, Epub]

This commit is contained in:
Kiryl
2022-09-06 16:26:08 +03:00
parent ea37b19c36
commit ddc45e2d04
6 changed files with 226 additions and 277 deletions

View File

@@ -5,6 +5,7 @@ from threading import Event
from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.html_preprocessor import HtmlPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.docx_converter.docx2libre_html import Docx2LibreHTML
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):
# 2. Parses and cleans html, gets list of tags, gets footnotes
try:
style_processor = StylePreprocessor()
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
logger=self.logger_object, style_processor=style_processor)
bs_tags, footnotes, top_level_headers = parser.process_html(
html_preprocessor = HtmlPreprocessor(
logger=self.logger_object, preset_path="presets/docx_presets.json")
style_preprocessor = StylePreprocessor()
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
logger=self.logger_object,
html_preprocessor=html_preprocessor,
style_preprocessor=style_preprocessor)
bs_tags, footnotes, top_level_headers = html_processor.process_html(
self.access, html_converter.html_path, self.book_id)
except Exception as exc:
self.logger_object.log(
@@ -84,10 +89,12 @@ if __name__ == "__main__":
html_converter = Docx2LibreHTML(file_path=docx_file_path,
logger=logger_object, libre_locker=locker)
css_processor = StylePreprocessor()
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
style_processor=css_processor, preset_path="../../presets/docx_presets.json")
content, footnotes, top_level_headers = parser.process_html(
html_preprocessor = HtmlPreprocessor(
logger=logger_object, preset_path="../../presets/docx_presets.json")
style_preprocessor = StylePreprocessor()
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
content, footnotes, top_level_headers = html_processor.process_html(
html_path=html_converter.html_path, book_id=html_converter.book_id)
json_converter = LibreHTML2JSONConverter(

View File

@@ -1,32 +1,23 @@
import re
import json
import pathlib
from typing import List, Tuple, Dict, Union
from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.html_preprocessor import _preprocess_html
from src.docx_converter.image_processing import process_images
from src.docx_converter.footnotes_processing import process_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
class HTMLDocxProcessor:
def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
style_processor, preset_path: str = "presets/docx_presets.json"):
self.html_soup = html_soup
self.body_tag = html_soup.body
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
self.logger = logger
self.preset = json.load(open(preset_path))
self.style_processor = style_processor
self.name2action = {
"wrapper": self._wrap_tag,
"decomposer": self._decompose_tag,
"replacer": self._replace_tag,
"attr_replacer": self._replace_attr,
"unwrapper": self._unwrap_tag
}
self.html_soup = html_soup
self.body_tag = self.html_soup.body
self.html_preprocessor = html_preprocessor
self.style_preprocessor = style_preprocessor
def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
f"Check the structure of the file."
f"Tag name: {tag.name}")
def _wrap_tag(self, **kwargs):
    """
    Wrap the matched tag in a newly created tag.

    Reads ``kwargs["tag"]`` (the tag to wrap) and ``kwargs["rule"]``; the
    rule's "tag_to_wrap" entry is the name handed to
    ``self.html_soup.new_tag`` to build the wrapper.
    """
    target = kwargs["tag"]
    wrapper_name = kwargs["rule"]["tag_to_wrap"]
    wrapper = self.html_soup.new_tag(wrapper_name)
    target.wrap(wrapper)
@staticmethod
def _decompose_tag(**kwargs):
    """Call ``decompose()`` on the matched tag passed as ``kwargs["tag"]``."""
    matched_tag = kwargs["tag"]
    matched_tag.decompose()
@staticmethod
def _replace_tag(**kwargs):
    """
    Rename the matched tag.

    The new name is taken from the "tag_to_replace" entry of
    ``kwargs["rule"]`` and assigned to ``kwargs["tag"].name``.
    """
    rule = kwargs["rule"]
    kwargs["tag"].name = rule["tag_to_replace"]
@staticmethod
def _replace_attr(**kwargs):
    """
    Rename an attribute of the matched tag and/or overwrite its value.

    Parameters
    ----------
    kwargs["tag"]
        tag whose attributes are edited in place
    kwargs["rule"]
        rule holding "attr" (its "name" is the attribute to act on) and
        "attr_to_replace" (its "name" is the new attribute name, its "value"
        is the new attribute value; either may be empty)

    Returns
    -------
    None
        the tag is modified in place

    """
    tag = kwargs["tag"]
    rule = kwargs["rule"]
    attr = rule["attr"]["name"]
    attr_to_replace = rule["attr_to_replace"]["name"]
    attr_value_to_replace = rule["attr_to_replace"]["value"]

    if attr_to_replace:
        # Carry the current value over to the new name, then optionally override it.
        tag[attr_to_replace] = tag[attr]
        if attr_value_to_replace:
            tag.attrs[attr_to_replace] = attr_value_to_replace
        # When the "new" name equals the old one, deleting the old attribute
        # would drop the value that was just written.
        if attr_to_replace != attr:
            del tag[attr]
    elif attr_value_to_replace:
        # No rename requested: only the value of the existing attribute changes.
        tag.attrs[attr] = attr_value_to_replace
@staticmethod
def _unwrap_tag(**kwargs):
    """Call ``unwrap()`` on the matched tag passed as ``kwargs["tag"]``."""
    matched_tag = kwargs["tag"]
    matched_tag.unwrap()
@staticmethod
def _process_tags(body_tag: Tag,
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
action):
"""
Apply an action to every tag of the body that matches one of the rules.

Parameters
----------
body_tag: Tag
    tag that is searched for matching tags
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
    rules to apply; each rule provides the tag name patterns ("tags", or
    "tags" inside "condition") and a "condition" that narrows the match
action: function
    callable invoked as action(body_tag=..., tag=..., rule=...) for each matched tag

Returns
-------
None
    nothing is returned; the action is called for each matched tag

"""
for rule in rules:
# Tag name patterns come from the rule itself or, if absent/empty there, from its condition.
tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
if rule["condition"]:
# Only condition entries with a truthy value are evaluated.
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
if condition_on_tag[0] == "parent_tags":
# CSS selector "<parent> > <tag>"; "^" and "$" are stripped from the tag patterns.
for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
for tag in tags])):
# The matched tag's attributes are copied onto its parent before the action runs.
tag.parent.attrs.update(tag.attrs)
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "child_tags":
# The condition value is appended directly to each tag name to form the CSS selector.
for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
for tag in tags])):
action(body_tag=body_tag, tag=tag, rule=rule)
elif condition_on_tag[0] == "attrs":
# One search per listed attribute: tag names and attribute value are matched as regexes.
for attr in rule["condition"]["attrs"]:
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr["name"]: re.compile(fr"{attr['value']}")}):
action(body_tag=body_tag, tag=tag, rule=rule)
# attr replacer: the attribute to match is taken from rule["attr"]
elif condition_on_tag[0] == "tags":
attr = rule["attr"]
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
{attr['name']: re.compile(fr"{attr['value']}")}):
action(body_tag=body_tag, tag=tag, rule=rule)
# NOTE(review): indentation is not visible here; this else presumably pairs with
# `if rule["condition"]`, i.e. rules without a condition act on every tag matching the names.
else:
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
action(body_tag=body_tag, tag=tag, rule=rule)
def _process_quotes(self):
"""
Function to process block quotes.
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:
table.replaceWith(new_div)
@staticmethod
def convert_pt_to_px(value: float) -> float:
    """
    Map a font size given in pt to the size used in the output.

    The value is cast to float. If it equals
    LiveCartaConfig.WORD_DEFAULT_FONT_SIZE, the result is
    LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE; any other size is returned
    unchanged (no unit conversion is performed).
    """
    size = float(value)
    if size != LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
        return size
    return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
def _process_tables(self):
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
size = match.group(1)
units = match.group(2)
if units == "pt":
size = self.convert_pt_to_px(size)
value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
else float(size)
size = value
sizes.append(float(size))
width = td.get("width")
td.attrs = {}
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
self.logger.log(f"Processing TOC and headers.")
self._process_toc_links()
for rule in self.preset:
self.logger.log(rule["preset_name"].title() + " process.")
action = self.name2action[rule["preset_name"]]
self._process_tags(self.body_tag, rule["rules"], action)
_preprocess_html(html_preprocessor=self.html_preprocessor,
html_soup=self.html_soup)
# CSS is processed after the HTML preprocessing because <font> tags aren't supported by HTML
self.logger.log("CSS inline style preprocessing.")
self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
self.style_preprocessor.process_inline_styles_in_html_soup(
self.body_tag)
self.logger.log("CSS inline style processing.")
modify_html_soup_with_css_styles(self.body_tag)