forked from LiveCarta/BookConverter
Merge processing tags[Docx, Epub]
This commit is contained in:
@@ -5,6 +5,7 @@ from threading import Event
|
|||||||
|
|
||||||
from src.book_solver import BookSolver
|
from src.book_solver import BookSolver
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
|
from src.html_preprocessor import HtmlPreprocessor
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.docx_converter.docx2libre_html import Docx2LibreHTML
|
from src.docx_converter.docx2libre_html import Docx2LibreHTML
|
||||||
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
|
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
|
||||||
@@ -48,10 +49,14 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
# 2. Parses and cleans html, gets list of tags, gets footnotes
|
||||||
try:
|
try:
|
||||||
style_processor = StylePreprocessor()
|
html_preprocessor = HtmlPreprocessor(
|
||||||
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup,
|
logger=self.logger_object, preset_path="presets/docx_presets.json")
|
||||||
logger=self.logger_object, style_processor=style_processor)
|
style_preprocessor = StylePreprocessor()
|
||||||
bs_tags, footnotes, top_level_headers = parser.process_html(
|
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
|
||||||
|
logger=self.logger_object,
|
||||||
|
html_preprocessor=html_preprocessor,
|
||||||
|
style_preprocessor=style_preprocessor)
|
||||||
|
bs_tags, footnotes, top_level_headers = html_processor.process_html(
|
||||||
self.access, html_converter.html_path, self.book_id)
|
self.access, html_converter.html_path, self.book_id)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
self.logger_object.log(
|
self.logger_object.log(
|
||||||
@@ -84,10 +89,12 @@ if __name__ == "__main__":
|
|||||||
html_converter = Docx2LibreHTML(file_path=docx_file_path,
|
html_converter = Docx2LibreHTML(file_path=docx_file_path,
|
||||||
logger=logger_object, libre_locker=locker)
|
logger=logger_object, libre_locker=locker)
|
||||||
|
|
||||||
css_processor = StylePreprocessor()
|
html_preprocessor = HtmlPreprocessor(
|
||||||
parser = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
||||||
style_processor=css_processor, preset_path="../../presets/docx_presets.json")
|
style_preprocessor = StylePreprocessor()
|
||||||
content, footnotes, top_level_headers = parser.process_html(
|
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
||||||
|
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
|
||||||
|
content, footnotes, top_level_headers = html_processor.process_html(
|
||||||
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
||||||
|
|
||||||
json_converter = LibreHTML2JSONConverter(
|
json_converter = LibreHTML2JSONConverter(
|
||||||
|
|||||||
@@ -1,32 +1,23 @@
|
|||||||
import re
|
import re
|
||||||
import json
|
|
||||||
import pathlib
|
import pathlib
|
||||||
from typing import List, Tuple, Dict, Union
|
from typing import List, Tuple, Dict, Union
|
||||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||||
|
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
|
from src.html_preprocessor import _preprocess_html
|
||||||
from src.docx_converter.image_processing import process_images
|
from src.docx_converter.image_processing import process_images
|
||||||
from src.docx_converter.footnotes_processing import process_footnotes
|
from src.docx_converter.footnotes_processing import process_footnotes
|
||||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocxProcessor:
|
class HTMLDocxProcessor:
|
||||||
|
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
|
||||||
def __init__(self, html_soup: BeautifulSoup, logger: BookLogger,
|
|
||||||
style_processor, preset_path: str = "presets/docx_presets.json"):
|
|
||||||
self.html_soup = html_soup
|
|
||||||
self.body_tag = html_soup.body
|
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.preset = json.load(open(preset_path))
|
self.html_soup = html_soup
|
||||||
self.style_processor = style_processor
|
self.body_tag = self.html_soup.body
|
||||||
self.name2action = {
|
self.html_preprocessor = html_preprocessor
|
||||||
"wrapper": self._wrap_tag,
|
self.style_preprocessor = style_preprocessor
|
||||||
"decomposer": self._decompose_tag,
|
|
||||||
"replacer": self._replace_tag,
|
|
||||||
"attr_replacer": self._replace_attr,
|
|
||||||
"unwrapper": self._unwrap_tag
|
|
||||||
}
|
|
||||||
|
|
||||||
def _process_toc_links(self):
|
def _process_toc_links(self):
|
||||||
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
|
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
|
||||||
@@ -59,84 +50,6 @@ class HTMLDocxProcessor:
|
|||||||
f"Check the structure of the file."
|
f"Check the structure of the file."
|
||||||
f"Tag name: {tag.name}")
|
f"Tag name: {tag.name}")
|
||||||
|
|
||||||
def _wrap_tag(self, **kwargs):
|
|
||||||
kwargs["tag"].wrap(self.html_soup.new_tag(kwargs["rule"]["tag_to_wrap"]))
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _decompose_tag(**kwargs):
|
|
||||||
kwargs["tag"].decompose()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _replace_tag(**kwargs):
|
|
||||||
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
|
|
||||||
kwargs["tag"].name = tag_to_replace
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _replace_attr(**kwargs):
|
|
||||||
attr, attr_value =\
|
|
||||||
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
|
|
||||||
attr_to_replace, attr_value_to_replace =\
|
|
||||||
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
|
|
||||||
if attr_to_replace:
|
|
||||||
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
|
|
||||||
if attr_value_to_replace:
|
|
||||||
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
|
|
||||||
del kwargs["tag"][attr]
|
|
||||||
elif attr_value_to_replace:
|
|
||||||
kwargs["tag"].attrs[attr] = attr_value_to_replace
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _unwrap_tag(**kwargs):
|
|
||||||
kwargs["tag"].unwrap()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _process_tags(body_tag: Tag,
|
|
||||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
|
|
||||||
action):
|
|
||||||
"""
|
|
||||||
Function do action with tags
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
body_tag: Tag
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
action: function
|
|
||||||
action what to do with tag
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Body Tag with processed certain tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
for rule in rules:
|
|
||||||
tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
|
|
||||||
if rule["condition"]:
|
|
||||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
|
||||||
if condition_on_tag[0] == "parent_tags":
|
|
||||||
for tag in body_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
|
|
||||||
for tag in tags])):
|
|
||||||
tag.parent.attrs.update(tag.attrs)
|
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
|
||||||
elif condition_on_tag[0] == "child_tags":
|
|
||||||
for tag in body_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
|
|
||||||
for tag in tags])):
|
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
|
||||||
elif condition_on_tag[0] == "attrs":
|
|
||||||
for attr in rule["condition"]["attrs"]:
|
|
||||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
|
||||||
# attr replacer
|
|
||||||
elif condition_on_tag[0] == "tags":
|
|
||||||
attr = rule["attr"]
|
|
||||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr['name']: re.compile(fr"{attr['value']}")}):
|
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
|
||||||
else:
|
|
||||||
for tag in body_tag.find_all([re.compile(tag) for tag in tags]):
|
|
||||||
action(body_tag=body_tag, tag=tag, rule=rule)
|
|
||||||
|
|
||||||
def _process_quotes(self):
|
def _process_quotes(self):
|
||||||
"""
|
"""
|
||||||
Function to process block quotes.
|
Function to process block quotes.
|
||||||
@@ -175,14 +88,6 @@ class HTMLDocxProcessor:
|
|||||||
|
|
||||||
table.replaceWith(new_div)
|
table.replaceWith(new_div)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_pt_to_px(value: float) -> float:
|
|
||||||
value = float(value)
|
|
||||||
if value == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE:
|
|
||||||
return LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE
|
|
||||||
else:
|
|
||||||
return value
|
|
||||||
|
|
||||||
def _process_tables(self):
|
def _process_tables(self):
|
||||||
"""Function to process tables. Set "border" attribute."""
|
"""Function to process tables. Set "border" attribute."""
|
||||||
tables = self.body_tag.find_all("table")
|
tables = self.body_tag.find_all("table")
|
||||||
@@ -197,7 +102,10 @@ class HTMLDocxProcessor:
|
|||||||
size = match.group(1)
|
size = match.group(1)
|
||||||
units = match.group(2)
|
units = match.group(2)
|
||||||
if units == "pt":
|
if units == "pt":
|
||||||
size = self.convert_pt_to_px(size)
|
value = LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE\
|
||||||
|
if float(size) == LiveCartaConfig.WORD_DEFAULT_FONT_SIZE\
|
||||||
|
else float(size)
|
||||||
|
size = value
|
||||||
sizes.append(float(size))
|
sizes.append(float(size))
|
||||||
width = td.get("width")
|
width = td.get("width")
|
||||||
td.attrs = {}
|
td.attrs = {}
|
||||||
@@ -392,14 +300,13 @@ class HTMLDocxProcessor:
|
|||||||
self.logger.log(f"Processing TOC and headers.")
|
self.logger.log(f"Processing TOC and headers.")
|
||||||
self._process_toc_links()
|
self._process_toc_links()
|
||||||
|
|
||||||
for rule in self.preset:
|
_preprocess_html(html_preprocessor=self.html_preprocessor,
|
||||||
self.logger.log(rule["preset_name"].title() + " process.")
|
html_soup=self.html_soup)
|
||||||
action = self.name2action[rule["preset_name"]]
|
|
||||||
self._process_tags(self.body_tag, rule["rules"], action)
|
|
||||||
|
|
||||||
# CSS after html processing cause of <fonts> that aren't supported by html
|
# CSS after html processing cause of <fonts> that aren't supported by html
|
||||||
self.logger.log("CSS inline style preprocessing.")
|
self.logger.log("CSS inline style preprocessing.")
|
||||||
self.style_processor.process_inline_styles_in_html_soup(self.body_tag)
|
self.style_preprocessor.process_inline_styles_in_html_soup(
|
||||||
|
self.body_tag)
|
||||||
|
|
||||||
self.logger.log("CSS inline style processing.")
|
self.logger.log("CSS inline style processing.")
|
||||||
modify_html_soup_with_css_styles(self.body_tag)
|
modify_html_soup_with_css_styles(self.body_tag)
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from src.util.helpers import BookLogger
|
|||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
from src.epub_converter.html_epub_processor import HTMLEpubProcessor
|
||||||
from src.epub_converter.image_processing import update_images_src_links
|
from src.epub_converter.image_processing import update_images_src_links
|
||||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||||
@@ -21,7 +21,7 @@ from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
|||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
||||||
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
|
style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None):
|
||||||
self.book_path = book_path
|
self.book_path = book_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
|
|||||||
@@ -30,13 +30,16 @@ class EpubBook(BookSolver):
|
|||||||
json for LiveCarta platform
|
json for LiveCarta platform
|
||||||
|
|
||||||
"""
|
"""
|
||||||
style_processor = StylePreprocessor()
|
html_preprocessor = HtmlPreprocessor(
|
||||||
html_processor = HtmlEpubProcessor(
|
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
||||||
logger=self.logger_object)
|
style_preprocessor = StylePreprocessor()
|
||||||
|
html_processor = HTMLEpubProcessor(logger=self.logger_object,
|
||||||
|
html_preprocessor=html_preprocessor)
|
||||||
json_converter = EpubConverter(
|
json_converter = EpubConverter(
|
||||||
self.book_path, access=self.access, logger=self.logger_object,
|
self.book_path, access=self.access, logger=self.logger_object,
|
||||||
style_processor=style_processor, html_processor=html_processor)
|
style_processor=style_preprocessor, html_processor=html_processor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,58 +1,16 @@
|
|||||||
import re
|
import re
|
||||||
import json
|
from typing import Union
|
||||||
from typing import List, Dict, Union
|
|
||||||
from bs4.element import PageElement
|
from bs4.element import PageElement
|
||||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||||
|
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
|
from src.html_preprocessor import _preprocess_html
|
||||||
|
|
||||||
|
|
||||||
class HtmlEpubProcessor:
|
class HTMLEpubProcessor:
|
||||||
def __init__(self, preset_path: str = "presets/epub_presets.json", logger: BookLogger = None):
|
def __init__(self, logger: BookLogger = None, html_preprocessor=None):
|
||||||
self.preset = json.load(open(preset_path))
|
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.name2action = {
|
self.html_preprocessor = html_preprocessor
|
||||||
"table_wrapper": self._process_tag_using_table,
|
|
||||||
"replacer": self._replace_tag,
|
|
||||||
"attr_replacer": self._replace_attr,
|
|
||||||
"unwrapper": self._unwrap_tag,
|
|
||||||
"inserter": self._insert_tag
|
|
||||||
}
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
|
|
||||||
chapter_tag: BeautifulSoup):
|
|
||||||
"""
|
|
||||||
Function adds span with id from tag_to_be_removed
|
|
||||||
because this tag will be removed(unwrapped/extract)
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
tag_to_be_removed: Union[PageElement, BeautifulSoup]
|
|
||||||
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
updated body tag
|
|
||||||
|
|
||||||
"""
|
|
||||||
def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup,
|
|
||||||
tag_to_be_removed: Tag,
|
|
||||||
id_: str,
|
|
||||||
class_: Union[List[str], str]):
|
|
||||||
"""Function inserts span before tag aren't supported by LiveCarta"""
|
|
||||||
new_tag: Tag = chapter_tag.new_tag("span")
|
|
||||||
new_tag.attrs["id"] = id_ or ""
|
|
||||||
new_tag.attrs["class"] = class_ or ""
|
|
||||||
new_tag.string = "\xa0"
|
|
||||||
tag_to_be_removed.insert_before(new_tag)
|
|
||||||
|
|
||||||
if tag_to_be_removed.attrs.get("id"):
|
|
||||||
_insert_span_with_attrs_before_tag(chapter_tag=chapter_tag,
|
|
||||||
tag_to_be_removed=tag_to_be_removed,
|
|
||||||
id_=tag_to_be_removed.attrs["id"],
|
|
||||||
class_=tag_to_be_removed.attrs.get("class"))
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def prepare_title(title_of_chapter: str) -> str:
|
def prepare_title(title_of_chapter: str) -> str:
|
||||||
@@ -116,111 +74,6 @@ class HtmlEpubProcessor:
|
|||||||
p_tag.append(str(node))
|
p_tag.append(str(node))
|
||||||
node.replace_with(p_tag)
|
node.replace_with(p_tag)
|
||||||
|
|
||||||
def _process_tag_using_table(self, **kwargs):
|
|
||||||
def _wrap_tag_with_table(width: str = "100", border: str = "", bg_color: str = None) -> Tag:
|
|
||||||
table = kwargs["chapter_tag"].new_tag("table")
|
|
||||||
table.attrs["border"], table.attrs["align"], table.attrs["style"] \
|
|
||||||
= border, "center", f"width:{width}%;"
|
|
||||||
tbody, tr, td = \
|
|
||||||
kwargs["chapter_tag"].new_tag("tbody"), kwargs["chapter_tag"].new_tag(
|
|
||||||
"tr"), kwargs["chapter_tag"].new_tag("td")
|
|
||||||
td.attrs["bgcolor"] = bg_color
|
|
||||||
kwargs["tag"].wrap(td)
|
|
||||||
td.wrap(tr)
|
|
||||||
tr.wrap(tbody)
|
|
||||||
tbody.wrap(table)
|
|
||||||
table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))
|
|
||||||
return table
|
|
||||||
_wrap_tag_with_table(
|
|
||||||
width=kwargs["tag"].attrs["width"] if kwargs["tag"].attrs.get(
|
|
||||||
"width") else "100",
|
|
||||||
border=kwargs["tag"].attrs["border"] if kwargs["tag"].attrs.get(
|
|
||||||
"border") else None,
|
|
||||||
bg_color=kwargs["tag"].attrs["bgcolor"] if kwargs["tag"].attrs.get("bgcolor") else None)
|
|
||||||
self._add_span_to_save_ids_for_links(kwargs["tag"], kwargs["chapter_tag"])
|
|
||||||
kwargs["tag"].unwrap()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _replace_tag(**kwargs):
|
|
||||||
tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
|
|
||||||
kwargs["tag"].name = tag_to_replace
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _replace_attr(**kwargs):
|
|
||||||
attr, attr_value =\
|
|
||||||
kwargs["rule"]["attr"]["name"], kwargs["rule"]["attr"]["value"]
|
|
||||||
attr_to_replace, attr_value_to_replace =\
|
|
||||||
kwargs["rule"]["attr_to_replace"]["name"], kwargs["rule"]["attr_to_replace"]["value"]
|
|
||||||
if attr_to_replace:
|
|
||||||
kwargs["tag"][attr_to_replace] = kwargs["tag"][attr]
|
|
||||||
if attr_value_to_replace:
|
|
||||||
kwargs["tag"].attrs[attr_to_replace] = attr_value_to_replace
|
|
||||||
del kwargs["tag"][attr]
|
|
||||||
elif attr_value_to_replace:
|
|
||||||
kwargs["tag"].attrs[attr] = attr_value_to_replace
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _unwrap_tag(**kwargs):
|
|
||||||
kwargs["tag"].unwrap()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _insert_tag(**kwargs):
|
|
||||||
tag_to_insert = \
|
|
||||||
kwargs["chapter_tag"].new_tag(kwargs["rule"]["tag_to_insert"])
|
|
||||||
# insert all items that was in tag to subtag and remove from tag
|
|
||||||
for content in reversed(kwargs["tag"].contents):
|
|
||||||
tag_to_insert.insert(0, content.extract())
|
|
||||||
# wrap subtag with items
|
|
||||||
kwargs["tag"].append(tag_to_insert)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _process_tags(chapter_tag: BeautifulSoup,
|
|
||||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
|
|
||||||
action):
|
|
||||||
"""
|
|
||||||
Function do action with tags
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
|
|
||||||
list of conditions when fire function
|
|
||||||
action: function
|
|
||||||
action what to do with tag
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Body Tag with processed certain tags
|
|
||||||
|
|
||||||
"""
|
|
||||||
for rule in rules:
|
|
||||||
tags: List[str] = rule["tags"] if rule.get("tags") else rule["condition"]["tags"]
|
|
||||||
if rule["condition"]:
|
|
||||||
for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v):
|
|
||||||
if condition_on_tag[0] == "parent_tags":
|
|
||||||
for tag in chapter_tag.select(', '.join([condition_on_tag[1] + " > " + re.sub(r"[\^$]", "", tag)
|
|
||||||
for tag in tags])):
|
|
||||||
tag.parent.attrs.update(tag.attrs)
|
|
||||||
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
|
||||||
elif condition_on_tag[0] == "child_tags":
|
|
||||||
for tag in chapter_tag.select(', '.join([re.sub(r"[\^$]", "", tag) + condition_on_tag[1]
|
|
||||||
for tag in tags])):
|
|
||||||
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
|
||||||
elif condition_on_tag[0] == "attrs":
|
|
||||||
for attr in rule["condition"]["attrs"]:
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr["name"]: re.compile(fr"{attr['value']}")}):
|
|
||||||
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
|
||||||
# attr replacer
|
|
||||||
elif condition_on_tag[0] == "tags":
|
|
||||||
attr = rule["attr"]
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags],
|
|
||||||
{attr['name']: re.compile(fr"{attr['value']}")}):
|
|
||||||
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
|
||||||
else:
|
|
||||||
for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]):
|
|
||||||
action(chapter_tag=chapter_tag, tag=tag, rule=rule)
|
|
||||||
|
|
||||||
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
def _remove_headings_content(self, chapter_tag: Union[BeautifulSoup, PageElement], title_of_chapter: str):
|
||||||
"""
|
"""
|
||||||
Function
|
Function
|
||||||
@@ -250,7 +103,8 @@ class HtmlEpubProcessor:
|
|||||||
if title_of_chapter == text or \
|
if title_of_chapter == text or \
|
||||||
(title_of_chapter in text and
|
(title_of_chapter in text and
|
||||||
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
|
re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)):
|
||||||
self._add_span_to_save_ids_for_links(tag, chapter_tag)
|
self.html_preprocessor._add_span_to_save_ids_for_links(
|
||||||
|
tag, chapter_tag)
|
||||||
tag.extract()
|
tag.extract()
|
||||||
return
|
return
|
||||||
elif not self._remove_headings_content(tag, title_of_chapter):
|
elif not self._remove_headings_content(tag, title_of_chapter):
|
||||||
@@ -350,9 +204,8 @@ class HtmlEpubProcessor:
|
|||||||
# 2.
|
# 2.
|
||||||
self._wrap_strings_with_p(chapter_tag)
|
self._wrap_strings_with_p(chapter_tag)
|
||||||
# 3-6.
|
# 3-6.
|
||||||
for rule in self.preset:
|
_preprocess_html(
|
||||||
action = self.name2action[rule["preset_name"]]
|
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
|
||||||
self._process_tags(chapter_tag, rule["rules"], action)
|
|
||||||
# 7.
|
# 7.
|
||||||
if remove_title_from_chapter:
|
if remove_title_from_chapter:
|
||||||
self._remove_headings_content(chapter_tag, title_str)
|
self._remove_headings_content(chapter_tag, title_str)
|
||||||
|
|||||||
179
src/html_preprocessor.py
Normal file
179
src/html_preprocessor.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
import re
|
||||||
|
import json
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
from bs4.element import PageElement
|
||||||
|
from typing import List, Dict, Union
|
||||||
|
|
||||||
|
from src.util.helpers import BookLogger
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlPreprocessor:
    """
    Preset-driven tag rewriter used by both the Docx and the Epub HTML processors.

    The preset is a JSON list; every entry carries a ``preset_name`` (a key of
    ``name2action``) and a list of ``rules`` that is handed to ``_process_tags``.
    Every action is called by ``_process_tags`` with the keyword arguments
    ``body_tag`` (the soup being processed), ``tag`` (the matched tag) and
    ``rule`` (the rule that matched).
    """

    def __init__(self, logger: BookLogger, preset_path):
        """
        Load the preset file and build the preset-name -> action table.
        Parameters
        ----------
        logger: BookLogger
            stored as ``self.logger``
        preset_path
            path to the JSON preset file

        """
        # The context manager closes the file handle as soon as the preset is
        # parsed instead of leaving that to the garbage collector.
        with open(preset_path) as preset_file:
            self.preset = json.load(preset_file)
        self.logger = logger
        self.name2action = {
            "wrapper": self._wrap_tag,
            "table_wrapper": self._process_tag_using_table,
            "decomposer": self._decompose_tag,
            "replacer": self._replace_tag,
            "attr_replacer": self._replace_attr,
            "unwrapper": self._unwrap_tag,
            "inserter": self._insert_tag
        }

    @staticmethod
    def _wrap_tag(**kwargs):
        """Wrap the matched tag into a new ``rule["tag_to_wrap"]`` tag."""
        wrapper = kwargs["body_tag"].new_tag(kwargs["rule"]["tag_to_wrap"])
        kwargs["tag"].wrap(wrapper)

    @staticmethod
    def _decompose_tag(**kwargs):
        """Remove the matched tag by calling its ``decompose()``."""
        kwargs["tag"].decompose()

    @staticmethod
    def _add_span_to_save_ids_for_links(tag_to_be_removed: Union[PageElement, BeautifulSoup],
                                        chapter_tag: BeautifulSoup):
        """
        Function adds span with id from tag_to_be_removed
        because this tag will be removed(unwrapped/extract)
        Parameters
        ----------
        tag_to_be_removed: Union[PageElement, BeautifulSoup]
            tag whose ``id`` (and ``class``) are copied onto the new span
        chapter_tag: BeautifulSoup
            soup used to create the new span

        Returns
        -------
        None
            the span is inserted right before tag_to_be_removed

        """
        tag_id = tag_to_be_removed.attrs.get("id")
        if not tag_id:
            # nothing can link to a tag without an id, so no anchor is needed
            return
        anchor: Tag = chapter_tag.new_tag("span")
        anchor.attrs["id"] = tag_id
        anchor.attrs["class"] = tag_to_be_removed.attrs.get("class") or ""
        anchor.string = "\xa0"
        tag_to_be_removed.insert_before(anchor)

    def _process_tag_using_table(self, **kwargs):
        """
        Wrap the matched tag into table > tbody > tr > td, then unwrap the tag itself.

        ``width``, ``border`` and ``bgcolor`` of the matched tag are carried over
        to the table / cell; a span keeping the tag's id is inserted before the
        tag is unwrapped.
        """
        soup, tag = kwargs["body_tag"], kwargs["tag"]
        width = tag.attrs.get("width") or "100"
        border = tag.attrs.get("border") or None
        bg_color = tag.attrs.get("bgcolor") or None

        table = soup.new_tag("table")
        table.attrs["border"] = border
        table.attrs["align"] = "center"
        table.attrs["style"] = f"width:{width}%;"
        tbody, tr, td = soup.new_tag("tbody"), soup.new_tag("tr"), soup.new_tag("td")
        td.attrs["bgcolor"] = bg_color
        # build the nesting from the inside out
        tag.wrap(td)
        td.wrap(tr)
        tr.wrap(tbody)
        tbody.wrap(table)
        table.insert_after(BeautifulSoup(features="lxml").new_tag("br"))

        self._add_span_to_save_ids_for_links(tag, soup)
        tag.unwrap()

    @staticmethod
    def _replace_tag(**kwargs):
        """Rename the matched tag to ``rule["tag_to_replace"]``."""
        tag_to_replace: str = kwargs["rule"]["tag_to_replace"]
        kwargs["tag"].name = tag_to_replace

    @staticmethod
    def _replace_attr(**kwargs):
        """
        Rename an attribute of the matched tag and/or replace its value.

        ``rule["attr"]["name"]`` is the attribute to change;
        ``rule["attr_to_replace"]`` gives the new ``name`` and/or ``value``
        (an empty entry leaves that part unchanged).
        """
        tag, rule = kwargs["tag"], kwargs["rule"]
        attr = rule["attr"]["name"]
        attr_to_replace = rule["attr_to_replace"]["name"]
        attr_value_to_replace = rule["attr_to_replace"]["value"]
        if attr_to_replace:
            tag[attr_to_replace] = tag[attr]
            if attr_value_to_replace:
                tag.attrs[attr_to_replace] = attr_value_to_replace
            # when old and new names coincide, deleting would drop the
            # attribute that was just written
            if attr_to_replace != attr:
                del tag[attr]
        elif attr_value_to_replace:
            tag.attrs[attr] = attr_value_to_replace

    @staticmethod
    def _unwrap_tag(**kwargs):
        """Replace the matched tag with its own contents (``unwrap()``)."""
        kwargs["tag"].unwrap()

    @staticmethod
    def _insert_tag(**kwargs):
        """
        Move all contents of the matched tag into a new ``rule["tag_to_insert"]``
        child tag.

        The soup is read from ``body_tag`` (the keyword ``_process_tags`` passes);
        the older ``chapter_tag`` keyword is still accepted as a fallback.
        """
        soup = kwargs["body_tag"] if "body_tag" in kwargs else kwargs["chapter_tag"]
        tag_to_insert = soup.new_tag(kwargs["rule"]["tag_to_insert"])
        # insert all items that was in tag to subtag and remove from tag;
        # iterate over a copy because extract() mutates tag.contents
        for content in reversed(list(kwargs["tag"].contents)):
            tag_to_insert.insert(0, content.extract())
        # wrap subtag with items
        kwargs["tag"].append(tag_to_insert)

    @staticmethod
    def _process_tags(body_tag: BeautifulSoup,
                      rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]],
                      action):
        """
        Function does action with tags
        Parameters
        ----------
        body_tag: BeautifulSoup
            Tag & contents of the body tag
        rules: List[Dict[str, Union[List[str], str, Dict[str, Union[List[Dict[str, str]], int, str]]]]]
            list of conditions when fire function
        action: function
            action what to do with tag; called as
            action(body_tag=..., tag=..., rule=...)
        Returns
        -------
        None
            body_tag is modified in place

        """
        for rule in rules:
            tags: List[str] = rule["tags"] if rule.get(
                "tags") else rule["condition"]["tags"]
            condition = rule["condition"]
            if not condition:
                # unconditional rule: every tag whose name matches
                for tag in body_tag.find_all([re.compile(name) for name in tags]):
                    action(body_tag=body_tag, tag=tag, rule=rule)
                continue
            for kind, value in condition.items():
                if not value:
                    continue
                if kind == "parent_tags":
                    # tag names are regexes elsewhere; strip the anchors to
                    # get a plain name usable in a CSS selector
                    selector = ', '.join([value + " > " + re.sub(r"[\^$]", "", name)
                                          for name in tags])
                    for tag in body_tag.select(selector):
                        tag.parent.attrs.update(tag.attrs)
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif kind == "child_tags":
                    selector = ', '.join([re.sub(r"[\^$]", "", name) + value
                                          for name in tags])
                    for tag in body_tag.select(selector):
                        action(body_tag=body_tag, tag=tag, rule=rule)
                elif kind == "attrs":
                    name_patterns = [re.compile(name) for name in tags]
                    for attr in condition["attrs"]:
                        for tag in body_tag.find_all(name_patterns,
                                                     {attr["name"]: re.compile(str(attr["value"]))}):
                            action(body_tag=body_tag, tag=tag, rule=rule)
                # attr replacer
                elif kind == "tags":
                    attr = rule["attr"]
                    for tag in body_tag.find_all([re.compile(name) for name in tags],
                                                 {attr["name"]: re.compile(str(attr["value"]))}):
                        action(body_tag=body_tag, tag=tag, rule=rule)
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_html(html_preprocessor: HtmlPreprocessor, html_soup: BeautifulSoup):
    """
    Apply every entry of the preprocessor's preset to html_soup.
    Parameters
    ----------
    html_preprocessor: HtmlPreprocessor
        provides ``preset``, ``name2action`` and ``_process_tags``
    html_soup: BeautifulSoup
        tree handed to ``_process_tags`` for each preset entry

    Returns
    -------
    None

    """
    for preset_entry in html_preprocessor.preset:
        # the entry's "preset_name" picks the handler, its "rules" say where to apply it
        handler = html_preprocessor.name2action[preset_entry["preset_name"]]
        html_preprocessor._process_tags(html_soup, preset_entry["rules"], handler)
|
||||||
Reference in New Issue
Block a user