forked from LiveCarta/BookConverter
Annot.[HTML->Html, _]
This commit is contained in:
@@ -7,11 +7,10 @@ from typing import Union
|
|||||||
from threading import Event
|
from threading import Event
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
|
|
||||||
|
|
||||||
class Docx2LibreHTML:
|
class Docx2LibreHtml:
|
||||||
def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
|
def __init__(self, book_id: int = 0, file_path: Union[pathlib.PosixPath, str] = None,
|
||||||
access=None, logger: BookLogger = None, libre_locker: Event = None):
|
access=None, logger: BookLogger = None, libre_locker: Event = None):
|
||||||
self.book_id = book_id if book_id != 0 else pathlib.Path(
|
self.book_id = book_id if book_id != 0 else pathlib.Path(
|
||||||
|
|||||||
@@ -7,9 +7,9 @@ from src.book_solver import BookSolver
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.html_preprocessor import HtmlPreprocessor
|
from src.html_preprocessor import HtmlPreprocessor
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.docx_converter.docx2libre_html import Docx2LibreHTML
|
from src.docx_converter.docx2libre_html import Docx2LibreHtml
|
||||||
from src.docx_converter.html_docx_processor import HTMLDocxProcessor
|
from src.docx_converter.html_docx_processor import HtmlDocxProcessor
|
||||||
from src.docx_converter.libre_html2json_converter import LibreHTML2JSONConverter
|
from src.docx_converter.libre_html2json_converter import LibreHtml2JsonConverter
|
||||||
|
|
||||||
|
|
||||||
class DocxBook(BookSolver):
|
class DocxBook(BookSolver):
|
||||||
@@ -38,7 +38,7 @@ class DocxBook(BookSolver):
|
|||||||
"""
|
"""
|
||||||
# 1. Converts docx to html with LibreOffice
|
# 1. Converts docx to html with LibreOffice
|
||||||
try:
|
try:
|
||||||
html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access,
|
html_converter = Docx2LibreHtml(self.book_id, self.book_path, self.access,
|
||||||
self.logger_object, self.libre_locker)
|
self.logger_object, self.libre_locker)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
self.logger_object.log(
|
self.logger_object.log(
|
||||||
@@ -52,7 +52,7 @@ class DocxBook(BookSolver):
|
|||||||
html_preprocessor = HtmlPreprocessor(
|
html_preprocessor = HtmlPreprocessor(
|
||||||
logger=self.logger_object, preset_path="presets/docx_presets.json")
|
logger=self.logger_object, preset_path="presets/docx_presets.json")
|
||||||
style_preprocessor = StylePreprocessor()
|
style_preprocessor = StylePreprocessor()
|
||||||
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup,
|
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup,
|
||||||
logger=self.logger_object,
|
logger=self.logger_object,
|
||||||
html_preprocessor=html_preprocessor,
|
html_preprocessor=html_preprocessor,
|
||||||
style_preprocessor=style_preprocessor)
|
style_preprocessor=style_preprocessor)
|
||||||
@@ -67,7 +67,7 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
# 3. Parses from line structure to nested structure with JSONConverter
|
# 3. Parses from line structure to nested structure with JSONConverter
|
||||||
try:
|
try:
|
||||||
json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers,
|
json_converter = LibreHtml2JsonConverter(bs_tags, footnotes, top_level_headers,
|
||||||
self.logger_object)
|
self.logger_object)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
@@ -86,18 +86,18 @@ if __name__ == "__main__":
|
|||||||
locker = Event()
|
locker = Event()
|
||||||
locker.set()
|
locker.set()
|
||||||
|
|
||||||
html_converter = Docx2LibreHTML(file_path=docx_file_path,
|
html_converter = Docx2LibreHtml(file_path=docx_file_path,
|
||||||
logger=logger_object, libre_locker=locker)
|
logger=logger_object, libre_locker=locker)
|
||||||
|
|
||||||
html_preprocessor = HtmlPreprocessor(
|
html_preprocessor = HtmlPreprocessor(
|
||||||
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
logger=logger_object, preset_path="../../presets/docx_presets.json")
|
||||||
style_preprocessor = StylePreprocessor()
|
style_preprocessor = StylePreprocessor()
|
||||||
html_processor = HTMLDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
html_processor = HtmlDocxProcessor(html_soup=html_converter.html_soup, logger=logger_object,
|
||||||
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
|
html_preprocessor=html_preprocessor, style_preprocessor=style_preprocessor)
|
||||||
content, footnotes, top_level_headers = html_processor.process_html(
|
content, footnotes, top_level_headers = html_processor.process_html(
|
||||||
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
html_path=html_converter.html_path, book_id=html_converter.book_id)
|
||||||
|
|
||||||
json_converter = LibreHTML2JSONConverter(
|
json_converter = LibreHtml2JsonConverter(
|
||||||
content, footnotes, top_level_headers, logger_object)
|
content, footnotes, top_level_headers, logger_object)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from typing import List
|
|||||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||||
|
|
||||||
|
|
||||||
def _clean_footnote_content(content: str) -> str:
|
def clean_footnote_content(content: str) -> str:
|
||||||
content = content.strip()
|
content = content.strip()
|
||||||
return content.strip()
|
return content.strip()
|
||||||
|
|
||||||
@@ -66,7 +66,7 @@ def process_footnotes(body_tag: Tag) -> List[str]:
|
|||||||
else:
|
else:
|
||||||
unicode_string += child.decode_contents()
|
unicode_string += child.decode_contents()
|
||||||
|
|
||||||
content = _clean_footnote_content(unicode_string)
|
content = clean_footnote_content(unicode_string)
|
||||||
cont_tag.decompose()
|
cont_tag.decompose()
|
||||||
footnotes.append(content)
|
footnotes.append(content)
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from src.docx_converter.footnotes_processing import process_footnotes
|
|||||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocxProcessor:
|
class HtmlDocxProcessor:
|
||||||
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
|
def __init__(self, logger: BookLogger, html_soup: BeautifulSoup, html_preprocessor, style_preprocessor):
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.html_soup = html_soup
|
self.html_soup = html_soup
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from bs4 import Tag
|
|||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
|
|
||||||
class LibreHTML2JSONConverter:
|
class LibreHtml2JsonConverter:
|
||||||
def __init__(self, content: List[Tag], footnotes: List[str], top_level_headers: List[Dict[str, Union[str, bool]]],
|
def __init__(self, content: List[Tag], footnotes: List[str], top_level_headers: List[Dict[str, Union[str, bool]]],
|
||||||
logger_object, book_api_status=None):
|
logger_object, book_api_status=None):
|
||||||
self.content_dict = None
|
self.content_dict = None
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from src.util.helpers import BookLogger
|
|||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HTMLEpubProcessor
|
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||||
from src.epub_converter.image_processing import update_images_src_links
|
from src.epub_converter.image_processing import update_images_src_links
|
||||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||||
@@ -21,7 +21,7 @@ from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
|||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
||||||
style_processor: StylePreprocessor = None, html_processor: HTMLEpubProcessor = None):
|
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
|
||||||
self.book_path = book_path
|
self.book_path = book_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from src.book_solver import BookSolver
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.html_preprocessor import HtmlPreprocessor
|
from src.html_preprocessor import HtmlPreprocessor
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_preprocessor import StylePreprocessor
|
||||||
from src.epub_converter.html_epub_processor import HTMLEpubProcessor
|
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||||
from src.epub_converter.epub_converter import EpubConverter
|
from src.epub_converter.epub_converter import EpubConverter
|
||||||
|
|
||||||
|
|
||||||
@@ -33,7 +33,7 @@ class EpubBook(BookSolver):
|
|||||||
html_preprocessor = HtmlPreprocessor(
|
html_preprocessor = HtmlPreprocessor(
|
||||||
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
||||||
style_preprocessor = StylePreprocessor()
|
style_preprocessor = StylePreprocessor()
|
||||||
html_processor = HTMLEpubProcessor(logger=self.logger_object,
|
html_processor = HtmlEpubProcessor(logger=self.logger_object,
|
||||||
html_preprocessor=html_preprocessor)
|
html_preprocessor=html_preprocessor)
|
||||||
json_converter = EpubConverter(
|
json_converter = EpubConverter(
|
||||||
self.book_path, access=self.access, logger=self.logger_object,
|
self.book_path, access=self.access, logger=self.logger_object,
|
||||||
@@ -51,7 +51,7 @@ if __name__ == "__main__":
|
|||||||
html_preprocessor = HtmlPreprocessor(
|
html_preprocessor = HtmlPreprocessor(
|
||||||
logger=logger_object, preset_path="../../presets/epub_presets.json")
|
logger=logger_object, preset_path="../../presets/epub_presets.json")
|
||||||
style_preprocessor = StylePreprocessor()
|
style_preprocessor = StylePreprocessor()
|
||||||
html_processor = HTMLEpubProcessor(logger=logger_object,
|
html_processor = HtmlEpubProcessor(logger=logger_object,
|
||||||
html_preprocessor=html_preprocessor)
|
html_preprocessor=html_preprocessor)
|
||||||
|
|
||||||
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
json_converter = EpubConverter(epub_file_path, logger=logger_object,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from typing import List, Tuple
|
|||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
|
||||||
def _replace_with_livecarta_anchor_tag(anchor, i):
|
def replace_with_livecarta_anchor_tag(anchor, i):
|
||||||
"""Function replace noteref_tag(anchor) with new livecarta tag"""
|
"""Function replace noteref_tag(anchor) with new livecarta tag"""
|
||||||
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
|
new_tag = BeautifulSoup(features="lxml").new_tag("sup")
|
||||||
new_tag["class"] = "footnote-element"
|
new_tag["class"] = "footnote-element"
|
||||||
@@ -75,7 +75,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "docs-endnote":
|
if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "docs-endnote":
|
||||||
footnote_tag = footnote_tag.parent
|
footnote_tag = footnote_tag.parent
|
||||||
new_noterefs_tags.append(
|
new_noterefs_tags.append(
|
||||||
_replace_with_livecarta_anchor_tag(noteref_tag, i))
|
replace_with_livecarta_anchor_tag(noteref_tag, i))
|
||||||
content = footnote_tag.text
|
content = footnote_tag.text
|
||||||
# footnote_tag.decompose()
|
# footnote_tag.decompose()
|
||||||
footnotes.append(content)
|
footnotes.append(content)
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from src.util.helpers import BookLogger
|
|||||||
from src.html_preprocessor import _preprocess_html
|
from src.html_preprocessor import _preprocess_html
|
||||||
|
|
||||||
|
|
||||||
class HTMLEpubProcessor:
|
class HtmlEpubProcessor:
|
||||||
def __init__(self, logger: BookLogger = None, html_preprocessor=None):
|
def __init__(self, logger: BookLogger = None, html_preprocessor=None):
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.html_preprocessor = html_preprocessor
|
self.html_preprocessor = html_preprocessor
|
||||||
|
|||||||
Reference in New Issue
Block a user