forked from LiveCarta/BookConverter
Remove processing of tables
This commit is contained in:
@@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
|||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
from src.data_objects import ChapterItem, NavPoint
|
from src.data_objects import ChapterItem, NavPoint
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_reader import StyleReader
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||||
from src.epub_converter.image_processing import update_images_src_links
|
from src.epub_converter.image_processing import update_images_src_links
|
||||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
from src.inline_style_processor import modify_html_soup_with_css_styles
|
||||||
|
|
||||||
|
|
||||||
class EpubConverter:
|
class EpubConverter:
|
||||||
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
||||||
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
|
style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None):
|
||||||
self.book_path = book_path
|
self.book_path = book_path
|
||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
@@ -57,13 +57,6 @@ class EpubConverter:
|
|||||||
self.noterefs: List[Tag] = [] # start of the footnote
|
self.noterefs: List[Tag] = [] # start of the footnote
|
||||||
self.footnotes: List[Tag] = [] # end of the footnote
|
self.footnotes: List[Tag] = [] # end of the footnote
|
||||||
|
|
||||||
self.logger.log("Image processing.")
|
|
||||||
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
|
||||||
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
|
||||||
file_name = x.file_name
|
|
||||||
content = x.content
|
|
||||||
self.img_href2img_bytes[file_name] = content
|
|
||||||
|
|
||||||
self.logger.log("HTML files reading.")
|
self.logger.log("HTML files reading.")
|
||||||
self.html_href2html_body_soup: Dict[str,
|
self.html_href2html_body_soup: Dict[str,
|
||||||
BeautifulSoup] = self.build_href2soup_content()
|
BeautifulSoup] = self.build_href2soup_content()
|
||||||
@@ -76,6 +69,13 @@ class EpubConverter:
|
|||||||
self.logger.log("CSS styles fusion(inline+file).")
|
self.logger.log("CSS styles fusion(inline+file).")
|
||||||
self.add_css_styles_to_html_soup()
|
self.add_css_styles_to_html_soup()
|
||||||
|
|
||||||
|
self.logger.log("Image processing.")
|
||||||
|
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
||||||
|
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
||||||
|
file_name = x.file_name
|
||||||
|
content = x.content
|
||||||
|
self.img_href2img_bytes[file_name] = content
|
||||||
|
|
||||||
self.logger.log("Footnotes processing.")
|
self.logger.log("Footnotes processing.")
|
||||||
for href in self.html_href2html_body_soup:
|
for href in self.html_href2html_body_soup:
|
||||||
self.footnotes_contents, self.noterefs, self.footnotes =\
|
self.footnotes_contents, self.noterefs, self.footnotes =\
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ import codecs
|
|||||||
|
|
||||||
from src.book_solver import BookSolver
|
from src.book_solver import BookSolver
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.html_preprocessor import HtmlPreprocessor
|
from src.html_presets_processor import HtmlPresetsProcessor
|
||||||
from src.style_preprocessor import StylePreprocessor
|
from src.style_reader import StyleReader
|
||||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||||
from src.epub_converter.epub_converter import EpubConverter
|
from src.epub_converter.epub_converter import EpubConverter
|
||||||
|
|
||||||
@@ -30,16 +30,15 @@ class EpubBook(BookSolver):
|
|||||||
json for LiveCarta platform
|
json for LiveCarta platform
|
||||||
|
|
||||||
"""
|
"""
|
||||||
html_preprocessor = HtmlPreprocessor(
|
html_preprocessor = HtmlPresetsProcessor(
|
||||||
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
||||||
style_preprocessor = StylePreprocessor()
|
style_preprocessor = StyleReader()
|
||||||
html_processor = HtmlEpubProcessor(logger=self.logger_object,
|
html_processor = HtmlEpubProcessor(logger=self.logger_object,
|
||||||
html_preprocessor=html_preprocessor)
|
html_preprocessor=html_preprocessor)
|
||||||
json_converter = EpubConverter(
|
json_converter = EpubConverter(
|
||||||
self.book_path, access=self.access, logger=self.logger_object,
|
self.book_path, access=self.access, logger=self.logger_object,
|
||||||
style_processor=style_preprocessor, html_processor=html_processor)
|
style_processor=style_preprocessor, html_processor=html_processor)
|
||||||
content_dict = json_converter.convert_to_dict()
|
content_dict = json_converter.convert_to_dict()
|
||||||
|
|
||||||
return content_dict
|
return content_dict
|
||||||
|
|
||||||
|
|
||||||
@@ -48,9 +47,9 @@ if __name__ == "__main__":
|
|||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||||
|
|
||||||
html_preprocessor = HtmlPreprocessor(
|
html_preprocessor = HtmlPresetsProcessor(
|
||||||
logger=logger_object, preset_path="../../presets/epub_presets.json")
|
logger=logger_object, preset_path="../../presets/epub_presets.json")
|
||||||
style_preprocessor = StylePreprocessor()
|
style_preprocessor = StyleReader()
|
||||||
html_processor = HtmlEpubProcessor(logger=logger_object,
|
html_processor = HtmlEpubProcessor(logger=logger_object,
|
||||||
html_preprocessor=html_preprocessor)
|
html_preprocessor=html_preprocessor)
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from bs4.element import PageElement
|
|||||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||||
|
|
||||||
from src.util.helpers import BookLogger
|
from src.util.helpers import BookLogger
|
||||||
from src.html_preprocessor import _preprocess_html
|
from src.html_presets_processor import _process_presets
|
||||||
|
|
||||||
|
|
||||||
class HtmlEpubProcessor:
|
class HtmlEpubProcessor:
|
||||||
@@ -113,43 +113,6 @@ class HtmlEpubProcessor:
|
|||||||
tag.extract()
|
tag.extract()
|
||||||
return
|
return
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _process_tables(chapter_tag: BeautifulSoup):
|
|
||||||
"""
|
|
||||||
Function preprocesses tables and tags(td|th|tr)
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
chapter_tag: BeautifulSoup
|
|
||||||
Tag & contents of the chapter tag
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
NoReturn
|
|
||||||
Chapter Tag with processed tables
|
|
||||||
|
|
||||||
"""
|
|
||||||
tables = chapter_tag.find_all("table")
|
|
||||||
for table in tables:
|
|
||||||
for t_tag in table.find_all(re.compile("td|th|tr")):
|
|
||||||
width = ""
|
|
||||||
if t_tag.get("style"):
|
|
||||||
width_match = re.search(
|
|
||||||
r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"])
|
|
||||||
if width_match:
|
|
||||||
size = width_match.group(1)
|
|
||||||
width = size + "px"
|
|
||||||
|
|
||||||
t_tag.attrs["width"] = t_tag.get("width") or width
|
|
||||||
|
|
||||||
if t_tag.attrs.get("style"):
|
|
||||||
t_tag.attrs["style"] = t_tag.attrs["style"].replace(
|
|
||||||
"border:0;", "")
|
|
||||||
if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "":
|
|
||||||
del t_tag.attrs["style"]
|
|
||||||
|
|
||||||
if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
|
|
||||||
table.attrs["border"] = "1"
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _class_removing(chapter_tag: BeautifulSoup):
|
def _class_removing(chapter_tag: BeautifulSoup):
|
||||||
"""
|
"""
|
||||||
@@ -185,13 +148,13 @@ class HtmlEpubProcessor:
|
|||||||
----------
|
----------
|
||||||
1. comments removal
|
1. comments removal
|
||||||
2. wrap NavigableString with tag <p>
|
2. wrap NavigableString with tag <p>
|
||||||
3-6. wrap tags with <table>
|
3. heading removal
|
||||||
|
4. wrap tags with <table>
|
||||||
replace tags with correspond LiveCarta tags
|
replace tags with correspond LiveCarta tags
|
||||||
|
replace/remove attrs, values of attrs
|
||||||
unwrap tags
|
unwrap tags
|
||||||
insert tags into correspond tags
|
insert tags into correspond tags
|
||||||
7. heading removal
|
5. class removal
|
||||||
8. process_tables
|
|
||||||
9. class removal
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@@ -203,14 +166,12 @@ class HtmlEpubProcessor:
|
|||||||
self._remove_comments(chapter_tag)
|
self._remove_comments(chapter_tag)
|
||||||
# 2.
|
# 2.
|
||||||
self._wrap_strings_with_p(chapter_tag)
|
self._wrap_strings_with_p(chapter_tag)
|
||||||
# 3-6.
|
# 3.
|
||||||
_preprocess_html(
|
|
||||||
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
|
|
||||||
# 7.
|
|
||||||
if remove_title_from_chapter:
|
if remove_title_from_chapter:
|
||||||
self._remove_headings_content(chapter_tag, title_str)
|
self._remove_headings_content(chapter_tag, title_str)
|
||||||
# 8.
|
# 4.
|
||||||
self._process_tables(chapter_tag)
|
_process_presets(
|
||||||
# 9. remove classes that weren't created by converter
|
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
|
||||||
|
# 5. remove classes that weren't created by converter
|
||||||
self._class_removing(chapter_tag)
|
self._class_removing(chapter_tag)
|
||||||
return chapter_tag
|
return chapter_tag
|
||||||
|
|||||||
Reference in New Issue
Block a user