forked from LiveCarta/BookConverter
Remove processing of tables
This commit is contained in:
@@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
from src.util.helpers import BookLogger
|
||||
from src.livecarta_config import LiveCartaConfig
|
||||
from src.data_objects import ChapterItem, NavPoint
|
||||
from src.style_preprocessor import StylePreprocessor
|
||||
from src.style_reader import StyleReader
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||
from src.epub_converter.image_processing import update_images_src_links
|
||||
from src.epub_converter.footnotes_processing import preprocess_footnotes
|
||||
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
|
||||
from src.inline_style_processor import modify_html_soup_with_css_styles
|
||||
|
||||
|
||||
class EpubConverter:
|
||||
def __init__(self, book_path, access=None, logger: BookLogger = None,
|
||||
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
|
||||
style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None):
|
||||
self.book_path = book_path
|
||||
self.access = access
|
||||
self.logger: BookLogger = logger
|
||||
@@ -57,13 +57,6 @@ class EpubConverter:
|
||||
self.noterefs: List[Tag] = [] # start of the footnote
|
||||
self.footnotes: List[Tag] = [] # end of the footnote
|
||||
|
||||
self.logger.log("Image processing.")
|
||||
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
||||
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
||||
file_name = x.file_name
|
||||
content = x.content
|
||||
self.img_href2img_bytes[file_name] = content
|
||||
|
||||
self.logger.log("HTML files reading.")
|
||||
self.html_href2html_body_soup: Dict[str,
|
||||
BeautifulSoup] = self.build_href2soup_content()
|
||||
@@ -76,6 +69,13 @@ class EpubConverter:
|
||||
self.logger.log("CSS styles fusion(inline+file).")
|
||||
self.add_css_styles_to_html_soup()
|
||||
|
||||
self.logger.log("Image processing.")
|
||||
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
||||
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
||||
file_name = x.file_name
|
||||
content = x.content
|
||||
self.img_href2img_bytes[file_name] = content
|
||||
|
||||
self.logger.log("Footnotes processing.")
|
||||
for href in self.html_href2html_body_soup:
|
||||
self.footnotes_contents, self.noterefs, self.footnotes =\
|
||||
|
||||
@@ -3,8 +3,8 @@ import codecs
|
||||
|
||||
from src.book_solver import BookSolver
|
||||
from src.util.helpers import BookLogger
|
||||
from src.html_preprocessor import HtmlPreprocessor
|
||||
from src.style_preprocessor import StylePreprocessor
|
||||
from src.html_presets_processor import HtmlPresetsProcessor
|
||||
from src.style_reader import StyleReader
|
||||
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
|
||||
from src.epub_converter.epub_converter import EpubConverter
|
||||
|
||||
@@ -30,16 +30,15 @@ class EpubBook(BookSolver):
|
||||
json for LiveCarta platform
|
||||
|
||||
"""
|
||||
html_preprocessor = HtmlPreprocessor(
|
||||
html_preprocessor = HtmlPresetsProcessor(
|
||||
logger=self.logger_object, preset_path="presets/epub_presets.json")
|
||||
style_preprocessor = StylePreprocessor()
|
||||
style_preprocessor = StyleReader()
|
||||
html_processor = HtmlEpubProcessor(logger=self.logger_object,
|
||||
html_preprocessor=html_preprocessor)
|
||||
json_converter = EpubConverter(
|
||||
self.book_path, access=self.access, logger=self.logger_object,
|
||||
style_processor=style_preprocessor, html_processor=html_processor)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
|
||||
return content_dict
|
||||
|
||||
|
||||
@@ -48,9 +47,9 @@ if __name__ == "__main__":
|
||||
logger_object = BookLogger(
|
||||
name="epub", book_id=epub_file_path.split("/")[-1])
|
||||
|
||||
html_preprocessor = HtmlPreprocessor(
|
||||
html_preprocessor = HtmlPresetsProcessor(
|
||||
logger=logger_object, preset_path="../../presets/epub_presets.json")
|
||||
style_preprocessor = StylePreprocessor()
|
||||
style_preprocessor = StyleReader()
|
||||
html_processor = HtmlEpubProcessor(logger=logger_object,
|
||||
html_preprocessor=html_preprocessor)
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ from bs4.element import PageElement
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
||||
|
||||
from src.util.helpers import BookLogger
|
||||
from src.html_preprocessor import _preprocess_html
|
||||
from src.html_presets_processor import _process_presets
|
||||
|
||||
|
||||
class HtmlEpubProcessor:
|
||||
@@ -113,43 +113,6 @@ class HtmlEpubProcessor:
|
||||
tag.extract()
|
||||
return
|
||||
|
||||
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup):
    """
    Function preprocesses tables and tags(td|th|tr)
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        Chapter Tag is modified in place:
        - every td/th/tr gets a "width" attribute (existing value kept,
          otherwise taken from the inline style, otherwise "")
        - "border:0;" is stripped from inline styles, empty styles are dropped
        - tables without a visible border get border="1"

    """
    # The lookbehind rejects "min-width", "max-width", "border-width", etc.
    # Unlike a consuming "[^-]" prefix it still matches when the style
    # string starts with "width:".
    width_pattern = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")

    for table in chapter_tag.find_all("table"):
        # A list filter matches tag names exactly. A compiled regex is
        # applied with search(), so "td|th|tr" would also select tags
        # such as <thead> or <strong> nested inside the table.
        for t_tag in table.find_all(["td", "th", "tr"]):
            style = t_tag.get("style")

            width = ""
            if style:
                width_match = width_pattern.search(style)
                if width_match:
                    # The numeric part is reused as-is with a "px" suffix,
                    # whether the style declared pt or px.
                    width = width_match.group(1) + "px"

            # An explicit width attribute wins over the inline style.
            t_tag.attrs["width"] = t_tag.get("width") or width

            if style:
                style = style.replace("border:0;", "")
                if re.sub(r"[\s\xa0]", "", style) == "":
                    # Nothing but whitespace is left: drop the attribute.
                    del t_tag.attrs["style"]
                else:
                    t_tag.attrs["style"] = style

        border = table.attrs.get("border")
        if not border or border in ["0", "0px"]:
            table.attrs["border"] = "1"
|
||||
|
||||
@staticmethod
|
||||
def _class_removing(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
@@ -185,13 +148,13 @@ class HtmlEpubProcessor:
|
||||
----------
|
||||
1. comments removal
|
||||
2. wrap NavigableString with tag <p>
|
||||
3-6. wrap tags with <table>
|
||||
3. heading removal
|
||||
4. wrap tags with <table>
|
||||
replace tags with correspond LiveCarta tags
|
||||
replace/remove attrs, values of attrs
|
||||
unwrap tags
|
||||
insert tags into correspond tags
|
||||
7. heading removal
|
||||
8. process_tables
|
||||
9. class removal
|
||||
5. class removal
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -203,14 +166,12 @@ class HtmlEpubProcessor:
|
||||
self._remove_comments(chapter_tag)
|
||||
# 2.
|
||||
self._wrap_strings_with_p(chapter_tag)
|
||||
# 3-6.
|
||||
_preprocess_html(
|
||||
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
|
||||
# 7.
|
||||
# 3.
|
||||
if remove_title_from_chapter:
|
||||
self._remove_headings_content(chapter_tag, title_str)
|
||||
# 8.
|
||||
self._process_tables(chapter_tag)
|
||||
# 9. remove classes that weren't created by converter
|
||||
# 4.
|
||||
_process_presets(
|
||||
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
|
||||
# 5. remove classes that weren't created by converter
|
||||
self._class_removing(chapter_tag)
|
||||
return chapter_tag
|
||||
|
||||
Reference in New Issue
Block a user