Remove processing of tables

This commit is contained in:
Kiryl
2022-09-08 13:11:45 +03:00
parent 539a8df176
commit 001e55a27b
3 changed files with 26 additions and 66 deletions

View File

@@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.style_preprocessor import StylePreprocessor
from src.style_reader import StyleReader
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles
from src.inline_style_processor import modify_html_soup_with_css_styles
class EpubConverter:
def __init__(self, book_path, access=None, logger: BookLogger = None,
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None):
style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None):
self.book_path = book_path
self.access = access
self.logger: BookLogger = logger
@@ -57,13 +57,6 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
@@ -76,6 +69,13 @@ class EpubConverter:
self.logger.log("CSS styles fusion(inline+file).")
self.add_css_styles_to_html_soup()
self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup:
self.footnotes_contents, self.noterefs, self.footnotes =\

View File

@@ -3,8 +3,8 @@ import codecs
from src.book_solver import BookSolver
from src.util.helpers import BookLogger
from src.html_preprocessor import HtmlPreprocessor
from src.style_preprocessor import StylePreprocessor
from src.html_presets_processor import HtmlPresetsProcessor
from src.style_reader import StyleReader
from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.epub_converter import EpubConverter
@@ -30,16 +30,15 @@ class EpubBook(BookSolver):
json for LiveCarta platform
"""
html_preprocessor = HtmlPreprocessor(
html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json")
style_preprocessor = StylePreprocessor()
style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor)
json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object,
style_processor=style_preprocessor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict()
return content_dict
@@ -48,9 +47,9 @@ if __name__ == "__main__":
logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1])
html_preprocessor = HtmlPreprocessor(
html_preprocessor = HtmlPresetsProcessor(
logger=logger_object, preset_path="../../presets/epub_presets.json")
style_preprocessor = StylePreprocessor()
style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=logger_object,
html_preprocessor=html_preprocessor)

View File

@@ -4,7 +4,7 @@ from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from src.util.helpers import BookLogger
from src.html_preprocessor import _preprocess_html
from src.html_presets_processor import _process_presets
class HtmlEpubProcessor:
@@ -113,43 +113,6 @@ class HtmlEpubProcessor:
tag.extract()
return
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup) -> None:
    """
    Function preprocesses tables and their row/cell tags (td|th|tr)

    For every <table> found in the chapter:
    - each td/th/tr receives a "width" attribute: an existing attribute
      value is kept, otherwise the width found in the inline style is used,
      otherwise an empty string is written;
    - "border:0;" is dropped from the inline style of td/th/tr, and the
      style attribute is deleted when only whitespace is left;
    - a missing or zero ("0", "0px") table border is replaced with "1".

    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Returns
    -------
    None
        chapter_tag is modified in place

    """
    # The lookbehind rejects "min-width", "max-width", "border-width" and the
    # like, while still accepting a "width" declaration that opens the style
    # string (a leading "[^-]" would require some character before it).
    width_pattern = re.compile(r"(?<![-\w])width:\s*(\d+\.?\d*)(p[tx])")

    for table in chapter_tag.find_all("table"):
        # A list of names is matched exactly. A compiled regex is applied with
        # search() and would also hit <thead>, <strong>, <strike>, <track>.
        for t_tag in table.find_all(["td", "th", "tr"]):
            style = t_tag.attrs.get("style")

            width = ""
            if style:
                width_match = width_pattern.search(style)
                if width_match:
                    # NOTE(review): the number is emitted with a "px" suffix
                    # for both px and pt sources, i.e. pt is not converted.
                    width = width_match.group(1) + "px"
            # An explicit width attribute always wins over the inline style.
            t_tag.attrs["width"] = t_tag.get("width") or width

            if style:
                style = style.replace("border:0;", "")
                if style.strip():
                    t_tag.attrs["style"] = style
                else:
                    # Nothing but whitespace is left: drop the attribute.
                    del t_tag.attrs["style"]

        border = table.attrs.get("border")
        if not border or border in ("0", "0px"):
            table.attrs["border"] = "1"
@staticmethod
def _class_removing(chapter_tag: BeautifulSoup):
"""
@@ -185,13 +148,13 @@ class HtmlEpubProcessor:
----------
1. comments removal
2. wrap NavigableString with tag <p>
3-6. wrap tags with <table>
3. heading removal
4. wrap tags with <table>
replace tags with correspond LiveCarta tags
replace/remove attrs, values of attrs
unwrap tags
insert tags into correspond tags
7. heading removal
8. process_tables
9. class removal
5. class removal
Returns
-------
@@ -203,14 +166,12 @@ class HtmlEpubProcessor:
self._remove_comments(chapter_tag)
# 2.
self._wrap_strings_with_p(chapter_tag)
# 3-6.
_preprocess_html(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
# 7.
# 3.
if remove_title_from_chapter:
self._remove_headings_content(chapter_tag, title_str)
# 8.
self._process_tables(chapter_tag)
# 9. remove classes that weren't created by converter
# 4.
_process_presets(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
# 5. remove classes that weren't created by converter
self._class_removing(chapter_tag)
return chapter_tag