Remove processing of tables

This commit is contained in:
Kiryl
2022-09-08 13:11:45 +03:00
parent 539a8df176
commit 001e55a27b
3 changed files with 26 additions and 66 deletions

View File

@@ -12,16 +12,16 @@ from bs4 import BeautifulSoup, Tag, NavigableString
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.style_preprocessor import StylePreprocessor from src.style_reader import StyleReader
from src.epub_converter.html_epub_processor import HtmlEpubProcessor from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.image_processing import update_images_src_links
from src.epub_converter.footnotes_processing import preprocess_footnotes from src.epub_converter.footnotes_processing import preprocess_footnotes
from src.tag_inline_style_processor import modify_html_soup_with_css_styles from src.inline_style_processor import modify_html_soup_with_css_styles
class EpubConverter: class EpubConverter:
def __init__(self, book_path, access=None, logger: BookLogger = None, def __init__(self, book_path, access=None, logger: BookLogger = None,
style_processor: StylePreprocessor = None, html_processor: HtmlEpubProcessor = None): style_processor: StyleReader = None, html_processor: HtmlEpubProcessor = None):
self.book_path = book_path self.book_path = book_path
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
@@ -57,13 +57,6 @@ class EpubConverter:
self.noterefs: List[Tag] = [] # start of the footnote self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
self.logger.log("HTML files reading.") self.logger.log("HTML files reading.")
self.html_href2html_body_soup: Dict[str, self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content() BeautifulSoup] = self.build_href2soup_content()
@@ -76,6 +69,13 @@ class EpubConverter:
self.logger.log("CSS styles fusion(inline+file).") self.logger.log("CSS styles fusion(inline+file).")
self.add_css_styles_to_html_soup() self.add_css_styles_to_html_soup()
self.logger.log("Image processing.")
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.img_href2img_bytes[file_name] = content
self.logger.log("Footnotes processing.") self.logger.log("Footnotes processing.")
for href in self.html_href2html_body_soup: for href in self.html_href2html_body_soup:
self.footnotes_contents, self.noterefs, self.footnotes =\ self.footnotes_contents, self.noterefs, self.footnotes =\

View File

@@ -3,8 +3,8 @@ import codecs
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.html_preprocessor import HtmlPreprocessor from src.html_presets_processor import HtmlPresetsProcessor
from src.style_preprocessor import StylePreprocessor from src.style_reader import StyleReader
from src.epub_converter.html_epub_processor import HtmlEpubProcessor from src.epub_converter.html_epub_processor import HtmlEpubProcessor
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
@@ -30,16 +30,15 @@ class EpubBook(BookSolver):
json for LiveCarta platform json for LiveCarta platform
""" """
html_preprocessor = HtmlPreprocessor( html_preprocessor = HtmlPresetsProcessor(
logger=self.logger_object, preset_path="presets/epub_presets.json") logger=self.logger_object, preset_path="presets/epub_presets.json")
style_preprocessor = StylePreprocessor() style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=self.logger_object, html_processor = HtmlEpubProcessor(logger=self.logger_object,
html_preprocessor=html_preprocessor) html_preprocessor=html_preprocessor)
json_converter = EpubConverter( json_converter = EpubConverter(
self.book_path, access=self.access, logger=self.logger_object, self.book_path, access=self.access, logger=self.logger_object,
style_processor=style_preprocessor, html_processor=html_processor) style_processor=style_preprocessor, html_processor=html_processor)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
return content_dict return content_dict
@@ -48,9 +47,9 @@ if __name__ == "__main__":
logger_object = BookLogger( logger_object = BookLogger(
name="epub", book_id=epub_file_path.split("/")[-1]) name="epub", book_id=epub_file_path.split("/")[-1])
html_preprocessor = HtmlPreprocessor( html_preprocessor = HtmlPresetsProcessor(
logger=logger_object, preset_path="../../presets/epub_presets.json") logger=logger_object, preset_path="../../presets/epub_presets.json")
style_preprocessor = StylePreprocessor() style_preprocessor = StyleReader()
html_processor = HtmlEpubProcessor(logger=logger_object, html_processor = HtmlEpubProcessor(logger=logger_object,
html_preprocessor=html_preprocessor) html_preprocessor=html_preprocessor)

View File

@@ -4,7 +4,7 @@ from bs4.element import PageElement
from bs4 import BeautifulSoup, Tag, NavigableString, Comment from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from src.util.helpers import BookLogger from src.util.helpers import BookLogger
from src.html_preprocessor import _preprocess_html from src.html_presets_processor import _process_presets
class HtmlEpubProcessor: class HtmlEpubProcessor:
@@ -113,43 +113,6 @@ class HtmlEpubProcessor:
tag.extract() tag.extract()
return return
@staticmethod
def _process_tables(chapter_tag: BeautifulSoup):
    """
    Function preprocesses tables and tags(td|th|tr)
    Parameters
    ----------
    chapter_tag: BeautifulSoup
        Tag & contents of the chapter tag

    Steps
    ----------
    1. copy a pt/px width found in the inline style into the width attr
       (an already present, non-empty width attr is kept)
    2. remove "border:0;" from the inline style, drop the style attr
       when nothing but whitespace is left
    3. set border="1" on tables without a border or with a zero border

    Returns
    -------
    NoReturn
        Chapter Tag with processed tables

    """
    # "(?<!-)" skips min-width/max-width but, unlike "[^-]", still matches
    # a declaration that sits at the very beginning of the style value.
    # The unit (pt|px) is matched but discarded: the number is reused as px.
    width_pattern = re.compile(r"(?<!-)width: ?(\d+\.?\d*)(p[tx])")

    for table in chapter_tag.find_all("table"):
        # Exact tag names are passed as a list: a regex filter is applied
        # with search(), so "td|th|tr" also selected tags such as <thead>
        # or <strong> that merely contain one of those substrings.
        for t_tag in table.find_all(["td", "th", "tr"]):
            style = t_tag.get("style")

            css_width = ""
            if style:
                width_match = width_pattern.search(style)
                if width_match:
                    css_width = width_match.group(1) + "px"

            # Only write the attr when there is a value, otherwise every
            # cell without any width would receive an empty width="".
            width = t_tag.get("width") or css_width
            if width:
                t_tag.attrs["width"] = width

            if style:
                style = style.replace("border:0;", "")
                t_tag.attrs["style"] = style
                # strip() treats \xa0 as whitespace, same as the \s class
                if not style.strip():
                    del t_tag.attrs["style"]

        if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]:
            table.attrs["border"] = "1"
@staticmethod @staticmethod
def _class_removing(chapter_tag: BeautifulSoup): def _class_removing(chapter_tag: BeautifulSoup):
""" """
@@ -185,13 +148,13 @@ class HtmlEpubProcessor:
---------- ----------
1. comments removal 1. comments removal
2. wrap NavigableString with tag <p> 2. wrap NavigableString with tag <p>
3-6. wrap tags with <table> 3. heading removal
4. wrap tags with <table>
replace tags with correspond LiveCarta tags replace tags with correspond LiveCarta tags
replace/remove attrs, values of attrs
unwrap tags unwrap tags
insert tags into correspond tags insert tags into correspond tags
7. heading removal 5. class removal
8. process_tables
9. class removal
Returns Returns
------- -------
@@ -203,14 +166,12 @@ class HtmlEpubProcessor:
self._remove_comments(chapter_tag) self._remove_comments(chapter_tag)
# 2. # 2.
self._wrap_strings_with_p(chapter_tag) self._wrap_strings_with_p(chapter_tag)
# 3-6. # 3.
_preprocess_html(
html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
# 7.
if remove_title_from_chapter: if remove_title_from_chapter:
self._remove_headings_content(chapter_tag, title_str) self._remove_headings_content(chapter_tag, title_str)
# 8. # 4.
self._process_tables(chapter_tag) _process_presets(
# 9. remove classes that weren't created by converter html_preprocessor=self.html_preprocessor, html_soup=chapter_tag)
# 5. remove classes that weren't created by converter
self._class_removing(chapter_tag) self._class_removing(chapter_tag)
return chapter_tag return chapter_tag