Revert "epub converter: add new type of blocks"

This reverts commit eb882a700356149c6133c5291ed003dcaa0a183d.
This commit is contained in:
shirshasa
2021-08-20 16:17:28 +03:00
parent 77cf6e91a9
commit 660cd077a0
3 changed files with 13 additions and 36 deletions

View File

@@ -44,7 +44,7 @@ def convert_book(book_id, access, logger, libra_locker):
print('Book has been proceeded.')
def convert_epub_book(book_id, access, logger):
def convert_epub_book(book_id, access, logger=None):
logger.info(f'Start processing epub book-{book_id}.')
try:

View File

@@ -414,7 +414,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119605959_f3.epub',
json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119682387_pre_code2.epub',
logger=logger_object)
tmp = json_converter.convert_to_dict()

View File

@@ -250,24 +250,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
def unwrap_structural_tags(body_tag):
def _preserve_class_in_aside_tag(tag_):
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
# To keep the CSS styling inherited from the class, copy the class to the parent <aside> tag
# (only when that <aside> has no class of its own). This is needed for Wiley books with boxes.
tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
if tag_.parent.name == 'aside':
if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class
def _preserve_class_in_section_tag(tag_):
# to save css style inherited from class, copy class to child <p>
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
child_p_tag = tag_.find_all("p")
if len(child_p_tag) != 1:
return
child_p_tag = child_p_tag[0]
if not child_p_tag.attrs.get('class'):
child_p_tag.attrs['class'] = tag_class
def _add_table_to_abc_books(tag_, border, bg_color):
wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
@@ -303,7 +292,6 @@ def unwrap_structural_tags(body_tag):
for s in body_tag.find_all("section"):
if s.attrs.get('class'):
_preserve_class_in_aside_tag(s)
_preserve_class_in_section_tag(s)
_add_span_to_save_ids_for_links(s)
s.unwrap()
@@ -423,36 +411,26 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co
return table
def _clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs:
hr.extract()
h = block.find(re.compile("h[1-9]"))
if h:
h.name = "p"
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def preprocess_block_tags(chapter_tag):
for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
_clean_wiley_block(block)
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs:
hr.extract()
h = block.find(re.compile("h[1-9]"))
if h:
h.name = "p"
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
wrap_block_tag_with_table(chapter_tag, block, color)
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
block.unwrap()
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
_clean_wiley_block(future_block)
color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
def _prepare_formatted(text):
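# Replace < and > so that they are preserved as-is in the HTML code.
# NOTE(review): "\x3C" and "\x3E" are the very same characters as "<" and ">", so the two
# replacements below are no-ops — HTML entities (&lt; / &gt;) were probably intended; confirm.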
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
@@ -465,7 +443,7 @@ def preprocess_pre_tags(chapter_tag):
new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy()
spans = pre.find_all("span")
to_add_br = len(spans) > 1 # if in <pre> there are multiple <span>, we need to add <br> after each content
to_add_br = len(spans) > 1
for child in pre.children:
if isinstance(child, NavigableString):
@@ -492,7 +470,6 @@ def preprocess_pre_tags(chapter_tag):
def preprocess_code_tags(chapter_tag):
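# Emulates the style of <code>, <kbd> and <var> tags by turning them into <span>.
# NOTE(review): the pattern below spells the tag "kdb", which looks like a typo for "kbd" — confirm.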
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
code.name = 'span'
if code.parent.name == "pre":