Revert "epub converter: add new type of blocks"

This reverts commit eb882a700356149c6133c5291ed003dcaa0a183d.
2021-08-20 16:17:28 +03:00
parent 77cf6e91a9
commit 660cd077a0
3 changed files with 13 additions and 36 deletions
--- a/src/consumer.py
+++ b/src/consumer.py
@@ -44,7 +44,7 @@ def convert_book(book_id, access, logger, libra_locker):
    print('Book has been proceeded.')
-def convert_epub_book(book_id, access, logger):
+def convert_epub_book(book_id, access, logger=None):
    logger.info(f'Start processing epub book-{book_id}.')
    try:
--- a/src/epub_postprocessor.py
+++ b/src/epub_postprocessor.py
@@ -414,7 +414,7 @@ if __name__ == "__main__":
    logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
-    json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119605959_f3.epub',
+    json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119682387_pre_code2.epub',
                                       logger=logger_object)
    tmp = json_converter.convert_to_dict()
--- a/src/html_epub_preprocessor.py
+++ b/src/html_epub_preprocessor.py
@@ -250,24 +250,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
 def unwrap_structural_tags(body_tag):
    def _preserve_class_in_aside_tag(tag_):
-        # to save css style inherited from class, copy class to aside tag (which is parent to tag_)
+        # to save css style inherited from class, copy class to aside tag
        # this is for Wiley books with boxes
        tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
        if tag_.parent.name == 'aside':
            if not tag_.parent.attrs.get('class'):
                tag_.parent.attrs['class'] = tag_class
    def _preserve_class_in_section_tag(tag_):
        # to save css style inherited from class, copy class to child <p>
        # this is for Wiley books with boxes
        tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
        child_p_tag = tag_.find_all("p")
        if len(child_p_tag) != 1:
            return
        child_p_tag = child_p_tag[0]
        if not child_p_tag.attrs.get('class'):
            child_p_tag.attrs['class'] = tag_class
    def _add_table_to_abc_books(tag_, border, bg_color):
        wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
@@ -303,7 +292,6 @@ def unwrap_structural_tags(body_tag):
    for s in body_tag.find_all("section"):
        if s.attrs.get('class'):
            _preserve_class_in_aside_tag(s)
            _preserve_class_in_section_tag(s)
        _add_span_to_save_ids_for_links(s)
        s.unwrap()
@@ -423,36 +411,26 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co
    return table
 def _clean_wiley_block(block):
    hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
    for hr in hrs:
        hr.extract()
    h = block.find(re.compile("h[1-9]"))
    if h:
        h.name = "p"
        h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
 def preprocess_block_tags(chapter_tag):
    for block in chapter_tag.find_all("blockquote"):
        if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
-            _clean_wiley_block(block)
+            hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
            for hr in hrs:
                hr.extract()
            h = block.find(re.compile("h[1-9]"))
            if h:
                h.name = "p"
                h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
            color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
            color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
-            wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
+            wrap_block_tag_with_table(chapter_tag, block, color)
            block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
            block.unwrap()
    for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
        _clean_wiley_block(future_block)
        color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
        color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
        wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
 def _prepare_formatted(text):
    # replace <,> to save them as is in html code
    text = text.replace("<", "\x3C")
    text = text.replace(">", "\x3E")
    text = text.replace('\t', "\xa0 \xa0 ")  # &nbsp; &nbsp;
@@ -465,7 +443,7 @@ def preprocess_pre_tags(chapter_tag):
        new_tag = BeautifulSoup(features='lxml').new_tag("span")
        new_tag.attrs = pre.attrs.copy()
        spans = pre.find_all("span")
-        to_add_br = len(spans) > 1  # if in <pre> there are multiple <span>, we need to add <br> after each content
+        to_add_br = len(spans) > 1
        for child in pre.children:
            if isinstance(child, NavigableString):
@@ -492,7 +470,6 @@ def preprocess_pre_tags(chapter_tag):
 def preprocess_code_tags(chapter_tag):
    # function that emulates style of <code>, <kdb>, <var>
    for code in chapter_tag.find_all(re.compile("code|kdb|var")):
        code.name = 'span'
        if code.parent.name == "pre":