diff --git a/src/consumer.py b/src/consumer.py index ed2346f..7a0a6d5 100644 --- a/src/consumer.py +++ b/src/consumer.py @@ -44,7 +44,7 @@ def convert_book(book_id, access, logger, libra_locker): print('Book has been proceeded.') -def convert_epub_book(book_id, access, logger): +def convert_epub_book(book_id, access, logger=None): logger.info(f'Start processing epub book-{book_id}.') try: diff --git a/src/epub_postprocessor.py b/src/epub_postprocessor.py index 66f7454..69c27aa 100644 --- a/src/epub_postprocessor.py +++ b/src/epub_postprocessor.py @@ -414,7 +414,7 @@ if __name__ == "__main__": logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119605959_f3.epub', + json_converter = EpubPostprocessor('/home/katerina/PycharmProjects/Jenia/converter/epub/9781119682387_pre_code2.epub', logger=logger_object) tmp = json_converter.convert_to_dict() diff --git a/src/html_epub_preprocessor.py b/src/html_epub_preprocessor.py index 76f07b5..baf1b59 100644 --- a/src/html_epub_preprocessor.py +++ b/src/html_epub_preprocessor.py @@ -250,24 +250,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note def unwrap_structural_tags(body_tag): def _preserve_class_in_aside_tag(tag_): - # to save css style inherited from class, copy class to aside tag (which is parent to tag_) + # to save css style inherited from class, copy class to aside tag # this is for Wiley books with boxes tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0] if tag_.parent.name == 'aside': if not tag_.parent.attrs.get('class'): tag_.parent.attrs['class'] = tag_class - def _preserve_class_in_section_tag(tag_): - # to save css style inherited from class, copy class to child

- # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0] - child_p_tag = tag_.find_all("p") - if len(child_p_tag) != 1: - return - child_p_tag = child_p_tag[0] - if not child_p_tag.attrs.get('class'): - child_p_tag.attrs['class'] = tag_class - def _add_table_to_abc_books(tag_, border, bg_color): wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color) @@ -303,7 +292,6 @@ def unwrap_structural_tags(body_tag): for s in body_tag.find_all("section"): if s.attrs.get('class'): _preserve_class_in_aside_tag(s) - _preserve_class_in_section_tag(s) _add_span_to_save_ids_for_links(s) s.unwrap() @@ -423,36 +411,26 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co return table -def _clean_wiley_block(block): - hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) - for hr in hrs: - hr.extract() - h = block.find(re.compile("h[1-9]")) - if h: - h.name = "p" - h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) - - def preprocess_block_tags(chapter_tag): for block in chapter_tag.find_all("blockquote"): if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: - _clean_wiley_block(block) + hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) + for hr in hrs: + hr.extract() + + h = block.find(re.compile("h[1-9]")) + if h: + h.name = "p" + h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color - wrap_block_tag_with_table(chapter_tag, block, bg_color=color) + wrap_block_tag_with_table(chapter_tag, block, color) block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) block.unwrap() - for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(future_block) - color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None - color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color - wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) - def _prepare_formatted(text): - # replace <,> to save them as is in html code text = text.replace("<", "\x3C") text = text.replace(">", "\x3E") text = text.replace('\t', "\xa0 \xa0 ") #     @@ -465,7 +443,7 @@ def preprocess_pre_tags(chapter_tag): new_tag = BeautifulSoup(features='lxml').new_tag("span") new_tag.attrs = pre.attrs.copy() spans = pre.find_all("span") - to_add_br = len(spans) > 1 # if in

 there are multiple , we need to add 
after each content + to_add_br = len(spans) > 1 for child in pre.children: if isinstance(child, NavigableString): @@ -492,7 +470,6 @@ def preprocess_pre_tags(chapter_tag): def preprocess_code_tags(chapter_tag): - # function that emulates style of , , for code in chapter_tag.find_all(re.compile("code|kdb|var")): code.name = 'span' if code.parent.name == "pre":