From acb2ce48c2c27c1b88445270cf6d90a9698561e6 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 6 Jun 2022 16:37:42 +0300 Subject: [PATCH] Formatting: documentation + optimization --- src/book_solver.py | 8 ++-- src/docx_converter/html_docx_preprocessor.py | 26 +++++------- src/epub_converter/html_epub_preprocessor.py | 42 +++++++++----------- 3 files changed, 33 insertions(+), 43 deletions(-) diff --git a/src/book_solver.py b/src/book_solver.py index 4176280..c45af0f 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -13,8 +13,8 @@ class BookSolver: """ This is Main Abstract class for solving a task of a book conversion Having an id of coming book, gets book from server, runs conversion. - In parallel it updates status of a book conversion on admin panel. - Finally sends result to server. + In parallel, it updates status of a book conversion on admin panel. + Finally, sends result to server. Result is a json, JSON schema in book_schema.json """ @@ -35,12 +35,12 @@ class BookSolver: assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." 
- def save_book_file(self, content: str): + def save_book_file(self, content: bytes): """ Function saves binary content of file to .docx/.epub Parameters ---------- - content: str + content: bytes str binary content of the file """ diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index b2b89a1..db847b0 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -116,8 +116,8 @@ class HTMLDocxPreprocessor: if face is not None: face = re.sub(r",[\w,\- ]*$", "", face) - if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face): - font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face] + if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(face): + font.attrs["face"] = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE[face] else: font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME @@ -137,11 +137,11 @@ class HTMLDocxPreprocessor: def clean_trash(self): """Function to remove all styles and tags we don't need.""" self._clean_tag('span', 'style', re.compile( - r'^background: #[0-9a-fA-F]{6}$')) + r'^background: #[\da-fA-F]{6}$')) # todo: check for another languages self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) self._clean_tag('span', 'style', re.compile( - '^letter-spacing: -?[\d\.]+pt$')) + '^letter-spacing: -?[\d.]+pt$')) self._clean_tag('font', 'face', re.compile( r'^Times New Roman[\w, ]+$')) @@ -179,13 +179,13 @@ class HTMLDocxPreprocessor: style = p.get('style') if style: - indent = re.search(r'text-indent: ([\d\.]{1,4})in', style) - margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style) + indent = re.search(r'text-indent: ([\d.]{1,4})in', style) + margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style) margin_right = re.search( - r'margin-right: ([\d\.]{1,4})in', style) - margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style) + 
r'margin-right: ([\d.]{1,4})in', style) + margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style) margin_bottom = re.search( - r'margin-bottom: ([\d\.]{1,4})in', style) + r'margin-bottom: ([\d.]{1,4})in', style) else: indent = None margin_left = None @@ -517,7 +517,7 @@ class HTMLDocxPreprocessor: Function for gathering info about top-level chapters. Assume: - - Headers with smallest outline(or digit in ) are top level chapters. + - Headers with the smallest outline(or digit in ) are top level chapters. [ It is consistent with a recursive algorithm for saving content to a resulted json structure, which happens in header_to_json()] @@ -560,7 +560,7 @@ class HTMLDocxPreprocessor: Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header - 2. it is first header from the top level list and it equals to 'introductio + 2. it is first header from the top level list, and it equals to 'introduction' Returns ------- None @@ -665,10 +665,6 @@ class HTMLDocxPreprocessor: Function - process tags
 <li>. - unwrap
 <p>
    tags. - Parameters - ---------- - body_tag: Tag, soup object - Returns ------- None diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 065481f..73e357c 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -75,8 +75,8 @@ def _preprocess_table(body_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" tables = body_tag.find_all("table") for table in tables: - ts = table.find_all(re.compile("td|th|tr")) - for t_tag in ts: + t_tags = table.find_all(re.compile("td|th|tr")) + for t_tag in t_tags: style = t_tag.get('style') width = '' if style: @@ -113,7 +113,6 @@ def _process_lists(body_tag: BeautifulSoup): None """ - li_tags = body_tag.find_all("li") for li_tag in li_tags: if li_tag.p: @@ -268,7 +267,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note return footnotes, new_noterefs_tags, new_footnotes_tags -def unwrap_structural_tags(body_tag: BeautifulSoup): +def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup: """ Main function that works with structure of html. Make changes inplace. 
Parameters @@ -288,10 +287,10 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): Returns ------- - None + body_tag: Tag, BeautifulSoup + adjusted body_tag """ - def _preserve_class_in_aside_tag(tag_): """to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" # this is for Wiley books with boxes @@ -311,7 +310,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): Returns ------- - None + bool """ # this is for Wiley books with boxes @@ -454,21 +453,19 @@ def unwrap_structural_tags(body_tag: BeautifulSoup): tag = body_tag.new_tag('p') tag.append(str(node)) node.replace_with(tag) - return body_tag def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: """After processing on a first_id that corresponds to current chapter, from initial html_soup all tags from current chapter are extracted - Parameters ---------- first_id: Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' href: Name of current chapter's file - html_soup: Tag, soup object + html_soup: Tag Soup object of current file Returns @@ -530,19 +527,17 @@ def _clean_wiley_block(block): h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) -def _preprocess_block_tags(chapter_tag): +def _preprocess_block_tags(chapter_tag: Tag): """Function preprocessing tags""" - for block in chapter_tag.find_all("blockquote"): - if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: - _clean_wiley_block(block) - - color = '#DDDDDD' if block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, block, bg_color=color) - block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) - block.unwrap() + for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): + _clean_wiley_block(block) + color = '#DDDDDD' if block.attrs.get( + 'class') == 
'feature1' else None + color = '#EEEEEE' if block.attrs.get( + 'class') == 'feature2' else color + _wrap_block_tag_with_table(chapter_tag, block, bg_color=color) + block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) + block.unwrap() for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): _clean_wiley_block(future_block) @@ -647,8 +642,7 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup): code.name = "span" if code.parent.name == "pre": continue - - # if tags aren't in pre + # if tags aren't in pre and don't have style if not code.attrs.get('style'): code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'