diff --git a/consumer.py b/consumer.py index bbb38fb..b40c862 100644 --- a/consumer.py +++ b/consumer.py @@ -54,7 +54,6 @@ def convert_book(book_type: [DocxBook, EpubBook], params: dict, logger, book_id) raise exc logger.info(f'Book-{book_id} has been proceeded.') - print('Book has been proceeded.') def callback(ch, method, properties, body, logger, libra_locker): diff --git a/src/book_solver.py b/src/book_solver.py index 28288aa..cb57cb3 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -1,10 +1,3 @@ -""" This is Main Abstract class for solving a task of a book conversion - -Having an id of coming book, gets book from server, runs conversion. -In parallel it updates status of a book conversion on admin panel. -Finally sends result to server. -Result is a json, JSON schema in book_schema.json -""" import os import json import codecs @@ -17,6 +10,14 @@ from src.util.helpers import BookLogger, BookStatusWrapper class BookSolver: + """ + This is Main Abstract class for solving a task of a book conversion + Having an id of coming book, gets book from server, runs conversion. + In parallel it updates status of a book conversion on admin panel. + Finally sends result to server. + Result is a json, JSON schema in book_schema.json + """ + __metaclass__ = ABCMeta def __init__(self, book_id=0, access=None, main_logger=None): @@ -55,9 +56,7 @@ class BookSolver: self.file_path = pathlib.Path(file_path) def get_book_file(self): - """ - Method for getting and saving book from server. - """ + """ Method for getting and saving book from server. """ try: self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') content = self.access.get_book(self.book_id) @@ -92,6 +91,7 @@ class BookSolver: self.logger_object.log('Error has occurred while writing json file.' 
+ str(exc), logging.ERROR) def send_json_content_to_server(self, content: dict): + """ Function sends json_content to site """ try: self.access.send_book(self.book_id, content) self.logger_object.log(f'JSON data has been sent to server.') @@ -108,8 +108,10 @@ class BookSolver: return {} def test_conversion(self): - '''Function - without sending to server''' + """ + Function + - without sending to server + """ self.logger_object.log('Beginning of the test.') folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.join(folder_path, f'{self.book_type}') @@ -121,9 +123,11 @@ class BookSolver: self.logger_object.log('End of the test.') def conversion(self): - '''Function - with downloading book from server - with sending to server''' + """ + Function + - with downloading book from server + - with sending to server + """ try: self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.') self.get_book_file() @@ -140,9 +144,11 @@ class BookSolver: raise exc def conversion_local(self): - '''Function - without downloading book from server (local) - with sending to server''' + """ + Function + - without downloading book from server (local) + - with sending to server + """ try: self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.file_path}') with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json: diff --git a/src/data_objects.py b/src/data_objects.py index efd8f0b..8c51bc3 100644 --- a/src/data_objects.py +++ b/src/data_objects.py @@ -2,21 +2,22 @@ import re from typing import Union from ebooklib.epub import Section, Link - from src.livecarta_config import LiveCartaConfig -""" -These are data structures which form mapping from NCX to python data structures. -""" - class NavPoint: + """ + Class - Navigation Point, - every html|xhtml from epub + These are data structures which form mapping from NCX to python data structures. 
+ """ + def __init__(self, obj: Union[Link, Section] = None, ): self.href, self.id = self.parse_href_id(obj) self.title = obj.title @staticmethod def parse_href_id(item: Union[Link, Section]): + """Function parses href & id from item.href""" reg = r'(.+\..+\#)(.+)' match = re.search(reg, item.href) href, div_id = None, None @@ -36,13 +37,8 @@ class NavPoint: return '' % (self.href, self.id) -""" -These are data structures which form mapping to livecarta json structure. -""" - - def flatten(x): - """ magic function from stackoverflow for list flattening """ + """magic function from stackoverflow for list flattening""" atom = lambda i: not isinstance(i, list) nil = lambda i: not i car = lambda i: i[0] @@ -54,12 +50,18 @@ def flatten(x): class ChapterItem: + """ + Class of Chapter that could have subchapters + These are data structures which form mapping to livecarta json structure. + """ + def __init__(self, title, content, sub_items): self.title = title self.content = content self.sub_items = sub_items def to_dict(self, lvl=1): + """Function returns dictionary of chapter""" sub_dicts = [] if self.sub_items: for i in self.sub_items: @@ -86,4 +88,4 @@ class ChapterItem: } def __str__(self): - return '' % self.title + return '' % self.title \ No newline at end of file diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index aadd753..db75c1b 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -12,6 +12,7 @@ from src.book_solver import BookSolver class DocxBook(BookSolver): + """Class of .docx type book - child of BookSolver""" def __init__(self, book_id=0, access=None, html_path=None, main_logger=None, libra_locker=None): @@ -30,9 +31,7 @@ class DocxBook(BookSolver): self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG) def convert_doc_to_html(self): - """ - Method for convert .docx document to .html file. 
- """ + """Method for convert .docx document to .html file.""" self.logger_object.log(f'File - {self.file_path}.') print(f'{self.file_path}') self.logger_object.log('Beginning of conversion from .docx to .html.') @@ -92,9 +91,7 @@ class DocxBook(BookSolver): self.logger_object.log(f'Input file path after conversion: {self.html_path}.') def read_html(self): - """ - Method for reading .html file into beautiful soup tag. - """ + """Method for reading .html file into beautiful soup tag.""" try: html_text = open(self.html_path, 'r', encoding='utf8').read() self.logger_object.log('HTML for book has been loaded.') @@ -130,7 +127,6 @@ class DocxBook(BookSolver): 1. Convert docx to html with libra office 2. Parse and clean html, get list of tags, get footnotes 3. Parse from line structure to nested structure with JSONConverter - """ self.convert_doc_to_html() self.check_output_directory() diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index fad6e4a..e5d4d4b 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -35,9 +35,7 @@ class HTMLDocxPreprocessor: tag.unwrap() def _clean_underline_links(self): - """ - Function cleans meaningless tags before links. - """ + """Function cleans meaningless tags before links.""" underlines = self.body_tag.find_all("u") for u in underlines: if u.find_all('a'): @@ -79,9 +77,7 @@ class HTMLDocxPreprocessor: return re.sub(size + "pt", str(new_size) + "px", style) def _font_to_span(self): - """ - Function to convert tag to . If font style is default, then remove this tag. - """ + """Function to convert tag to . If font style is default, then remove this tag.""" fonts = self.body_tag.find_all("font") for font in fonts: face = font.get("face") @@ -119,9 +115,7 @@ class HTMLDocxPreprocessor: self.content = self.content[ind:] def clean_trash(self): - """ - Function to remove all styles and tags we don't need. 
- """ + """Function to remove all styles and tags we don't need.""" self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$')) self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$')) @@ -140,9 +134,7 @@ class HTMLDocxPreprocessor: table.decompose() def _process_paragraph(self): - """ - Function to process

tags (text-align and text-indent value). - """ + """Function to process

tags (text-align and text-indent value).""" paragraphs = self.body_tag.find_all('p') for p in paragraphs: @@ -193,9 +185,7 @@ class HTMLDocxPreprocessor: p.attrs['style'] = style def _process_two_columns(self): - """ - Function to process paragraphs which has two columns layout. - """ + """Function to process paragraphs which has two columns layout.""" two_columns = self.body_tag.find_all("div", style="column-count: 2") for div in two_columns: for child in div.children: @@ -204,9 +194,7 @@ class HTMLDocxPreprocessor: div.unwrap() def _process_tables(self): - """ - Function to process tables. Set "border" attribute. - """ + """Function to process tables. Set "border" attribute.""" tables = self.body_tag.find_all("table") for table in tables: tds = table.find_all("td") @@ -296,9 +284,7 @@ class HTMLDocxPreprocessor: return content.strip() def _process_footnotes(self): - """ - Function returns list of footnotes and delete them from html_soup. - """ + """Function returns list of footnotes and delete them from html_soup.""" footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) footnote_amt = len(footnote_anchors) @@ -404,9 +390,7 @@ class HTMLDocxPreprocessor: div.decompose() def _process_div(self): - """ - Function to process

tags. All the tags will be deleted from file, all content of the tags will stay. - """ + """Function to process
tags. All the tags will be deleted from file, all content of the tags will stay.""" divs = self.body_tag.find_all("div") for div in divs: @@ -423,9 +407,7 @@ class HTMLDocxPreprocessor: return len(toc_links) > 0 def _process_toc_links(self): - """ - Function to extract nodes which contains TOC links, remove links from file and detect headers. - """ + """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')}) headers = [link.parent for link in toc_links] outline_level = "1" # All the unknown outlines will be predicted as

@@ -448,13 +430,11 @@ class HTMLDocxPreprocessor: @staticmethod def clean_title_from_numbering(title: str): - """ - Function to remove digits from headers. - """ + """Function to remove digits from headers.""" title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title - title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) + title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title return title @staticmethod @@ -485,9 +465,7 @@ class HTMLDocxPreprocessor: self.apply_func_to_last_child(children[0], func) def _preprocessing_headings(self): - """ - Function to convert all lower level headings to p tags - """ + """Function to convert all lower level headings to p tags""" pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = self.body_tag.find_all(re.compile(pattern)) for tag in header_tags: @@ -561,9 +539,7 @@ class HTMLDocxPreprocessor: self.top_level_headers[i]['should_be_numbered'] = True def _process_headings(self): - """ - Function to process tags . - """ + """Function to process tags .""" header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) # 1. remove , @@ -634,9 +610,7 @@ class HTMLDocxPreprocessor: il_tag.p.unwrap() def process_html(self, access, html_path, book_id): - """ - Process html code to satisfy LiveCarta formatting. 
- """ + """Process html code to satisfy LiveCarta formatting.""" try: self.logger_object.log(f'Processing TOC and headers.') self._process_toc_links() diff --git a/src/docx_converter/libra_html2json_converter.py b/src/docx_converter/libra_html2json_converter.py index cafbb39..9366b60 100644 --- a/src/docx_converter/libra_html2json_converter.py +++ b/src/docx_converter/libra_html2json_converter.py @@ -90,9 +90,7 @@ class LibraHTML2JSONConverter: return True def convert_to_dict(self): - """ - Function which convert list of html nodes to appropriate json structure. - """ + """Function which convert list of html nodes to appropriate json structure.""" json_strc = [] ind = 0 ch_num = 0 diff --git a/src/epub_converter/css_reader.py b/src/epub_converter/css_reader.py index 20363f6..44776ea 100644 --- a/src/epub_converter/css_reader.py +++ b/src/epub_converter/css_reader.py @@ -11,9 +11,9 @@ from itertools import takewhile from src.util.color_reader import str2hex from src.livecarta_config import LiveCartaConfig - cssutils.log.setLevel(CRITICAL) + sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] @@ -29,6 +29,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal', def convert_font_size(value): + """ Function converts font-size in mapping """ if 'pt' in value: if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: return '' @@ -58,6 +59,7 @@ def convert_font_size(value): def convert_indents(value): + """ Function converts text-indent and margin-left values to px """ # 30px = 3.2% = 1.25em = 23pt text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)') has_style_attrs = re.search(text_indent_regexp, value) @@ -115,13 +117,6 @@ LIVECARTA_STYLE_ATTRS = { 'margin-left': [] } -""" -LIVECARTA_STYLE_ATTRS_MAPPING = { property: 
mapping function } - -Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated -to suit livecarta style convention. -""" - def get_bg_color(x): color = str2hex(x) @@ -135,6 +130,12 @@ def get_text_color(x): return color +""" +LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } + +Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated +to suit livecarta style convention. +""" LIVECARTA_STYLE_ATTRS_MAPPING = { 'text-indent': convert_indents, 'font-variant': lambda x: x, @@ -178,8 +179,10 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { def check_style_to_be_tag(style) -> List[tuple]: - """ Some css style properties converts to tags. - Search for them and prepare list of properties to be removed from style string""" + """ + Some css style properties converts to tags. + Search for them and prepare list of properties to be removed from style string + """ to_remove = [] for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: if f'{k[0]}:{k[1]}' in style: @@ -208,6 +211,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type): def build_css_content(css_content): + """ Build css content with livecarta convention """ sheet = cssutils.parseString(css_content, validate=False) for css_rule in sheet: @@ -231,6 +235,7 @@ class TagStyleConverter: @staticmethod def remove_white_if_no_bgcolor(style_, tag): + """ Function remove white color if there is no text bg color """ if 'background' in style_: return style_ @@ -260,8 +265,7 @@ class TagStyleConverter: @staticmethod def process_indents_to_px(split_style: list) -> str: - # clean with convert_indents() style string and make new clean_style - + """ Function cleans using convert_indents() style string and returns new clean_style """ clean_style = '' for item in split_style: item = item.split(':') @@ -276,7 +280,7 @@ class TagStyleConverter: has_margin_left = re.search(margin_left_regexp, clean_style) has_text_indent = 
re.search(text_indent_regexp, clean_style) - #formula_of_indent: indent = abs(margin_left - text_indent) + # formula_of_indent: indent = abs(margin_left - text_indent) if has_margin_left: num_ml = abs(int("".join( filter(str.isdigit, str(has_margin_left.group(2)))))) @@ -302,6 +306,7 @@ class TagStyleConverter: def preprocess_style(self): def remove_extra_spaces(style: str) -> List: + """ Function to remove extra spaces in style to process clean_style """ # replace all spaces between '; & letter' to ';' style = re.sub(r"; *", ";", style) split_style = style.split(';') @@ -381,7 +386,7 @@ class TagStyleConverter: @staticmethod def wrap_span_in_p_to_save_style_attrs(tag): - '''Function designed to save style attrs that cannot be in p -> span''' + """ Function designed to save style attrs that cannot be in p -> span """ if tag.name == 'p' and tag.attrs.get('style'): styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] @@ -414,6 +419,7 @@ class TagStyleConverter: @staticmethod def wrap_span_in_li_to_save_style_attrs(tag): + """ Function designed to save style attrs that cannot be in li -> span """ if tag.name == 'li' and tag.attrs.get('style'): styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['text-align', 'list-style-type']] @@ -441,6 +447,7 @@ class TagStyleConverter: @staticmethod def wrap_span_in_ul_ol_to_save_style_attrs(tag): + """ Function designed to save style attrs that cannot be in ul/ol -> span """ if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): styles_cant_be_in_ul_ol = [ attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] @@ -465,6 +472,7 @@ class TagStyleConverter: @staticmethod def wrap_span_in_h_to_save_style_attrs(tag): + """ Function designed to save style attrs that cannot be in h -> span """ h_regexp = re.compile('(^h[1-9]$)') if re.search(h_regexp, tag.name) and tag.attrs.get('style'): @@ -487,6 +495,7 
@@ class TagStyleConverter: def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): + """ Function adds styles from .css to inline style """ css_text = css_text.replace( '@namespace epub "http://www.idpf.org/2007/ops";', '') livecarta_tmp_ids = [] diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 4dd8dd1..c9e3bbd 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -20,7 +20,7 @@ from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \ - update_src_links_in_images, preprocess_footnotes + update_images_src_links, preprocess_footnotes class EpubConverter: @@ -48,7 +48,7 @@ class EpubConverter: # flag to be updated while ebooklib.toc is parsed self.id_anchor_exist_in_nav_points = False self.img_href2img_bytes = {} # file path to bytes - self.old_image_path2aws_path = {} # file path from to generated aws path + self.book_image_src_path2aws_path = {} # file path from to generated aws path self.footnotes_contents: List[str] = [] # to be sent on server as is self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote @@ -124,12 +124,12 @@ class EpubConverter: return css_content def build_html_and_css_relations(self): - ''' + """ This function is designed to get 2 dictionaries: The first is css_href2css_content. It is created to connect href of css to content of css The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html ...2... 
= key2value - ''' + """ # dictionary: href of html to related css files html_href2css_href: defaultdict = defaultdict(list) @@ -159,10 +159,10 @@ class EpubConverter: return html_href2css_href, css_href2css_content, def add_css_styles_to_html_soup(self): - ''' + """ This function is designed to update html_href2html_body_soup And add to html_inline_style css_style_content - ''' + """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): css = '' @@ -179,6 +179,7 @@ class EpubConverter: return links + # t_nodes = [] def build_adjacency_list_from_toc(self, element, lvl=0): """ self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc @@ -211,25 +212,31 @@ class EpubConverter: sub_nodes = [] for i in second: + # if 'chapter' in (i.title.lower() if isinstance(i, Link) else i[0].title.lower()): + # self.t_nodes.append(self.build_adjacency_list_from_toc(i, lvl)) + # else: sub_nodes.append( self.build_adjacency_list_from_toc(i, lvl + 1)) - self.adjacency_list[nav_point] = sub_nodes self.hrefs_added_to_toc.add(nav_point.href) return nav_point elif isinstance(element, list) and (lvl == 0): - sub_nodes = [] + nodes = [] for i in element: - sub_nodes.append( + nodes.append( self.build_adjacency_list_from_toc(i, lvl + 1)) - - self.adjacency_list[-1] = sub_nodes + # for j in self.t_nodes: + # nodes.append(j) + # self.t_nodes = [] + # + # self.adjacency_list[-1] = nodes else: assert 0, f'Error. 
Element is not tuple/Link/list instance: {type(element)}' def is_toc_empty(self): + """ Function checks is toc empty """ # there is no toc in ebook or no top chapters if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): return True @@ -247,6 +254,7 @@ class EpubConverter: self.hrefs_added_to_toc.add(nav_point.href) def add_not_added_files_to_adjacency_list(self, not_added): + """ Function add files that not added to adjacency list """ for i, file in enumerate(not_added): nav_point = NavPoint( Section(f'To check #{i}, filename: {file}', file)) @@ -315,6 +323,11 @@ class EpubConverter: return full_path[0] def process_internal_links(self): + """ + Function + - processing internal links in a book + - make ids unique + """ # 1. rebuild ids to be unique in all documents for toc_href in self.hrefs_added_to_toc: for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): @@ -429,6 +442,7 @@ class EpubConverter: self.build_one_chapter(sub_node) def define_chapters_content(self): + """ Function build chapters content starts from top level chapters """ top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: for point in top_level_nav_points: @@ -441,12 +455,12 @@ class EpubConverter: nav_point.href, nav_point.id)] else: content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] - self.old_image_path2aws_path = update_src_links_in_images(content, - self.img_href2img_bytes, - path_to_html=nav_point.href, - access=self.access, - path2aws_path=self.old_image_path2aws_path, - book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id') + self.book_image_src_path2aws_path = update_images_src_links(content, + self.img_href2img_bytes, + path_to_html=nav_point.href, + access=self.access, + path2aws_path=self.book_image_src_path2aws_path, + book_id=self.file.stem if hasattr(self.file, self.file.stem) else 'book_id') is_chapter = lvl <= 
LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed = prepare_title(title) @@ -466,6 +480,7 @@ class EpubConverter: return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self): + """ Function which convert list of html nodes to appropriate json structure. """ top_level_nav_points = self.adjacency_list[-1] top_level_chapters = [] @@ -491,7 +506,7 @@ if __name__ == "__main__": logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) - json_converter = EpubConverter('../../epub/9781641051217.epub', + json_converter = EpubConverter('../../epub/9781614382263.epub', logger=logger_object) tmp = json_converter.convert_to_dict() diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index f38e113..8defe7a 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -2,12 +2,17 @@ from src.book_solver import BookSolver from src.epub_converter.epub_converter import EpubConverter class EpubBook(BookSolver): + """ Class of .epub type book - child of BookSolver """ def __init__(self, book_id=0, access=None, main_logger=None): super().__init__(book_id, access, main_logger) self.book_type = 'epub' def get_converted_book(self): + """ + 1. Convert epub to html + 2. 
Parse from line structure to nested structure + """ json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object) content_dict = json_converter.convert_to_dict() self.status_wrapper.set_generating() diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 15e026a..8743306 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -10,6 +10,7 @@ from src.livecarta_config import LiveCartaConfig def save_image_locally(img_file_path, img_content, book_id): + """ Function saves all images locally """ folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( folder_path, f'../json/img_{book_id}/')) @@ -24,17 +25,19 @@ def save_image_locally(img_file_path, img_content, book_id): def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): - link = access.send_image( + """ Function saves all images to Amazon web service """ + link_path = access.send_image( img_file_path, doc_id=book_id, img_content=img_content) - return link + return link_path -def update_src_links_in_images(body_tag: Tag, - href2img_content: dict, - path_to_html, - access=None, - path2aws_path=None, - book_id=None): +def update_images_src_links(body_tag: Tag, + href2img_content: dict, + path_to_html, + access=None, + path2aws_path=None, + book_id=None): + """ Function makes dictionary image_src_path -> Amazon web service_path """ img_tags = body_tag.find_all('img') for img in img_tags: @@ -65,16 +68,16 @@ def update_src_links_in_images(body_tag: Tag, del img.attrs['height'] if img.attrs.get('style'): del img.attrs['style'] - return path2aws_path def preprocess_table(body_tag: BeautifulSoup): + """ Function to preprocess tables and tags(td|th|tr): style """ tables = body_tag.find_all("table") for table in tables: - tds = table.find_all(re.compile("td|th|tr")) - for td in tds: - style = 
td.get('style') + ts = table.find_all(re.compile("td|th|tr")) + for t_tag in ts: + style = t_tag.get('style') width = '' if style: width_match = re.search( @@ -84,13 +87,13 @@ def preprocess_table(body_tag: BeautifulSoup): units = width_match.group(2) width = size+'px' - td.attrs['width'] = td.get('width') or width + t_tag.attrs['width'] = t_tag.get('width') or width - if td.attrs.get('style'): - td.attrs['style'] = td.attrs['style'].replace('border:0;', '') + if t_tag.attrs.get('style'): + t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '') - if td.attrs.get('style') == '': - del td.attrs['style'] + elif t_tag.attrs.get('style') == '': + del t_tag.attrs['style'] if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']: table.attrs['border'] = '1' @@ -110,6 +113,7 @@ def process_lists(body_tag): def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): + """ Function inserts span before tag to be removed(aren't supported by livecarta) """ new_tag = main_tag.new_tag("span") new_tag.attrs['id'] = id_ or '' new_tag.attrs['class'] = class_ or '' @@ -153,9 +157,7 @@ def clean_headings_content(content: Tag, title: str): def heading_tag_to_p_tag(body_tag): - """ - Function to convert all lower level headings to p tags - """ + """ Function to convert all lower level headings to p tags """ pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = body_tag.find_all(re.compile(pattern)) for tag in header_tags: @@ -163,17 +165,16 @@ def heading_tag_to_p_tag(body_tag): def clean_title_from_numbering(title: str): - """ - Function to remove digits from headers. - """ + """ Function removes numbering from titles """ title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title - title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) + title = re.sub(r'^(?:[A-Za-z]\. 
?)+', '', title) # delete chapter I, (ABC) from the title return title def replace_with_livecarta_anchor_tag(anchor, i): + """ Function replace noteref_tag(anchor) with new livecarta tag """ new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag['class'] = 'footnote-element' new_tag['data-id'] = i + 1 @@ -188,11 +189,11 @@ def replace_with_livecarta_anchor_tag(anchor, i): def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \ -> Tuple[list, list, list]: """ + This function preprocessing footnotes This function should be earlier that adding fonts in pipeline.

Here is an example footnote1

- """ footnotes = [] noterefs_tags = source_html_tag.find_all( @@ -205,12 +206,14 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note new_footnotes_tags = [] [tag.decompose() for tag in bad_noterefs_tags] - def parse_a_tag_href(s: str): + def parse_a_tag_href(s: str) -> Tuple[str, str]: + """ Returns name of file & id of an anchor """ assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' f, id_ = s.split('#') return f, id_ def verify_footnote_tag(tags: list): + """ Function verifies is tag - footnote """ assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' if len(tags) == 0: anchored_tags = list(target_html_tag.find_all(id=element_id)) @@ -275,7 +278,7 @@ def unwrap_structural_tags(body_tag): """ def _preserve_class_in_aside_tag(tag_): - # to save css style inherited from class, copy class to aside tag (which is parent to tag_) + """ to save css style inherited from class, copy class to aside tag (which is parent to tag_) """ # this is for Wiley books with boxes tag_class = tag_.attrs['class'] if not isinstance( tag_.attrs['class'], list) else tag_.attrs['class'][0] @@ -284,10 +287,11 @@ def unwrap_structural_tags(body_tag): tag_.parent.attrs['class'] = tag_class def preserve_class_in_section_tag(tag_) -> bool: - # to save css style inherited from class, copy class to child

+ """ + to save css style inherited from class, copy class to child

+ returns True, if

could be unwrapped + """ # this is for Wiley books with boxes - # returns True, if
could be unwrapped - tag_class = tag_.attrs['class'] if not isinstance( tag_.attrs['class'], list) else tag_.attrs['class'][0] if 'feature' not in tag_class: @@ -312,6 +316,10 @@ def unwrap_structural_tags(body_tag): class_=tag_to_be_removed.attrs.get('class')) def replace_div_tag_with_table(): + """Function replace
with : + 1. Convert div with certain classes to tables + 2. Add background color to div with background-color + """ for div in body_tag.find_all("div"): if div.attrs.get('class'): div_class = div.attrs['class'] if not isinstance( @@ -348,12 +356,12 @@ def unwrap_structural_tags(body_tag): continue add_span_to_save_ids_for_links(div) div.unwrap() + # comments removal for tag in body_tag.find_all(): for element in tag(text=lambda text: isinstance(text, Comment)): element.extract() - replace_div_tag_with_table() for s in body_tag.find_all("section"): @@ -458,23 +466,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup): return tags -def wrap_preformatted_span_with_table(main_tag, old_tag): - table = main_tag.new_tag("table") - table.attrs['border'] = '1px #ccc;' - table.attrs['style'] = 'width:100%;' - tbody = main_tag.new_tag("tbody") - tr = main_tag.new_tag("tr") - td = main_tag.new_tag("td") - td.attrs['bgcolor'] = '#f5f5f5' - # td.attrs['border-radius'] = '4px' - old_tag.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - return table - - def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): + """ Function wraps with
""" table = main_tag.new_tag("table") table.attrs['border'] = border table.attrs['align'] = 'center' @@ -497,7 +490,6 @@ def clean_wiley_block(block): hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) for hr in hrs: hr.extract() - print(hr) h = block.find(re.compile("h[1-9]")) if h: h.name = "p" @@ -505,6 +497,7 @@ def clean_wiley_block(block): def preprocess_block_tags(chapter_tag): + """ Function preprocessing tags """ for block in chapter_tag.find_all("blockquote"): if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: clean_wiley_block(block) @@ -527,7 +520,7 @@ def preprocess_block_tags(chapter_tag): def prepare_formatted(text): - # replace <,> to save them as is in html code + """ Function replaces special symbols with their Unicode representation """ text = text.replace("<", "\x3C") text = text.replace(">", "\x3E") text = text.replace('\t', "\xa0 \xa0 ") #     @@ -536,7 +529,25 @@ def prepare_formatted(text): return text +def wrap_preformatted_span_with_table(main_tag, old_tag): + """ Function wraps with
""" + table = main_tag.new_tag("table") + table.attrs['border'] = '1px #ccc;' + table.attrs['style'] = 'width:100%;' + tbody = main_tag.new_tag("tbody") + tr = main_tag.new_tag("tr") + td = main_tag.new_tag("td") + td.attrs['bgcolor'] = '#f5f5f5' + # td.attrs['border-radius'] = '4px' + old_tag.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + return table + + def preprocess_pre_tags(chapter_tag): + """ Function preprocessing
 tags """
     for pre in chapter_tag.find_all("pre"):
         new_tag = BeautifulSoup(features='lxml').new_tag("span")
         new_tag.attrs = pre.attrs.copy()
@@ -575,7 +586,7 @@ def preprocess_pre_tags(chapter_tag):
 
 
 def preprocess_code_tags(chapter_tag):
-    # function that emulates style of , , 
+    """ Function that emulates the style of <code>, <kdb>, <var> tags """
     for code in chapter_tag.find_all(re.compile("code|kdb|var")):
         code.name = 'span'
         if code.parent.name == "pre":
@@ -584,9 +595,7 @@ def preprocess_code_tags(chapter_tag):
 
 
 def prepare_title(title_of_chapter: str) -> str:
-    """
-    Final processing/cleaning function.
-    """
+    """ Function finalises processing/cleaning of the title """
     title_str = BeautifulSoup(title_of_chapter, features='lxml').string
     title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
     title_str = re.sub(r' +', ' ', title_str).rstrip()
@@ -596,7 +605,11 @@ def prepare_title(title_of_chapter: str) -> str:
 
 def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
     """
-    Final processing/cleaning function.
+    Function finalises processing/cleaning of the content
+    1. cleaning \n
+    2. heading removal
+    3. processing tags
+    4. class removal
     """
     # 0. cleaning \n
     to_remove = []
@@ -609,13 +622,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
     # 1. heading removal
     if remove_title_from_chapter:
         clean_headings_content(content_tag, title_str)
+
+    # 2. processing tags (
  • ,
  • , ,
    , )
         process_lists(content_tag)
         preprocess_table(content_tag)
         preprocess_code_tags(content_tag)
         preprocess_pre_tags(content_tag)
         preprocess_block_tags(content_tag)
     
    -    # 2. class removal
    +    # 3. class removal
         for tag in content_tag.find_all(recursive=True):
             if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
                                                                                                     'footnote-element']):
    diff --git a/src/livecarta_config.py b/src/livecarta_config.py
    index 694befd..21e7db1 100644
    --- a/src/livecarta_config.py
    +++ b/src/livecarta_config.py
    @@ -1,5 +1,5 @@
    -
     class LiveCartaConfig:
    +    """Class of values that the LiveCarta platform uses and supports"""
         SUPPORTED_LEVELS = 5
         SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
         HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}
    diff --git a/src/util/color_reader.py b/src/util/color_reader.py
    index 8da83a7..08db998 100644
    --- a/src/util/color_reader.py
    +++ b/src/util/color_reader.py
    @@ -6,6 +6,7 @@ from webcolors import html4_hex_to_names, hex_to_rgb, rgb_to_name, rgb_percent_t
     
     
     def closest_colour_rgb(requested_color):
    +    """ Function finds the closest colour rgb """
         min_colours = {}
         for key, name in html4_hex_to_names.items():
             r_c, g_c, b_c = hex_to_rgb(key)
    @@ -18,6 +19,7 @@ def closest_colour_rgb(requested_color):
     
     
     def rgb2color_name(color):
    +    """ Transform rgb -> color name """
         try:
             closest_name = actual_name = rgb_to_name(color, 'html4')
         except ValueError:
    @@ -30,6 +32,7 @@ def rgb2color_name(color):
     
     
     def hex2color_name(color):
    +    """ Transform hex -> color name """
         try:
             color = hex_to_rgb(color)
         except ValueError:
    @@ -47,6 +50,7 @@ def hex2color_name(color):
     
     
     def str2closest_html_color_name(s: str):
    +    """ Transform str -> closest color name """
         if 'rgb' in s:
             rgb_str = 'rgba' if ('rgba' in s) else 'rgb'
             s = s.replace(rgb_str, '').replace('(', '').replace(')', '')
    @@ -80,6 +84,7 @@ def str2closest_html_color_name(s: str):
     
     
     def rgba2rgb(r, g, b, alpha):
    +    """ Transform rgba -> rgb """
         r_background, g_background, b_background = 255, 255, 255
         r_new = int((1 - alpha) * r_background + alpha * r)
         g_new = int((1 - alpha) * g_background + alpha * g)
    @@ -88,6 +93,7 @@ def rgba2rgb(r, g, b, alpha):
     
     
     def str2hex(s: str):
    +    """ Transform str -> hex """
         if '#' in s and (len(s) <= 7):
             return s.lower()
     
    diff --git a/src/util/helpers.py b/src/util/helpers.py
    index 6be1c8b..13b2099 100644
    --- a/src/util/helpers.py
    +++ b/src/util/helpers.py
    @@ -3,6 +3,7 @@ import logging
     
     
     class ColoredFormatter(logging.Formatter):
    +    """ Class to prettify logger and command line output """
         MAPPING = {
             'DEBUG': 37,  # white
             'INFO': 36,  # cyan
    @@ -61,9 +62,7 @@ class BookLogger:
             self.logger.log(msg=message, level=logging_level, stacklevel=2)
     
         def log_error_to_main_log(self, message=''):
    -        """
    -        Method for logging error to main log file.
    -        """
    +        """ Method for logging error to main log file. """
             if self.main_logger:
                 if not message:
                     message = f'Error in book conversion. Check log file.'
    @@ -71,6 +70,8 @@ class BookLogger:
     
     
     class BookStatusWrapper:
    +    """Class that sets/updates statuses of the Converter on the Platform"""
    +
         def __init__(self, access, logger_object, book_id=0):
             self.access = access
             self.logger_object = logger_object