diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index e9683f4..80d96a3 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -222,7 +222,6 @@ class HTMLDocxPreprocessor: def _process_tables(self): """Function to process tables. Set "border" attribute.""" - tables = self.body_tag.find_all("table") for table in tables: tds = table.find_all("td") diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index 2212bd5..11e4a16 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -11,13 +11,13 @@ from src.livecarta_config import LiveCartaConfig def get_text_color(x): color = str2hex(x) - color = color if color not in ['#000000', '#000', 'black'] else '' + color = color if color not in ["#000000", "#000", "black"] else "" return color def get_bg_color(x): color = str2hex(x) - color = color if color not in ['#ffffff', '#fff', 'white'] else '' + color = color if color not in ["#ffffff", "#fff", "white"] else "" return color @@ -43,25 +43,25 @@ def convert_tag_style_values(size_value: str) -> str: return LiveCartaConfig.sizes_px[last_possible_size_index] font_size_regexp = re.compile( - r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') + r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") has_style_attrs = re.search(font_size_regexp, size_value) if has_style_attrs: if has_style_attrs.group(1): - size_value = float(size_value.replace('%', '')) / 100.0 + size_value = float(size_value.replace("%", "")) / 100.0 return find_closest_size(size_value) elif has_style_attrs.group(3): - size_value = float(size_value.replace('em', '')) + size_value = float(size_value.replace("em", "")) return find_closest_size(size_value) elif has_style_attrs.group(5): - return size_value.replace('pt', 'px') + return size_value.replace("pt", "px") else: - return '' + return "" return size_value def convert_indents_tag_values(size_value: str) -> str: """ - Function converts values of ['text-indent', 'margin-left', 'margin'] + Function converts values of ["text-indent", "margin-left", "margin"] Parameters ---------- size_value: str @@ -71,12 +71,12 @@ def convert_indents_tag_values(size_value: str) -> str: size_value: str """ - if len(size_value.split(' ')) == 3: + if len(size_value.split(" ")) == 3: size_value = convert_tag_style_values(size_value.split( - ' ')[-2]) # returns middle value + " ")[-2]) # returns middle value else: size_value = convert_tag_style_values(size_value.split( - ' ')[-1]) # returns last value + " ")[-1]) # returns last value return size_value @@ -87,35 +87,35 @@ If property has empty list, it means that any value can be converted. If property has not empty list, it means that only certain property-value combinations can be transformed. """ LIVECARTA_STYLE_ATTRS = { - 'text-indent': [], - 'font-variant': ['small-caps'], - 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - 'align': [], - 'font': [], - 'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() + "text-indent": [], + "font-variant": ["small-caps"], + "text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], + "align": [], + "font": [], + "font-family": [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() if x != LiveCartaConfig.DEFAULT_FONT_NAME], - 'font-size': [], - 'font-weight': ['bold', '600', '700', '800', '900'], # - 'font-style': ['italic'], # - 'text-decoration': ['underline', 'line-through'], # , - 'text-decoration-line': ['underline', 'line-through'], # , - 'vertical-align': ['super'], # - 'color': [], - 'background-color': [], - 'background': [], - 'width': [], - 'border': [], - 'border-top-width': [], - 'border-right-width': [], - 'border-left-width': [], - 'border-bottom-width': [], - 'border-top': [], - 'border-bottom': [], - 'list-style-type': [], - 'list-style-image': [], - 'margin-left': [], - 'margin-top': [], - 'margin': [], + "font-size": [], + "font-weight": ["bold", "600", "700", "800", "900"], # + "font-style": ["italic"], # + "text-decoration": ["underline", "line-through"], # , + "text-decoration-line": ["underline", "line-through"], # , + "vertical-align": ["super"], # + "color": [], + "background-color": [], + "background": [], + "width": [], + "border": [], + "border-top-width": [], + "border-right-width": [], + "border-left-width": [], + "border-bottom-width": [], + "border-top": [], + "border-bottom": [], + "list-style-type": [], + "list-style-image": [], + "margin-left": [], + "margin-top": [], + "margin": [], } """ @@ -125,28 +125,28 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou to suit livecarta style convention. """ LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': convert_indents_tag_values, - 'font-variant': lambda x: x, - 'text-align': lambda x: x, - 'font': lambda x: '', - 'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) + "text-indent": convert_indents_tag_values, + "font-variant": lambda x: x, + "text-align": lambda x: x, + "font": lambda x: "", + "font-family": lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)), - 'font-size': convert_tag_style_values, - 'color': get_text_color, - 'background-color': get_bg_color, - 'background': get_bg_color, - 'border': lambda x: x if x != '0' else '', - 'border-top-width': lambda x: x if x != '0' else '', - 'border-right-width': lambda x: x if x != '0' else '', - 'border-left-width': lambda x: x if x != '0' else '', - 'border-bottom-width': lambda x: x if x != '0' else '', - 'border-top': lambda x: x if x != '0' else '', - 'border-bottom': lambda x: x if x != '0' else '', - 'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc', - 'list-style-image': lambda x: 'disc', - 'margin-left': convert_indents_tag_values, - 'margin-top': convert_tag_style_values, - 'margin': convert_indents_tag_values + "font-size": convert_tag_style_values, + "color": get_text_color, + "background-color": get_bg_color, + "background": get_bg_color, + "border": lambda x: x if x != "0" else "", + "border-top-width": lambda x: x if x != "0" else "", + "border-right-width": lambda x: x if x != "0" else "", + "border-left-width": lambda x: x if x != "0" else "", + "border-bottom-width": lambda x: x if x != "0" else "", + "border-top": lambda x: x if x != "0" else "", + "border-bottom": lambda x: x if x != "0" else "", + "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", + "list-style-image": lambda x: "disc", + "margin-left": convert_indents_tag_values, + "margin-top": convert_tag_style_values, + "margin": convert_indents_tag_values } @@ -155,17 +155,17 @@ def update_inline_styles_to_livecarta_convention(split_style: list): style_name, style_value = style.split(":") if style_name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = '' + split_style[i] = "" return split_style - cleaned_value = style_value.replace('\"', '').split()[-1] + cleaned_value = style_value.replace("\"", "").split()[-1] constraints_on_value = LIVECARTA_STYLE_ATTRS.get( style_name) value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ style_name] if constraints_on_value and value_not_in_possible_values_list: # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = '' + split_style[i] = "" else: if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data @@ -177,14 +177,14 @@ def update_inline_styles_to_livecarta_convention(split_style: list): def build_inline_style_content(style: str) -> str: """Build inline style with livecarta convention""" - # replace all spaces between '; & letter' to ';' + # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", style) - # when we split style by ';', last element of the list is '' - None + # when we split style by ";", last element of the list is "" - None # remove it - split_style: list = list(filter(None, style.split(';'))) - # replace all spaces between ': & letter' to ':' + split_style: list = list(filter(None, style.split(";"))) + # replace all spaces between ": & letter" to ":" split_style = [el.replace( - re.search(r'(:\s*)', el).group(1), ':') for el in split_style] + re.search(r"(:\s*)", el).group(1), ":") for el in split_style] split_style = update_inline_styles_to_livecarta_convention(split_style) style = "; ".join(split_style) @@ -195,17 +195,17 @@ def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRul style_type: cssutils.css.property.Property): if style_type.name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' + css_rule.style[style_type.name] = "" return - cleaned_value = style_type.value.replace('\"', '') + cleaned_value = style_type.value.replace("\"", "") constraints_on_value = LIVECARTA_STYLE_ATTRS.get( style_type.name) value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ style_type.name] if constraints_on_value and value_not_in_possible_values_list: # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' + css_rule.style[style_type.name] = "" else: if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data @@ -227,12 +227,12 @@ def build_css_file_content(css_content: str) -> str: return css_text -if __name__ == '__main__': - file = '../../epub/9781627222174.epub' +if __name__ == "__main__": + file = "../../epub/9781627222174.epub" ebooklib_book = epub.read_epub(file) - css_ = ebooklib_book.get_item_with_href('css/epub.css') + css_ = ebooklib_book.get_item_with_href("css/epub.css") css_ = css_.get_content().decode() css_cleaned = build_css_file_content(css_) html_ = ebooklib_book.get_item_with_href( - 'pr01s05.xhtml').get_body_content().decode() - html_soup = BeautifulSoup(html_, features='lxml') + "pr01s05.xhtml").get_body_content().decode() + html_soup = BeautifulSoup(html_, features="lxml") diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index dc8d3a2..57f2904 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -17,10 +17,12 @@ from bs4 import BeautifulSoup, Tag from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint +from src.epub_converter.image_processing import update_images_src_links +from src.epub_converter.footnotes_processing import preprocess_footnotes from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ - prepare_title, prepare_content, update_images_src_links, preprocess_footnotes +from src.epub_converter.html_epub_preprocessor import process_structural_tags, get_tags_between_chapter_marks,\ + prepare_title, prepare_content class EpubConverter: @@ -57,26 +59,27 @@ class EpubConverter: self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote - self.logger.log('Image processing.') + self.logger.log("Image processing.") for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): file_name = x.file_name content = x.content self.img_href2img_bytes[file_name] = content - self.logger.log('HTML files reading.') + self.logger.log("HTML files reading.") self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() - # TODO Presets - self.logger.log('Process CSS inline styles.') + self.logger.log("Process CSS inline styles.") self.process_inline_styles_in_html_soup() - self.logger.log('CSS files processing.') + self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log('CSS styles adding.') + self.logger.log("CSS styles adding.") self.add_css_styles_to_html_soup() - self.logger.log('Footnotes processing.') + # todo presets + + self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) @@ -85,27 +88,28 @@ class EpubConverter: self.footnotes.extend(footnotes_tags) for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): - noteref.attrs['data-id'] = i + 1 - noteref.attrs['id'] = f'footnote-{i + 1}' - footnote.attrs['href'] = f'#footnote-{i + 1}' + noteref.attrs["data-id"] = i + 1 + noteref.attrs["id"] = f"footnote-{i + 1}" + footnote.attrs["href"] = f"#footnote-{i + 1}" - self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.') - self.logger.log('TOC processing.') + self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") + self.logger.log("TOC processing.") self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed if self.is_toc_empty(): self.build_adjacency_list_from_spine() not_added = [ x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] - self.logger.log(f'Html documents not added to TOC: {not_added}.') + self.logger.log(f"Html documents not added to TOC: {not_added}.") self.add_not_added_files_to_adjacency_list(not_added) - self.logger.log(f'Html internal links and structure processing.') - self.label_chapters_ids_with_tmp_id() + self.logger.log(f"Html internal links and structure processing.") + self.label_chapters_ids_with_lc_id() # used only after parsed toc, ids from toc needed self.process_html_soup_structure_to_line() self.process_internal_links() - self.logger.log(f'Building chapters content.') + self.logger.log(f"Define chapters content.") self.define_chapters_content() + self.logger.log(f"Converting html_nodes to LiveCarta chapter items.") def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: # using EpubElements @@ -115,7 +119,7 @@ class EpubConverter: for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_body_text = item.get_body_content() # html.parser closes tags if needed - soup = BeautifulSoup(html_body_text, features='html.parser') + soup = BeautifulSoup(html_body_text, features="html.parser") nodes[item.file_name] = soup return nodes @@ -123,15 +127,15 @@ class EpubConverter: path_to_css_from_html = css_href html_folder = dirname(html_href) path_to_css_from_root = normpath( - join(html_folder, path_to_css_from_html)).replace('\\', '/') + join(html_folder, path_to_css_from_html)).replace("\\", "/") css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) # if in css file we import another css if "@import" in str(css_obj.content): path_to_css_from_root = "css/" + \ - re.search('"(.*)"', str(css_obj.content)).group(1) + re.search("'(.*)'", str(css_obj.content)).group(1) css_obj = self.ebooklib_book.get_item_with_href( path_to_css_from_root) - assert css_obj, f'Css style {css_href} was not in manifest.' + assert css_obj, f"Css style {css_href} was not in manifest." css_content: str = css_obj.get_content().decode() return css_content @@ -140,11 +144,11 @@ class EpubConverter: for html_href in self.html_href2html_body_soup: html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={'style': re.compile('.*')}) + attrs={"style": re.compile(".*")}) for tag_initial_inline_style in tags_with_inline_style: - inline_style = tag_initial_inline_style.attrs['style'] - tag_initial_inline_style.attrs['style'] = \ + inline_style = tag_initial_inline_style.attrs["style"] + tag_initial_inline_style.attrs["style"] = \ build_inline_style_content(inline_style) def build_html_and_css_relations(self) -> tuple[dict, dict]: @@ -167,23 +171,23 @@ class EpubConverter: for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_content = item.content html_href = item.file_name - soup_html_content = BeautifulSoup(html_content, features='lxml') + soup_html_content = BeautifulSoup(html_content, features="lxml") # check if file links to css file - for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): + for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}): # alternate page of original page (e.g. another language) - if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): + if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]): continue - css_href = tag.attrs.get('href') + css_href = tag.attrs.get("href") html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict css_href2css_content[css_href] = build_css_file_content( self.get_css_content(css_href, html_href)) - for i, tag in enumerate(soup_html_content.find_all('style')): + for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string - html_href2css_href[html_href].append(f'href{i}') - css_href2css_content[f'href{i}'] = build_css_file_content( + html_href2css_href[html_href].append(f"href{i}") + css_href2css_content[f"href{i}"] = build_css_file_content( css_content) return html_href2css_href, css_href2css_content @@ -195,7 +199,7 @@ class EpubConverter: """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): - css = '' + css = "" for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] @@ -243,7 +247,7 @@ class EpubConverter: sub_nodes = [] for elem in second: - if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: + if ("section" in first.title.lower() or "part" in first.title.lower()) and lvl == 1: self.offset_sub_nodes.append( self.build_adjacency_list_from_toc(elem, lvl)) else: @@ -267,7 +271,7 @@ class EpubConverter: self.adjacency_list[-1] = nodes else: - assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' + assert 0, f"Error. Element is not tuple/Link/list instance: {type(element)}" def is_toc_empty(self) -> bool: """Function checks is toc empty""" @@ -297,36 +301,36 @@ class EpubConverter: """Function add files that not added to adjacency list""" for i, file in enumerate(not_added): nav_point = NavPoint( - Section(f'To check #{i}, filename: {file}', file)) + Section(f"To check #{i}, filename: {file}", file)) self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(file) - def label_chapters_ids_with_tmp_id(self): + def label_chapters_ids_with_lc_id(self): for html_href in self.html_href2html_body_soup: ids = self.html_href2subchapter_ids[html_href] for i in ids: soup = self.html_href2html_body_soup[html_href] tag = soup.find(id=i) - new_h = soup.new_tag('tmp') - new_h.attrs['class'] = 'converter-chapter-mark' - new_h.attrs['id'] = i + new_h = soup.new_tag("tmp") + new_h.attrs["class"] = "converter-chapter-mark" + new_h.attrs["id"] = i tag.insert_before(new_h) def process_html_soup_structure_to_line(self): # go to line structure for html_href in self.html_href2html_body_soup: soup = self.html_href2html_body_soup[html_href] - self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) + self.html_href2html_body_soup[html_href] = process_structural_tags(soup) @staticmethod def create_unique_id(href, id_): - return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) + return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_) @staticmethod def create_new_anchor_span(soup, id_): new_anchor_span = soup.new_tag("span") - new_anchor_span.attrs['id'] = id_ - new_anchor_span.attrs['class'] = 'link-anchor' + new_anchor_span.attrs["id"] = id_ + new_anchor_span.attrs["class"] = "link-anchor" new_anchor_span.string = "\xa0" return new_anchor_span @@ -353,18 +357,18 @@ class EpubConverter: """ dir_name = os.path.dirname(cur_file_path) normed_path = os.path.normpath(os.path.join( - dir_name, href_in_link)).replace('\\', '/') + dir_name, href_in_link)).replace("\\", "/") full_path = [ path for path in self.hrefs_added_to_toc if normed_path in path] if not full_path: - self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' - f'While processing href in {internal_link_tag}.') - internal_link_tag.attrs['converter-mark'] = 'bad-link' + self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. " + f"While processing href in {internal_link_tag}.") + internal_link_tag.attrs["converter-mark"] = "bad-link" return None if len(full_path) > 1: - self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' - f' while {internal_link_tag} processing. The first one will be chosen.') + self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}" + f" while {internal_link_tag} processing. The first one will be chosen.") return full_path[0] @@ -387,30 +391,30 @@ class EpubConverter: """ # 1. rebuild ids to be unique in all documents for toc_href in self.hrefs_added_to_toc: - for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): - if tag.attrs.get('class') == 'converter-chapter-mark': + for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): + if tag.attrs.get("class") == "converter-chapter-mark": continue - if tag.attrs.get('class') == 'footnote-element': + if tag.attrs.get("class") == "footnote-element": continue - new_id = self.create_unique_id(toc_href, tag.attrs['id']) - tag.attrs['id'] = new_id + new_id = self.create_unique_id(toc_href, tag.attrs["id"]) + tag.attrs["id"] = new_id # 2a. process anchor which is a whole xhtml file internal_link_reg1 = re.compile( - r'(^(?!https?://).+\.(htm|html|xhtml)$)') + r"(^(?!https?://).+\.(htm|html|xhtml)$)") for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): - a_tag_href = internal_link_tag.attrs['href'] + for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}): + a_tag_href = internal_link_tag.attrs["href"] # find full path a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( toc_href, a_tag_href, internal_link_tag) if not a_tag_href_matched_to_toc: continue - new_id = self.create_unique_id(a_tag_href_matched_to_toc, '') - internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' + new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" if new_id not in self.internal_anchors: anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] new_anchor_span = self.create_new_anchor_span(soup, new_id) @@ -418,22 +422,22 @@ class EpubConverter: anchor_soup.insert(0, new_anchor_span) self.internal_anchors.add(new_id) - del internal_link_tag.attrs['href'] + del internal_link_tag.attrs["href"] # 2b. process anchor which is an element in xhtml file - internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)') + internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)") for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): - a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split( - '#') + for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}): + a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split( + "#") # find full path if a_tag_href: a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) else: a_tag_href_matched_to_toc = os.path.normpath( - toc_href).replace('\\', '/') + toc_href).replace("\\", "/") if not a_tag_href_matched_to_toc: continue @@ -442,45 +446,45 @@ class EpubConverter: a_tag_href_matched_to_toc, a_tag_id) anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={'id': new_id, }) + anchor_tags = anchor_soup.find_all(attrs={"id": new_id, }) anchor_tags = anchor_tags or anchor_soup.find_all( - attrs={'id': a_tag_id}) # if link is a footnote + attrs={"id": a_tag_id}) # if link is a footnote if anchor_tags: if len(anchor_tags) > 1: - self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n' - f'{anchor_tags}\n' - f' While processing {internal_link_tag}') + self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n" + f"{anchor_tags}\n" + f" While processing {internal_link_tag}") anchor_tag = anchor_tags[0] - assert anchor_tag.attrs['id'] in [new_id, a_tag_id] + assert anchor_tag.attrs["id"] in [new_id, a_tag_id] # if anchor is found we could add placeholder for link creation on server side. - internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" # create span to have cyclic links, link has 1 type of class, anchor another - if anchor_tag.attrs['id'] not in self.internal_anchors: + if anchor_tag.attrs["id"] not in self.internal_anchors: new_anchor_span = self.create_new_anchor_span( soup, new_id) anchor_tag.insert_before(new_anchor_span) self.internal_anchors.add(new_id) - del anchor_tag.attrs['id'] - del internal_link_tag.attrs['href'] + del anchor_tag.attrs["id"] + del internal_link_tag.attrs["href"] else: - internal_link_tag.attrs['converter-mark'] = 'bad-link' - self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.' - f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' - f' Old id={a_tag_id}') + internal_link_tag.attrs["converter-mark"] = "bad-link" + self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found." + f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." + f" Old id={a_tag_id}") - def build_one_chapter(self, nav_point: NavPoint): + def detect_one_chapter(self, nav_point: NavPoint): """ Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) 3 cases: id wraps all chapter content, - id wraps chapter's content + subchapters' content + id wraps chapter"s content + subchapters" content id points to the start of title of a chapter - In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id + In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id and id of the next chapter/subchapter Parameters ---------- @@ -496,7 +500,7 @@ class EpubConverter: soup = self.html_href2html_body_soup[nav_point.href] chapter_tags = get_tags_between_chapter_marks( first_id=nav_point.id, href=nav_point.href, html_soup=soup) - new_tree = BeautifulSoup('', 'html.parser') + new_tree = BeautifulSoup("", "html.parser") for tag in chapter_tags: new_tree.append(tag) self.href_chapter_id2soup_html[( @@ -504,16 +508,30 @@ class EpubConverter: if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - self.build_one_chapter(sub_node) + self.detect_one_chapter(sub_node) def define_chapters_content(self): """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: for point in top_level_nav_points: - self.build_one_chapter(point) + self.detect_one_chapter(point) - def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + """ + Function prepare style, tags to json structure + Parameters + ---------- + nav_point: NavPoint + + lvl: int + level of chapter + Returns + ------- + ChapterItem + built chapter + + """ title = nav_point.title if nav_point.id: content: BeautifulSoup = self.href_chapter_id2soup_html[( @@ -526,7 +544,7 @@ class EpubConverter: access=self.access, path2aws_path=self.book_image_src_path2aws_path, book_id=self.file_path.stem - if hasattr(self.file_path, 'stem') else 'book_id') + if hasattr(self.file_path, "stem") else "book_id") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed = prepare_title(title) @@ -534,15 +552,16 @@ class EpubConverter: remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter + # if it doesn't have subchapters if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - sub_chapter_item = self.node_to_livecarta_chapter_item( + sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) if self.logger: - indent = ' ' * lvl - self.logger.log(f'{indent}Chapter: {title} is prepared.') + indent = " " * lvl + self.logger.log(f"{indent}Chapter: {title} is prepared.") return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self) -> dict: @@ -550,12 +569,13 @@ class EpubConverter: top_level_nav_points = self.adjacency_list[-1] top_level_chapters = [] - for nav_point in top_level_nav_points: - chapter = self.node_to_livecarta_chapter_item(nav_point) + # loop through to level chapters + for tl_nav_point in top_level_nav_points: + chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point) top_level_chapters.append(chapter) top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] - self.logger.log(f'Anchors found: {len(self.internal_anchors)}.') - self.logger.log('End conversion.') + self.logger.log(f"Anchors found: {len(self.internal_anchors)}.") + self.logger.log("End conversion.") return { "content": top_level_dict_chapters, @@ -564,12 +584,12 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = '../../epub/9781614382264.epub' + epub_file_path = "../../epub/9781614382264.epub" logger_object = BookLogger( - name='epub', book_id=epub_file_path.split('/')[-1]) + name="epub", book_id=epub_file_path.split("/")[-1]) json_converter = EpubConverter(epub_file_path, logger=logger_object) content_dict = json_converter.convert_to_dict() - with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json: + with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: json.dump(content_dict, f_json, ensure_ascii=False) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index cb6e080..8e92a40 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -7,7 +7,7 @@ class EpubBook(BookSolver): def __init__(self, book_id=0, access=None, main_logger=None): super().__init__(book_id, access, main_logger) - self.book_type = 'epub' + self.book_type = "epub" def get_converted_book(self): """ diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py new file mode 100644 index 0000000..d9840f3 --- /dev/null +++ b/src/epub_converter/footnotes_processing.py @@ -0,0 +1,87 @@ +from typing import Tuple + +from bs4 import BeautifulSoup, Tag + + +def _replace_with_livecarta_anchor_tag(anchor, i): + """Function replace noteref_tag(anchor) with new livecarta tag""" + new_tag = BeautifulSoup(features="lxml").new_tag("sup") + new_tag["class"] = "footnote-element" + new_tag["data-id"] = i + 1 + new_tag["id"] = f"footnote-{i + 1}" + new_tag.string = "*" + if anchor.parent.name == "sup": + anchor.parent.unwrap() + anchor.replace_with(new_tag) + return new_tag + + +def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \ + -> Tuple[list, list, list]: + """ + This function preprocessing footnotes + This function should be earlier that adding fonts in pipeline. + +

Here is an example footnote1

+ + + """ + footnotes = [] + noterefs_tags = source_html_tag.find_all( + attrs={noteref_attr_name: "noteref"}) + bad_noterefs_tags = set( + [tag for tag in noterefs_tags if not tag.attrs.get("href")]) + noterefs_tags = [ + tag for tag in noterefs_tags if tag not in bad_noterefs_tags] + new_noterefs_tags = [] + new_footnotes_tags = [] + [tag.decompose() for tag in bad_noterefs_tags] + + def parse_a_tag_href(s: str) -> Tuple[str, str]: + """Returns name of file & id of an anchor""" + assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id." + f, id_ = s.split("#") + return f, id_ + + def verify_footnote_tag(tags: list): + """Function verifies is tag - footnote""" + assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}" + if len(tags) == 0: + anchored_tags = list(target_html_tag.find_all(id=element_id)) + if len(anchored_tags): + print( + f"Warning. Href for tag is detected as footnote:\n{noteref_tag}") + return anchored_tags + else: + assert 0, f"Error, No element with id: {href} found." + return tags + + for i, noteref_tag in enumerate(noterefs_tags): + href = noteref_tag.attrs["href"] + file, element_id = parse_a_tag_href(href) + if not file: + target_html_tag = source_html_tag + else: + target_html_tag = href2soup_html.get(file) + if not target_html_tag: + print( + f"Error while footnotes processing. For {noteref_tag} invalid path: {file}.") + continue + + possible_footnote = "note|footnote|endnote|rearenote" + expected_footnote_tags = list(target_html_tag.find_all(id=element_id, + attrs={"epub:type": re.compile(possible_footnote)})) + + expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) + footnote_tag = expected_footnote_tags[0] + if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote": + footnote_tag = footnote_tag.parent + new_noterefs_tags.append( + _replace_with_livecarta_anchor_tag(noteref_tag, i)) + content = footnote_tag.text + # footnote_tag.decompose() + footnotes.append(content) + footnote_tag = footnote_tag.find( + attrs={"role": "doc-backlink"}) or footnote_tag + new_footnotes_tags.append(footnote_tag) + return footnotes, new_noterefs_tags, new_footnotes_tags \ No newline at end of file diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index d94c43a..efdba02 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -1,305 +1,107 @@ -import os import re -import pathlib -from typing import Tuple from bs4 import BeautifulSoup, NavigableString, Tag, Comment -from src.access import Access from src.livecarta_config import LiveCartaConfig -def _replace_with_livecarta_anchor_tag(anchor, i): - """Function replace noteref_tag(anchor) with new livecarta tag""" - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' - if anchor.parent.name == 'sup': - anchor.parent.unwrap() - anchor.replace_with(new_tag) - return new_tag - - -def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \ - -> Tuple[list, list, list]: +def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): """ - This function preprocessing footnotes - This function should be earlier that adding fonts in pipeline. + Function adds span with id from tag_to_be_removed + because this tag will be removed(unwrapped/extract) + Parameters + ---------- + tag_to_be_removed: Soup object + chapter_tag: BeautifulSoup -

Here is an example footnote1

- + Returns + ------- + None + updated body tag - """ - footnotes = [] - noterefs_tags = source_html_tag.find_all( - attrs={noteref_attr_name: 'noteref'}) - bad_noterefs_tags = set( - [tag for tag in noterefs_tags if not tag.attrs.get('href')]) - noterefs_tags = [ - tag for tag in noterefs_tags if tag not in bad_noterefs_tags] - new_noterefs_tags = [] - new_footnotes_tags = [] - [tag.decompose() for tag in bad_noterefs_tags] + """ + def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list): + """Function inserts span before tag aren't supported by livecarta""" + new_tag = chapter_tag.new_tag("span") + new_tag.attrs["id"] = id_ or "" + new_tag.attrs["class"] = class_ or "" + new_tag.string = "\xa0" + tag_to_be_removed.insert_before(new_tag) - def parse_a_tag_href(s: str) -> Tuple[str, str]: - """Returns name of file & id of an anchor""" - assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' - f, id_ = s.split('#') - return f, id_ - - def verify_footnote_tag(tags: list): - """Function verifies is tag - footnote""" - assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' - if len(tags) == 0: - anchored_tags = list(target_html_tag.find_all(id=element_id)) - if len(anchored_tags): - print( - f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') - return anchored_tags - else: - assert 0, f'Error, No element with id: {href} found.' - - return tags - - for i, noteref_tag in enumerate(noterefs_tags): - href = noteref_tag.attrs['href'] - file, element_id = parse_a_tag_href(href) - if not file: - target_html_tag = source_html_tag - else: - target_html_tag = href2soup_html.get(file) - if not target_html_tag: - print( - f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.') - continue - - possible_footnote = 'note|footnote|endnote|rearenote' - expected_footnote_tags = list(target_html_tag.find_all(id=element_id, - attrs={'epub:type': re.compile(possible_footnote)})) - - expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) - footnote_tag = expected_footnote_tags[0] - if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': - footnote_tag = footnote_tag.parent - new_noterefs_tags.append( - _replace_with_livecarta_anchor_tag(noteref_tag, i)) - content = footnote_tag.text - # footnote_tag.decompose() - footnotes.append(content) - footnote_tag = footnote_tag.find( - attrs={'role': 'doc-backlink'}) or footnote_tag - new_footnotes_tags.append(footnote_tag) - - return footnotes, new_noterefs_tags, new_footnotes_tags + if tag_to_be_removed.attrs.get("id"): + _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed, + id_=tag_to_be_removed.attrs["id"], + class_=tag_to_be_removed.attrs.get("class")) -def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup: +def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup: """ Main function that works with structure of html. Make changes inplace. Parameters ---------- - body_tag: Tag, soup object + chapter_tag: Tag, soup object Steps ---------- 1. Extracts tags that are not needed 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. - Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. - This tag must have a body_tag as a parent. + Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed. + This tag must have a chapter_tag as a parent. Otherwise, it is wrapped with some tags. Like: -

+

3. Headings that are not supported by livecarta converts to

4. Wrapping NavigableString Returns ------- - body_tag: Tag, BeautifulSoup - adjusted body_tag + chapter_tag: Tag, BeautifulSoup + adjusted chapter_tag """ - def _preserve_class_in_aside_tag(tag_): - """to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" - # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance( - tag_.attrs['class'], list) else tag_.attrs['class'][0] - if tag_.parent.name == 'aside': - if not tag_.parent.attrs.get('class'): - tag_.parent.attrs['class'] = tag_class + def _tags_to_correspond_livecarta_tag(chapter_tag): + """Function to replace all tags to correspond livecarta tags""" + for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): + for key in reg_key: + # text = tag if isinstance(tag, NavigableString) else tag.text + tags = chapter_tag.find_all(re.compile(key)) + for tag in tags: + tag.name = to_replace_value - def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: - """ - Function saves css style inherited from class, copies class to child

- returns True, if

could be unwrapped - Parameters - ---------- - tag_: Tag, soup object + def _unwrap_tags(chapter_tag): + """Function unwrap tags and move id to span""" + for tag in LiveCartaConfig. TAGS_TO_UNWRAP: + for s in chapter_tag.find_all(tag): + _add_span_to_save_ids_for_links(s, chapter_tag) + s.unwrap() - Returns - ------- - bool + def _mark_parent_is_body(chapter_tag): + # check marks for chapter starting are on the same level - 1st + marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) - """ - # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance( - tag_.attrs['class'], list) else tag_.attrs['class'][0] - if 'feature' not in tag_class: - return True - child_p_tags = tag_.find_all("p") - if len(child_p_tags) == 1: - child_p_tag = child_p_tags[0] - if not child_p_tag.attrs.get('class'): - child_p_tag.attrs['class'] = tag_class - return True + # fix marks to be on 1 level + for mark in marks: + while mark.parent != chapter_tag: + mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - elif len(child_p_tags) > 1: - tag_.name = 'p' - return False - else: - return True + _tags_to_correspond_livecarta_tag(chapter_tag) - def _add_span_to_save_ids_for_links(tag_to_be_removed): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, - id_=tag_to_be_removed.attrs['id'], - class_=tag_to_be_removed.attrs.get('class')) + _unwrap_tags(chapter_tag) - def _replace_div_tag_with_table(): - """ - Function replace
with : - 1. Convert div with certain classes to tables - 2. Add background color to div with background-color + _mark_parent_is_body(chapter_tag) - """ - for div in body_tag.find_all("div"): - if div.attrs.get('class'): - div_class = div.attrs['class'] if not isinstance( - div.attrs['class'], list) else div.attrs['class'][0] - if div_class in ['C409', 'C409a']: - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9') - - elif div_class in ['C441', 'C816']: - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8') - - if div.attrs.get('style'): - if 'background-color' in div.attrs['style']: - end_index = div.attrs['style'].find( - 'background-color') + len('background-color') - start_index_of_color = end_index + 2 - bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7] - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='', bg_color=bg_color) - elif div.attrs.get('style') == '': - del div.attrs['style'] - - structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', - 'figure', 'footer', 'iframe', 'span', 'p' - ] - - if div.contents: - is_not_struct_tag = [ - child.name not in structural_tags_names for child in div.contents] - if all(is_not_struct_tag): - div.name = 'p' - continue - _add_span_to_save_ids_for_links(div) - div.unwrap() - - def _heading_tag_to_p_tag(body_tag): - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - - # comments removal - for tag in body_tag.find_all(): - for element in tag(text=lambda text: isinstance(text, Comment)): - element.extract() - - _replace_div_tag_with_table() - - for s in body_tag.find_all("section"): - could_be_unwrapped = True - if s.attrs.get('class'): - could_be_unwrapped = _preserve_class_in_section_tag(s) - _add_span_to_save_ids_for_links(s) - if could_be_unwrapped: - s.unwrap() - - for s in body_tag.find_all("article"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("figure"): - s.name = 'p' - # to center image inside this tag - s.attrs['style'] = "text-align: center;" - - for s in body_tag.find_all("figcaption"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("aside"): - s.name = 'blockquote' - - for s in body_tag.find_all("main"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("body"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("html"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("header"): - s.name = 'span' - - # check marks for chapter starting are on the same 1 level - marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'}) - parents_marks_are_body = [x.parent == body_tag for x in marks] - - # fix marks to be on 1 level - if not all(parents_marks_are_body): - for x in marks: - while x.parent != body_tag: - x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - - parents_marks_are_body = [x.parent == body_tag for x in marks] - assert all( - parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' - - _heading_tag_to_p_tag(body_tag) - - # wrap NavigableString with

- for node in body_tag: - if isinstance(node, NavigableString): - content = str(node) - content = re.sub(r'([\n\t\xa0])', ' ', content) - content = content.strip() - if content: - tag = body_tag.new_tag('p') - tag.append(str(node)) - node.replace_with(tag) - return body_tag + return chapter_tag def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: - """After processing on a first_id that corresponds to current chapter, + """ + After processing on a first_id that corresponds to current chapter, from initial html_soup all tags from current chapter are extracted Parameters ---------- - first_id: - Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' - href: - Name of current chapter's file + first_id: str + Id that point where a chapter starts. A Tag with class: "converter-chapter-mark" + href: str + Name of current chapters file html_soup: Tag Soup object of current file @@ -310,13 +112,13 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu """ marked_tags = html_soup.find( - attrs={'id': first_id, 'class': 'converter-chapter-mark'}) + attrs={"id": first_id, "class": "converter-chapter-mark"}) if marked_tags: next_tag = marked_tags.next_sibling tags = [] while next_tag: - if not isinstance(next_tag, NavigableString) and\ - (next_tag.attrs.get('class') == 'converter-chapter-mark'): + if not isinstance(next_tag, NavigableString) and \ + (next_tag.attrs.get("class") == "converter-chapter-mark"): break tags.append(next_tag) next_tag = next_tag.next_sibling @@ -327,182 +129,119 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu html_soup.smooth() else: - assert 0, f'Warning: no match for {first_id, href}' + assert 0, f"Warning: no match for {first_id, href}" return tags -def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images to Amazon web service""" - link_path = access.send_image( - img_file_path, doc_id=book_id, img_content=img_content) - return link_path - - -def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images locally""" - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'../json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - new_img_path = new_path / os.path.basename(img_file_path) - f = open(new_img_path, 'wb+') - f.write(img_content) - f.close() - - return new_img_path - - -def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, - path_to_html: str, - access=None, - path2aws_path: dict = None, - book_id: str = None) -> dict: - """Function makes dictionary image_src_path -> Amazon web service_path""" - img_tags = body_tag.find_all('img') - - for img in img_tags: - path_to_img_from_html = img.attrs.get('src') - html_folder = os.path.dirname(path_to_html) - path_to_img_from_root = os.path.normpath(os.path.join( - html_folder, path_to_img_from_html)).replace('\\', '/') - - assert path_to_img_from_root in href2img_content, \ - f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' - - img_content = href2img_content[path_to_img_from_root] - if access is not None: - if path_to_img_from_root in path2aws_path: - new_folder = path2aws_path[path_to_img_from_root] - else: - new_folder = save_image_to_aws( - access, path_to_img_from_root, img_content, book_id) - path2aws_path[path_to_img_from_root] = new_folder - else: - new_folder = save_image_locally( - path_to_img_from_root, img_content, 'book_id') - - img.attrs['src'] = str(new_folder) - if img.attrs.get('width'): - del img.attrs['width'] - if img.attrs.get('height'): - del img.attrs['height'] - if img.attrs.get('style'): - del img.attrs['style'] - return path2aws_path - - -def _clean_title_from_numbering(title: str): - """Function removes numbering from titles""" - title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title - return title - - def prepare_title(title_of_chapter: str) -> str: """Function finalise processing/cleaning title""" - title_str = BeautifulSoup(title_of_chapter, features='lxml').string - title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) - title_str = re.sub(r' +', ' ', title_str).rstrip() - title_str = _clean_title_from_numbering(title_str) + title_str = BeautifulSoup(title_of_chapter, features="lxml").string + title_str = re.sub(r"([\n\t\xa0])", " ", title_str) + title_str = re.sub(r" +", " ", title_str).rstrip() + # clean whitespace characters ([\r\n\t\f\v ]) + title_str = re.sub(r"(^\s+)|(\s+$)", "", title_str) return title_str -def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): - """Function inserts span before tag aren't supported by livecarta""" - new_tag = main_tag.new_tag("span") - new_tag.attrs['id'] = id_ or '' - new_tag.attrs['class'] = class_ or '' - new_tag.string = "\xa0" - tag.insert_before(new_tag) +def _remove_comments(chapter_tag): + for tag in chapter_tag.find_all(): + for element in tag(text=lambda text: isinstance(text, Comment)): + element.extract() -def _clean_headings_content(content: BeautifulSoup, title: str): - def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=tag_to_be_removed.attrs.get( - 'id'), - class_=tag_to_be_removed.attrs.get('class')) +def _wrap_strings_with_p(chapter_tag): + # wrap NavigableString with

+ for node in chapter_tag: + if isinstance(node, NavigableString): + content = str(node) + content = re.sub(r"([\n\t\xa0])", " ", content) + # remove spaces at the beginning and at the end of the string: + content = content.strip() + if content: + tag = chapter_tag.new_tag("p") + tag.append(str(node)) + node.replace_with(tag) - for sub_tag in tag_to_be_removed.find_all(): - if sub_tag.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=sub_tag.attrs['id'], - class_=sub_tag.attrs.get('class')) - title = title.lower() - for child in content.contents: - if isinstance(child, NavigableString): - text = child - else: - text = child.text - if text and re.sub(r'([\n\t\xa0])', '', text): - text = re.sub(r'([\n\t\xa0])', ' ', text) - text = re.sub(r' +', ' ', text).strip() - text = text.lower() - if title == text: - add_span_to_save_ids_for_links(child, content) - child.extract() - elif (title in text) and (child.name in ['h1', 'h2', 'h3']): - add_span_to_save_ids_for_links(child, content) - child.extract() +def _remove_headings_content(content_tag, title_of_chapter: str): + """ + Function + clean/remove headings from chapter in order to avoid duplication of chapter titles in the content + add span with id in order to + Parameters + ---------- + content_tag: soup object + Tag of the page + title_of_chapter: str + Chapter title + + Returns + ------- + None + clean/remove headings & add span with id + + """ + title_of_chapter = title_of_chapter.lower() + for tag in content_tag.contents: + text = tag if isinstance(tag, NavigableString) else tag.text + if text: + text = re.sub(r"^[\s\xa0]+|[\s\xa0]+$", " ", text).lower() + if title_of_chapter == text or \ + (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)): + _add_span_to_save_ids_for_links(tag, content_tag) + tag.extract() break -def _process_lists(body_tag: BeautifulSoup): +# todo remove +def _process_lists(chapter_tag: BeautifulSoup): """ Function - process tags

  • . - unwrap

    tags. Parameters ---------- - body_tag: Tag, soup object + chapter_tag: Tag, soup object Returns ------- None """ - li_tags = body_tag.find_all("li") + li_tags = chapter_tag.find_all("li") for li_tag in li_tags: if li_tag.p: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() -def _preprocess_table(body_tag: BeautifulSoup): +def _preprocess_table(chapter_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" - tables = body_tag.find_all("table") + tables = chapter_tag.find_all("table") for table in tables: t_tags = table.find_all(re.compile("td|th|tr")) for t_tag in t_tags: - style = t_tag.get('style') - width = '' + style = t_tag.get("style") + width = "" if style: width_match = re.search( r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) if width_match: size = width_match.group(1) - width = size + 'px' + width = size + "px" - t_tag.attrs['width'] = t_tag.get('width') or width + t_tag.attrs["width"] = t_tag.get("width") or width - if t_tag.attrs.get('style'): - t_tag.attrs['style'] = t_tag.attrs['style'].replace( - 'border:0;', '') + if t_tag.attrs.get("style"): + t_tag.attrs["style"] = t_tag.attrs["style"].replace( + "border:0;", "") - elif t_tag.attrs.get('style') == '': - del t_tag.attrs['style'] + elif t_tag.attrs.get("style") == "": + del t_tag.attrs["style"] - if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']: - table.attrs['border'] = '1' + if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: + table.attrs["border"] = "1" def _preprocess_code_tags(chapter_tag: BeautifulSoup): @@ -523,25 +262,15 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup): if not code.parent.name == "pre": code.name = "span" continue - # if tag isn't in pre and doesn't have style - if not code.attrs.get('style'): - code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;' - - -def _prepare_formatted(text: str) -> str: - """Function replaces special symbols with their Unicode representation""" - text = text.replace("<", "\x3C") - text = text.replace(">", "\x3E") - text = text.replace('\t', "\xa0 \xa0 ") #     - text = text.replace(' ', "\xa0") - text = text.replace('𝑓', "\xf0\x9d\x91\x93") - return text + # if tag isn"t in pre and doesn"t have style + if not code.attrs.get("style"): + code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;" def _preprocess_pre_tags(chapter_tag: BeautifulSoup): """ Function preprocessing

     tags
    -    Wrap string of the tag with  if it's necessary
    +    Wrap string of the tag with  if its necessary
         Parameters
         ----------
         chapter_tag: Tag, soup object
    @@ -564,6 +293,42 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
                 pre.append(code)
     
     
    +# todo replace
    +def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
    +    """Function wraps  with 
  • """ + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_be_wrapped.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + + +def _preprocess_div_tags(chapter_tag): + """ + Function replace
    with
    : + """ + for div in chapter_tag.find_all("div"): + if div.attrs.get('style'): + _wrap_tag_with_table( + chapter_tag, + tag_to_be_wrapped=div, + width=div.attrs['width'] if div.attrs.get('width') else '100', + border=div.attrs['border'] if div.attrs.get('border') else None, + bg_color=div.attrs['bgcolor'] if div.attrs.get('bgcolor') else None) + else: + div.name = "p" + continue + _add_span_to_save_ids_for_links(div, chapter_tag) + div.unwrap() + + def _clean_wiley_block(block): hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) for hr in hrs: @@ -571,48 +336,30 @@ def _clean_wiley_block(block): h = block.find(re.compile("h[1-9]")) if h: h.name = "p" - h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) + h.insert_before(BeautifulSoup(features="lxml").new_tag("br")) -def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): - """Function wraps with
    """ - table = main_tag.new_tag("table") - table.attrs['border'] = border - table.attrs['align'] = 'center' - table.attrs['style'] = f'width:{width}%;' - tbody = main_tag.new_tag("tbody") - tr = main_tag.new_tag("tr") - td = main_tag.new_tag("td") - # td.attrs['border-radius'] = '8px' - if bg_color: - td.attrs['bgcolor'] = bg_color - old_tag.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features='lxml').new_tag("br")) - return table def _preprocess_block_tags(chapter_tag: Tag): """Function preprocessing tags""" for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): _clean_wiley_block(block) - color = '#DDDDDD' if block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, block, bg_color=color) - block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) + color = "#DDDDDD" if block.attrs.get( + "class") == "feature1" else None + color = "#EEEEEE" if block.attrs.get( + "class") == "feature2" else color + _wrap_tag_with_table(chapter_tag, block, bg_color=color) + block.insert_after(BeautifulSoup(features="lxml").new_tag("br")) block.unwrap() for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): _clean_wiley_block(future_block) - color = '#DDDDDD' if future_block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if future_block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) + color = "#DDDDDD" if future_block.attrs.get( + "class") == "feature1" else None + color = "#EEEEEE" if future_block.attrs.get( + "class") == "feature2" else color + _wrap_tag_with_table(chapter_tag, future_block, bg_color=color) def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: @@ -628,10 +375,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro Steps ---------- - 1. find \n - 2. heading removal - 3. processing tags - 4. class removal + 1. heading removal + 2. processing tags + 3. class removal Returns ------- @@ -639,28 +385,27 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro prepared content """ - # 1. find \n - to_remove = [] - for child in content_tag.contents: - if isinstance(child, NavigableString): - s = re.sub(r'([\n\t])', '', child.string) - if s == '': - to_remove.append(child) + # 1. remove comments + _remove_comments(content_tag) - # 2. heading removal + # 2. wrap NavigableString with tag

    + _wrap_strings_with_p(content_tag) + + # 3. heading removal if remove_title_from_chapter: - _clean_headings_content(content_tag, title_str) + _remove_headings_content(content_tag, title_str) - # 3. processing tags (

  • ,
  • , ,
    , )
    +    # 4. processing tags (
  • ,
  • , ,
    , 
    , ) _process_lists(content_tag) _preprocess_table(content_tag) _preprocess_code_tags(content_tag) _preprocess_pre_tags(content_tag) + _preprocess_div_tags(content_tag) _preprocess_block_tags(content_tag) - # 4. class removal + # 5. remove classes that were created by converter for tag in content_tag.find_all(recursive=True): - if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', - 'footnote-element']): - del tag.attrs['class'] + if hasattr(tag, "attrs") and tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] return str(content_tag) diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py new file mode 100644 index 0000000..950bbdd --- /dev/null +++ b/src/epub_converter/image_processing.py @@ -0,0 +1,67 @@ +import os +import pathlib + +from bs4 import BeautifulSoup + +from src.access import Access + + +def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images to Amazon web service""" + link_path = access.send_image( + img_file_path, doc_id=book_id, img_content=img_content) + return link_path + + +def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images locally""" + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f"../json/img_{book_id}/")) + new_path.mkdir(exist_ok=True) + + new_img_path = new_path / os.path.basename(img_file_path) + f = open(new_img_path, "wb+") + f.write(img_content) + f.close() + return new_img_path + + +def update_images_src_links(body_tag: BeautifulSoup, + href2img_content: dict, + path_to_html: str, + access=None, + path2aws_path: dict = None, + book_id: str = None) -> dict: + """Function makes dictionary image_src_path -> Amazon web service_path""" + img_tags = body_tag.find_all("img") + + for img in img_tags: + path_to_img_from_html = img.attrs.get("src") + html_folder = os.path.dirname(path_to_html) + path_to_img_from_root = os.path.normpath(os.path.join( + html_folder, path_to_img_from_html)).replace("\\", "/") + + assert path_to_img_from_root in href2img_content, \ + f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest." + + img_content = href2img_content[path_to_img_from_root] + if access is not None: + if path_to_img_from_root in path2aws_path: + new_folder = path2aws_path[path_to_img_from_root] + else: + new_folder = save_image_to_aws( + access, path_to_img_from_root, img_content, book_id) + path2aws_path[path_to_img_from_root] = new_folder + else: + new_folder = save_image_locally( + path_to_img_from_root, img_content, "book_id") + + img.attrs["src"] = str(new_folder) + if img.attrs.get("width"): + del img.attrs["width"] + if img.attrs.get("height"): + del img.attrs["height"] + if img.attrs.get("style"): + del img.attrs["style"] + return path2aws_path \ No newline at end of file diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_css_style_converter.py index 37b2672..269d8ed 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_css_style_converter.py @@ -21,33 +21,33 @@ class TagStyleConverter: @staticmethod def remove_white_if_no_bgcolor(style_, tag): """Function remove text white color if there is no bg color""" - if 'background' in style_: + if "background" in style_: style_ = style_.replace( - 'background:', 'background-color:') + "background:", "background-color:") return style_ # if text color is white, check that we have bg-color - if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): + if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_): # if bg color is inherited, just return style as is for parent_tag in tag.parents: - # white bg color not need to be checked as we do not write 'white bg color' - tag_with_bg = ['span', 'td', 'tr', 'p'] + # white bg color not need to be checked as we do not write "white bg color" + tag_with_bg = ["span", "td", "tr", "p"] tag_will_be_saved = parent_tag.name in tag_with_bg - has_bg = parent_tag.attrs.get('style') and ( - 'background' in parent_tag.attrs.get('style')) + has_bg = parent_tag.attrs.get("style") and ( + "background" in parent_tag.attrs.get("style")) if has_bg and tag_will_be_saved: return style_ children = tag.find_all() for child in children: - if child.attrs.get('style') and ('background' in child.attrs.get('style')): - tmp_style = child.attrs['style'] + '; color:#fff; ' - child.attrs['style'] = tmp_style + if child.attrs.get("style") and ("background" in child.attrs.get("style")): + tmp_style = child.attrs["style"] + "; color:#fff; " + child.attrs["style"] = tmp_style - # for child with bg color we added white text color, so this tag don't need white color - style_ = style_.replace('color:#fff;', '') - style_ = style_.replace('color:#ffffff;', '') - style_ = style_.replace('color:white;', '') + # for child with bg color we added white text color, so this tag don"t need white color + style_ = style_.replace("color:#fff;", "") + style_ = style_.replace("color:#ffffff;", "") + style_ = style_.replace("color:white;", "") return style_ @staticmethod @@ -68,7 +68,7 @@ class TagStyleConverter: Parameters ---------- split_style: list - list of styles split by ';' + list of styles split by ";" Returns ---------- @@ -79,9 +79,9 @@ class TagStyleConverter: processed_style = ";".join(split_style) margin_left_regexp = re.compile( - r'((margin-left|margin): *(-*\w+);*)') + r"((margin-left|margin): *(-*\w+);*)") text_indent_regexp = re.compile( - r'(text-indent: *(-*\w+);*)') + r"(text-indent: *(-*\w+);*)") has_margin = re.search(margin_left_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style) @@ -92,21 +92,21 @@ class TagStyleConverter: if has_text_indent: num_ti = abs(int("0" + "".join( filter(str.isdigit, str(has_text_indent.group(2)))))) - processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(num_m - num_ti)) + 'px; ') + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + + str(abs(num_m - num_ti)) + "px; ") processed_style = processed_style.replace( - has_margin.group(1), '') + has_margin.group(1), "") return processed_style - processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' + - str(abs(num_m)) + 'px; ') + processed_style = processed_style.replace(has_margin.group(1), "text-indent: " + + str(abs(num_m)) + "px; ") return processed_style elif has_text_indent: - processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(int("0" + "".join( filter(str.isdigit, str(has_text_indent.group(2))))))) - + 'px; ') + + "px; ") return processed_style return processed_style @@ -126,18 +126,18 @@ class TagStyleConverter: processed inline style """ - inline_style = self.tag_inline_style.attrs.get('style') + ';' - # 1. Remove white color if tag doesn't have background color in style + inline_style = self.tag_inline_style.attrs.get("style") + ";" + # 1. Remove white color if tag doesn"t have background color in style inline_style = self.remove_white_if_no_bgcolor( inline_style, self.tag_inline_style) inline_style = inline_style.replace( - 'list-style-image', 'list-style-type') + "list-style-image", "list-style-type") # 2. Create list of styles from inline style - # replace all spaces between '; & letter' to ';' + # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", inline_style) - # when we split style by ';', last element of the list is '' - None (remove it) - split_inline_style: list = list(filter(None, style.split(';'))) + # when we split style by ";", last element of the list is "" - None (remove it) + split_inline_style: list = list(filter(None, style.split(";"))) # 3. Duplicate styles check - if the tag had duplicate styles split_inline_style = self.duplicate_styles_check(split_inline_style) @@ -164,7 +164,7 @@ class TagStyleConverter: """ styles_to_remove = [] for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: - if f'{k[0]}:{k[1]}' in style: + if f"{k[0]}:{k[1]}" in style: styles_to_remove.append(k) return styles_to_remove @@ -172,11 +172,11 @@ class TagStyleConverter: # adds , , instead of styles styles_to_remove = self.check_style_to_be_tag(self.style) for i, (attr, value) in enumerate(styles_to_remove): - self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\ - .replace(f'{attr}:{value};', '').strip() + self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\ + .replace(f"{attr}:{value};", "").strip() corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( attr, value)] - correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name) + correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name) for content in reversed(self.tag_inline_style.contents): correspond_tag.insert(0, content.extract()) self.tag_inline_style.append(correspond_tag) @@ -184,34 +184,34 @@ class TagStyleConverter: @staticmethod def wrap_span_in_tag_to_save_style_attrs(initial_tag): """Function designed to save style attrs that cannot be in tag.name -> span""" - dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) - if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get('style'): + dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) + if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"): styles_can_be_in_tag = [style for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() if re.match(tag, initial_tag.name) for style in styles] styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in styles_can_be_in_tag] - span_style = initial_tag.attrs['style'] + span_style = initial_tag.attrs["style"] # here check that this style is exactly the same. - # Not 'align' when we have 'text-align', or 'border' when we have 'border-top' - styles_to_be_saved_in_span = [((attr + ':') in span_style) & ( - '-' + attr not in span_style) for attr in styles_cant_be_in_tag] + # Not "align" when we have "text-align", or "border" when we have "border-top" + styles_to_be_saved_in_span = [((attr + ":") in span_style) & ( + "-" + attr not in span_style) for attr in styles_cant_be_in_tag] if any(styles_to_be_saved_in_span): # if we find styles that cannot be in -> wrap them in span - tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}') - style = '' - possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag] + tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") + style = "" + possible_attrs_regexp = [re.compile(fr"({style}: *(\w+);)") for style in styles_can_be_in_tag] for possible_attr_regexp in possible_attrs_regexp: has_style_attrs = re.search( possible_attr_regexp, span_style) if has_style_attrs and has_style_attrs.group(1): style += has_style_attrs.group(1) span_style = span_style.replace( - has_style_attrs.group(1), '') - tag.attrs['style'] = style - initial_tag.name = 'span' - initial_tag.attrs['style'] = span_style + has_style_attrs.group(1), "") + tag.attrs["style"] = style + initial_tag.name = "span" + initial_tag.attrs["style"] = span_style initial_tag.wrap(tag) def convert_initial_tag(self): @@ -246,10 +246,10 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> disable_validation=True, ) # soup with converted styles from css - inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') + inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={'style': re.compile('.*')}) + attrs={"style": re.compile(".*")}) # go through the tags with inline style + style parsed from css file for tag_inline_style in tags_with_inline_style: diff --git a/src/livecarta_config.py b/src/livecarta_config.py index e3e63d4..31b549e 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -9,12 +9,12 @@ class LiveCartaConfig: HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} - DEFAULT_ALIGN_STYLE = 'left' + DEFAULT_ALIGN_STYLE = "left" - ALIGN_STYLES = ['justify', 'right', 'center', 'left'] + ALIGN_STYLES = ["justify", "right", "center", "left"] # Main constant values - DEFAULT_FONT_NAME = 'Times New Roman' + DEFAULT_FONT_NAME = "Times New Roman" WORD_DEFAULT_FONT_SIZE = 11 @@ -38,65 +38,65 @@ class LiveCartaConfig: } COLORS_MAP = { - '#ffff00': 'yellow', - '#00ff00': 'darkYellow', - '#00ffff': 'cyan', - '#ff00ff': 'magenta', - '#0000ff': 'blue', - '#ff0000': 'red', - '#000080': 'darkBlue', - '#008080': 'darkCyan', - '#008000': 'green', - '#800080': 'darkMagenta', - '#808000': 'darkGreen', - '#c0c0c0': 'lightGray', - '#ffffff': 'white', - '#800000': '#800000', - '#808080': '#808080' + "#ffff00": "yellow", + "#00ff00": "darkYellow", + "#00ffff": "cyan", + "#ff00ff": "magenta", + "#0000ff": "blue", + "#ff0000": "red", + "#000080": "darkBlue", + "#008080": "darkCyan", + "#008000": "green", + "#800080": "darkMagenta", + "#808000": "darkGreen", + "#c0c0c0": "lightGray", + "#ffffff": "white", + "#800000": "#800000", + "#808080": "#808080" } HTML42LIVECARTA_COLORS = { - 'yellow': 'yellow', - 'lime': 'green', - 'aqua': 'cyan', - 'fuchsia': 'magenta', - 'blue': 'blue', - 'red': 'red', - 'navy': 'darkBlue', - 'teal': 'darkCyan', - 'green': 'darkGreen', - 'purple': 'darkMagenta', - 'olive': 'darkYellow', - 'silver': 'lightGray', - 'white': 'white', - 'maroon': 'darkRed', # '#800000', - 'gray': 'darkGray', - 'grey': 'darkGray', + "yellow": "yellow", + "lime": "green", + "aqua": "cyan", + "fuchsia": "magenta", + "blue": "blue", + "red": "red", + "navy": "darkBlue", + "teal": "darkCyan", + "green": "darkGreen", + "purple": "darkMagenta", + "olive": "darkYellow", + "silver": "lightGray", + "white": "white", + "maroon": "darkRed", # "#800000", + "gray": "darkGray", + "grey": "darkGray", } - INDENT = '30px' + INDENT = "30px" sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] - sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', - '19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', - '30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px', - '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] + sizes_px = ["0px", "10px", "10px", "11px", "12px", "13px", "14px", "15px", "16px", "17px", "18px", + "19px", "20px", "21px", "22px", "23px", "24px", "25px", "26px", "27px", "28px", "29px", + "30px", "31px", "32px", "33px", "34px", "35px", "36px", "37px", "38px", "39px", "40px", + "41px", "42px", "43px", "44px", "45px", "46px", "47px", "48px", "49px", "50px", "64px", "72px"] - list_types = ['circle', 'disc', 'armenian', 'decimal', - 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', - 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] + list_types = ["circle", "disc", "armenian", "decimal", + "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin", + "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"] structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', - 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' + "div", "section", "article", "main", "body", "html", "aside", + "canvas", "data", "figure", "footer", "iframe", "span", "p" ] could_have_style_in_livecarta_regexp = re.compile( - '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') + "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)") """ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } @@ -104,23 +104,34 @@ class LiveCartaConfig: