From c62192d028d240b95227379046a694fe9ed6e421 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 21 Jun 2022 11:47:26 +0300 Subject: [PATCH 01/55] Updates to presets --- src/docx_converter/html_docx_preprocessor.py | 1 - src/epub_converter/css_preprocessing.py | 152 ++-- src/epub_converter/epub_converter.py | 216 +++--- src/epub_converter/epub_solver.py | 2 +- src/epub_converter/footnotes_processing.py | 87 +++ src/epub_converter/html_epub_preprocessor.py | 651 ++++++------------ src/epub_converter/image_processing.py | 67 ++ src/epub_converter/tag_css_style_converter.py | 98 +-- src/livecarta_config.py | 133 ++-- 9 files changed, 668 insertions(+), 739 deletions(-) create mode 100644 src/epub_converter/footnotes_processing.py create mode 100644 src/epub_converter/image_processing.py diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index e9683f4..80d96a3 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -222,7 +222,6 @@ class HTMLDocxPreprocessor: def _process_tables(self): """Function to process tables. 
Set "border" attribute.""" - tables = self.body_tag.find_all("table") for table in tables: tds = table.find_all("td") diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index 2212bd5..11e4a16 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -11,13 +11,13 @@ from src.livecarta_config import LiveCartaConfig def get_text_color(x): color = str2hex(x) - color = color if color not in ['#000000', '#000', 'black'] else '' + color = color if color not in ["#000000", "#000", "black"] else "" return color def get_bg_color(x): color = str2hex(x) - color = color if color not in ['#ffffff', '#fff', 'white'] else '' + color = color if color not in ["#ffffff", "#fff", "white"] else "" return color @@ -43,25 +43,25 @@ def convert_tag_style_values(size_value: str) -> str: return LiveCartaConfig.sizes_px[last_possible_size_index] font_size_regexp = re.compile( - r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)') + r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") has_style_attrs = re.search(font_size_regexp, size_value) if has_style_attrs: if has_style_attrs.group(1): - size_value = float(size_value.replace('%', '')) / 100.0 + size_value = float(size_value.replace("%", "")) / 100.0 return find_closest_size(size_value) elif has_style_attrs.group(3): - size_value = float(size_value.replace('em', '')) + size_value = float(size_value.replace("em", "")) return find_closest_size(size_value) elif has_style_attrs.group(5): - return size_value.replace('pt', 'px') + return size_value.replace("pt", "px") else: - return '' + return "" return size_value def convert_indents_tag_values(size_value: str) -> str: """ - Function converts values of ['text-indent', 'margin-left', 'margin'] + Function converts values of ["text-indent", "margin-left", "margin"] Parameters ---------- size_value: str @@ -71,12 +71,12 @@ def convert_indents_tag_values(size_value: str) -> str: size_value: 
str """ - if len(size_value.split(' ')) == 3: + if len(size_value.split(" ")) == 3: size_value = convert_tag_style_values(size_value.split( - ' ')[-2]) # returns middle value + " ")[-2]) # returns middle value else: size_value = convert_tag_style_values(size_value.split( - ' ')[-1]) # returns last value + " ")[-1]) # returns last value return size_value @@ -87,35 +87,35 @@ If property has empty list, it means that any value can be converted. If property has not empty list, it means that only certain property-value combinations can be transformed. """ LIVECARTA_STYLE_ATTRS = { - 'text-indent': [], - 'font-variant': ['small-caps'], - 'text-align': [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - 'align': [], - 'font': [], - 'font-family': [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() + "text-indent": [], + "font-variant": ["small-caps"], + "text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], + "align": [], + "font": [], + "font-family": [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() if x != LiveCartaConfig.DEFAULT_FONT_NAME], - 'font-size': [], - 'font-weight': ['bold', '600', '700', '800', '900'], # - 'font-style': ['italic'], # - 'text-decoration': ['underline', 'line-through'], # , - 'text-decoration-line': ['underline', 'line-through'], # , - 'vertical-align': ['super'], # - 'color': [], - 'background-color': [], - 'background': [], - 'width': [], - 'border': [], - 'border-top-width': [], - 'border-right-width': [], - 'border-left-width': [], - 'border-bottom-width': [], - 'border-top': [], - 'border-bottom': [], - 'list-style-type': [], - 'list-style-image': [], - 'margin-left': [], - 'margin-top': [], - 'margin': [], + "font-size": [], + "font-weight": ["bold", "600", "700", "800", "900"], # + "font-style": ["italic"], # + "text-decoration": ["underline", "line-through"], # , + "text-decoration-line": ["underline", "line-through"], # , + 
"vertical-align": ["super"], # + "color": [], + "background-color": [], + "background": [], + "width": [], + "border": [], + "border-top-width": [], + "border-right-width": [], + "border-left-width": [], + "border-bottom-width": [], + "border-top": [], + "border-bottom": [], + "list-style-type": [], + "list-style-image": [], + "margin-left": [], + "margin-top": [], + "margin": [], } """ @@ -125,28 +125,28 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou to suit livecarta style convention. """ LIVECARTA_STYLE_ATTRS_MAPPING = { - 'text-indent': convert_indents_tag_values, - 'font-variant': lambda x: x, - 'text-align': lambda x: x, - 'font': lambda x: '', - 'font-family': lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) + "text-indent": convert_indents_tag_values, + "font-variant": lambda x: x, + "text-align": lambda x: x, + "font": lambda x: "", + "font-family": lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)), - 'font-size': convert_tag_style_values, - 'color': get_text_color, - 'background-color': get_bg_color, - 'background': get_bg_color, - 'border': lambda x: x if x != '0' else '', - 'border-top-width': lambda x: x if x != '0' else '', - 'border-right-width': lambda x: x if x != '0' else '', - 'border-left-width': lambda x: x if x != '0' else '', - 'border-bottom-width': lambda x: x if x != '0' else '', - 'border-top': lambda x: x if x != '0' else '', - 'border-bottom': lambda x: x if x != '0' else '', - 'list-style-type': lambda x: x if x in LiveCartaConfig.list_types else 'disc', - 'list-style-image': lambda x: 'disc', - 'margin-left': convert_indents_tag_values, - 'margin-top': convert_tag_style_values, - 'margin': convert_indents_tag_values + "font-size": convert_tag_style_values, + "color": get_text_color, + "background-color": get_bg_color, + "background": 
get_bg_color, + "border": lambda x: x if x != "0" else "", + "border-top-width": lambda x: x if x != "0" else "", + "border-right-width": lambda x: x if x != "0" else "", + "border-left-width": lambda x: x if x != "0" else "", + "border-bottom-width": lambda x: x if x != "0" else "", + "border-top": lambda x: x if x != "0" else "", + "border-bottom": lambda x: x if x != "0" else "", + "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", + "list-style-image": lambda x: "disc", + "margin-left": convert_indents_tag_values, + "margin-top": convert_tag_style_values, + "margin": convert_indents_tag_values } @@ -155,17 +155,17 @@ def update_inline_styles_to_livecarta_convention(split_style: list): style_name, style_value = style.split(":") if style_name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = '' + split_style[i] = "" return split_style - cleaned_value = style_value.replace('\"', '').split()[-1] + cleaned_value = style_value.replace("\"", "").split()[-1] constraints_on_value = LIVECARTA_STYLE_ATTRS.get( style_name) value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ style_name] if constraints_on_value and value_not_in_possible_values_list: # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = '' + split_style[i] = "" else: if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data @@ -177,14 +177,14 @@ def update_inline_styles_to_livecarta_convention(split_style: list): def build_inline_style_content(style: str) -> str: """Build inline style with livecarta convention""" - # replace all spaces between '; & letter' to ';' + # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", style) - # when we split style by ';', last element of the list is '' - None + # when we split style by ";", last element of the list is "" - None # remove it - split_style: list = 
list(filter(None, style.split(';'))) - # replace all spaces between ': & letter' to ':' + split_style: list = list(filter(None, style.split(";"))) + # replace all spaces between ": & letter" to ":" split_style = [el.replace( - re.search(r'(:\s*)', el).group(1), ':') for el in split_style] + re.search(r"(:\s*)", el).group(1), ":") for el in split_style] split_style = update_inline_styles_to_livecarta_convention(split_style) style = "; ".join(split_style) @@ -195,17 +195,17 @@ def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRul style_type: cssutils.css.property.Property): if style_type.name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' + css_rule.style[style_type.name] = "" return - cleaned_value = style_type.value.replace('\"', '') + cleaned_value = style_type.value.replace("\"", "") constraints_on_value = LIVECARTA_STYLE_ATTRS.get( style_type.name) value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ style_type.name] if constraints_on_value and value_not_in_possible_values_list: # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = '' + css_rule.style[style_type.name] = "" else: if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data @@ -227,12 +227,12 @@ def build_css_file_content(css_content: str) -> str: return css_text -if __name__ == '__main__': - file = '../../epub/9781627222174.epub' +if __name__ == "__main__": + file = "../../epub/9781627222174.epub" ebooklib_book = epub.read_epub(file) - css_ = ebooklib_book.get_item_with_href('css/epub.css') + css_ = ebooklib_book.get_item_with_href("css/epub.css") css_ = css_.get_content().decode() css_cleaned = build_css_file_content(css_) html_ = ebooklib_book.get_item_with_href( - 'pr01s05.xhtml').get_body_content().decode() - html_soup = BeautifulSoup(html_, features='lxml') + 
"pr01s05.xhtml").get_body_content().decode() + html_soup = BeautifulSoup(html_, features="lxml") diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index dc8d3a2..57f2904 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -17,10 +17,12 @@ from bs4 import BeautifulSoup, Tag from src.util.helpers import BookLogger from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint +from src.epub_converter.image_processing import update_images_src_links +from src.epub_converter.footnotes_processing import preprocess_footnotes from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks,\ - prepare_title, prepare_content, update_images_src_links, preprocess_footnotes +from src.epub_converter.html_epub_preprocessor import process_structural_tags, get_tags_between_chapter_marks,\ + prepare_title, prepare_content class EpubConverter: @@ -57,26 +59,27 @@ class EpubConverter: self.noterefs: List[Tag] = [] # start of the footnote self.footnotes: List[Tag] = [] # end of the footnote - self.logger.log('Image processing.') + self.logger.log("Image processing.") for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): file_name = x.file_name content = x.content self.img_href2img_bytes[file_name] = content - self.logger.log('HTML files reading.') + self.logger.log("HTML files reading.") self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() - # TODO Presets - self.logger.log('Process CSS inline styles.') + self.logger.log("Process CSS inline styles.") self.process_inline_styles_in_html_soup() - self.logger.log('CSS files 
processing.') + self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log('CSS styles adding.') + self.logger.log("CSS styles adding.") self.add_css_styles_to_html_soup() - self.logger.log('Footnotes processing.') + # todo presets + + self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) @@ -85,27 +88,28 @@ class EpubConverter: self.footnotes.extend(footnotes_tags) for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): - noteref.attrs['data-id'] = i + 1 - noteref.attrs['id'] = f'footnote-{i + 1}' - footnote.attrs['href'] = f'#footnote-{i + 1}' + noteref.attrs["data-id"] = i + 1 + noteref.attrs["id"] = f"footnote-{i + 1}" + footnote.attrs["href"] = f"#footnote-{i + 1}" - self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.') - self.logger.log('TOC processing.') + self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") + self.logger.log("TOC processing.") self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed if self.is_toc_empty(): self.build_adjacency_list_from_spine() not_added = [ x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] - self.logger.log(f'Html documents not added to TOC: {not_added}.') + self.logger.log(f"Html documents not added to TOC: {not_added}.") self.add_not_added_files_to_adjacency_list(not_added) - self.logger.log(f'Html internal links and structure processing.') - self.label_chapters_ids_with_tmp_id() + self.logger.log(f"Html internal links and structure processing.") + self.label_chapters_ids_with_lc_id() # used only after parsed toc, ids from toc needed self.process_html_soup_structure_to_line() self.process_internal_links() - self.logger.log(f'Building chapters content.') + 
self.logger.log(f"Define chapters content.") self.define_chapters_content() + self.logger.log(f"Converting html_nodes to LiveCarta chapter items.") def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: # using EpubElements @@ -115,7 +119,7 @@ class EpubConverter: for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_body_text = item.get_body_content() # html.parser closes tags if needed - soup = BeautifulSoup(html_body_text, features='html.parser') + soup = BeautifulSoup(html_body_text, features="html.parser") nodes[item.file_name] = soup return nodes @@ -123,15 +127,15 @@ class EpubConverter: path_to_css_from_html = css_href html_folder = dirname(html_href) path_to_css_from_root = normpath( - join(html_folder, path_to_css_from_html)).replace('\\', '/') + join(html_folder, path_to_css_from_html)).replace("\\", "/") css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) # if in css file we import another css if "@import" in str(css_obj.content): path_to_css_from_root = "css/" + \ - re.search('"(.*)"', str(css_obj.content)).group(1) + re.search("'(.*)'", str(css_obj.content)).group(1) css_obj = self.ebooklib_book.get_item_with_href( path_to_css_from_root) - assert css_obj, f'Css style {css_href} was not in manifest.' + assert css_obj, f"Css style {css_href} was not in manifest." 
css_content: str = css_obj.get_content().decode() return css_content @@ -140,11 +144,11 @@ class EpubConverter: for html_href in self.html_href2html_body_soup: html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={'style': re.compile('.*')}) + attrs={"style": re.compile(".*")}) for tag_initial_inline_style in tags_with_inline_style: - inline_style = tag_initial_inline_style.attrs['style'] - tag_initial_inline_style.attrs['style'] = \ + inline_style = tag_initial_inline_style.attrs["style"] + tag_initial_inline_style.attrs["style"] = \ build_inline_style_content(inline_style) def build_html_and_css_relations(self) -> tuple[dict, dict]: @@ -167,23 +171,23 @@ class EpubConverter: for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): html_content = item.content html_href = item.file_name - soup_html_content = BeautifulSoup(html_content, features='lxml') + soup_html_content = BeautifulSoup(html_content, features="lxml") # check if file links to css file - for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): + for tag in soup_html_content.find_all("link", attrs={"type": "text/css"}): # alternate page of original page (e.g. 
another language) - if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']): + if tag.attrs.get("rel") and ("alternate" in tag.attrs["rel"]): continue - css_href = tag.attrs.get('href') + css_href = tag.attrs.get("href") html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict css_href2css_content[css_href] = build_css_file_content( self.get_css_content(css_href, html_href)) - for i, tag in enumerate(soup_html_content.find_all('style')): + for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string - html_href2css_href[html_href].append(f'href{i}') - css_href2css_content[f'href{i}'] = build_css_file_content( + html_href2css_href[html_href].append(f"href{i}") + css_href2css_content[f"href{i}"] = build_css_file_content( css_content) return html_href2css_href, css_href2css_content @@ -195,7 +199,7 @@ class EpubConverter: """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): - css = '' + css = "" for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] @@ -243,7 +247,7 @@ class EpubConverter: sub_nodes = [] for elem in second: - if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1: + if ("section" in first.title.lower() or "part" in first.title.lower()) and lvl == 1: self.offset_sub_nodes.append( self.build_adjacency_list_from_toc(elem, lvl)) else: @@ -267,7 +271,7 @@ class EpubConverter: self.adjacency_list[-1] = nodes else: - assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' + assert 0, f"Error. 
Element is not tuple/Link/list instance: {type(element)}" def is_toc_empty(self) -> bool: """Function checks is toc empty""" @@ -297,36 +301,36 @@ class EpubConverter: """Function add files that not added to adjacency list""" for i, file in enumerate(not_added): nav_point = NavPoint( - Section(f'To check #{i}, filename: {file}', file)) + Section(f"To check #{i}, filename: {file}", file)) self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(file) - def label_chapters_ids_with_tmp_id(self): + def label_chapters_ids_with_lc_id(self): for html_href in self.html_href2html_body_soup: ids = self.html_href2subchapter_ids[html_href] for i in ids: soup = self.html_href2html_body_soup[html_href] tag = soup.find(id=i) - new_h = soup.new_tag('tmp') - new_h.attrs['class'] = 'converter-chapter-mark' - new_h.attrs['id'] = i + new_h = soup.new_tag("tmp") + new_h.attrs["class"] = "converter-chapter-mark" + new_h.attrs["id"] = i tag.insert_before(new_h) def process_html_soup_structure_to_line(self): # go to line structure for html_href in self.html_href2html_body_soup: soup = self.html_href2html_body_soup[html_href] - self.html_href2html_body_soup[html_href] = unwrap_structural_tags(soup) + self.html_href2html_body_soup[html_href] = process_structural_tags(soup) @staticmethod def create_unique_id(href, id_): - return re.sub(r'([^\w\s])|_|-', '', href) + re.sub(r'[_-]', '0', id_) + return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_) @staticmethod def create_new_anchor_span(soup, id_): new_anchor_span = soup.new_tag("span") - new_anchor_span.attrs['id'] = id_ - new_anchor_span.attrs['class'] = 'link-anchor' + new_anchor_span.attrs["id"] = id_ + new_anchor_span.attrs["class"] = "link-anchor" new_anchor_span.string = "\xa0" return new_anchor_span @@ -353,18 +357,18 @@ class EpubConverter: """ dir_name = os.path.dirname(cur_file_path) normed_path = os.path.normpath(os.path.join( - dir_name, href_in_link)).replace('\\', '/') + dir_name, 
href_in_link)).replace("\\", "/") full_path = [ path for path in self.hrefs_added_to_toc if normed_path in path] if not full_path: - self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. ' - f'While processing href in {internal_link_tag}.') - internal_link_tag.attrs['converter-mark'] = 'bad-link' + self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. " + f"While processing href in {internal_link_tag}.") + internal_link_tag.attrs["converter-mark"] = "bad-link" return None if len(full_path) > 1: - self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}' - f' while {internal_link_tag} processing. The first one will be chosen.') + self.logger.log(f"Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}" + f" while {internal_link_tag} processing. The first one will be chosen.") return full_path[0] @@ -387,30 +391,30 @@ class EpubConverter: """ # 1. rebuild ids to be unique in all documents for toc_href in self.hrefs_added_to_toc: - for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): - if tag.attrs.get('class') == 'converter-chapter-mark': + for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): + if tag.attrs.get("class") == "converter-chapter-mark": continue - if tag.attrs.get('class') == 'footnote-element': + if tag.attrs.get("class") == "footnote-element": continue - new_id = self.create_unique_id(toc_href, tag.attrs['id']) - tag.attrs['id'] = new_id + new_id = self.create_unique_id(toc_href, tag.attrs["id"]) + tag.attrs["id"] = new_id # 2a. 
process anchor which is a whole xhtml file internal_link_reg1 = re.compile( - r'(^(?!https?://).+\.(htm|html|xhtml)$)') + r"(^(?!https?://).+\.(htm|html|xhtml)$)") for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): - a_tag_href = internal_link_tag.attrs['href'] + for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}): + a_tag_href = internal_link_tag.attrs["href"] # find full path a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( toc_href, a_tag_href, internal_link_tag) if not a_tag_href_matched_to_toc: continue - new_id = self.create_unique_id(a_tag_href_matched_to_toc, '') - internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' + new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" if new_id not in self.internal_anchors: anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] new_anchor_span = self.create_new_anchor_span(soup, new_id) @@ -418,22 +422,22 @@ class EpubConverter: anchor_soup.insert(0, new_anchor_span) self.internal_anchors.add(new_id) - del internal_link_tag.attrs['href'] + del internal_link_tag.attrs["href"] # 2b. 
process anchor which is an element in xhtml file - internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)#.+)|(^#.+)') + internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)") for toc_href in self.hrefs_added_to_toc: soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): - a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split( - '#') + for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}): + a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split( + "#") # find full path if a_tag_href: a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) else: a_tag_href_matched_to_toc = os.path.normpath( - toc_href).replace('\\', '/') + toc_href).replace("\\", "/") if not a_tag_href_matched_to_toc: continue @@ -442,45 +446,45 @@ class EpubConverter: a_tag_href_matched_to_toc, a_tag_id) anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={'id': new_id, }) + anchor_tags = anchor_soup.find_all(attrs={"id": new_id, }) anchor_tags = anchor_tags or anchor_soup.find_all( - attrs={'id': a_tag_id}) # if link is a footnote + attrs={"id": a_tag_id}) # if link is a footnote if anchor_tags: if len(anchor_tags) > 1: - self.logger.log(f'Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n' - f'{anchor_tags}\n' - f' While processing {internal_link_tag}') + self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n" + f"{anchor_tags}\n" + f" While processing {internal_link_tag}") anchor_tag = anchor_tags[0] - assert anchor_tag.attrs['id'] in [new_id, a_tag_id] + assert anchor_tag.attrs["id"] in [new_id, a_tag_id] # if anchor is found we could add placeholder for link creation on server side. 
- internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}' + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" # create span to have cyclic links, link has 1 type of class, anchor another - if anchor_tag.attrs['id'] not in self.internal_anchors: + if anchor_tag.attrs["id"] not in self.internal_anchors: new_anchor_span = self.create_new_anchor_span( soup, new_id) anchor_tag.insert_before(new_anchor_span) self.internal_anchors.add(new_id) - del anchor_tag.attrs['id'] - del internal_link_tag.attrs['href'] + del anchor_tag.attrs["id"] + del internal_link_tag.attrs["href"] else: - internal_link_tag.attrs['converter-mark'] = 'bad-link' - self.logger.log(f'Error in {toc_href}. While processing {internal_link_tag} no anchor found.' - f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' - f' Old id={a_tag_id}') + internal_link_tag.attrs["converter-mark"] = "bad-link" + self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found." + f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." + f" Old id={a_tag_id}") - def build_one_chapter(self, nav_point: NavPoint): + def detect_one_chapter(self, nav_point: NavPoint): """ Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) 3 cases: id wraps all chapter content, - id wraps chapter's content + subchapters' content + id wraps chapter"s content + subchapters" content id points to the start of title of a chapter - In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id + In all cases we know where chapter starts. 
Therefore, chapter is all tags between chapter"s id and id of the next chapter/subchapter Parameters ---------- @@ -496,7 +500,7 @@ class EpubConverter: soup = self.html_href2html_body_soup[nav_point.href] chapter_tags = get_tags_between_chapter_marks( first_id=nav_point.id, href=nav_point.href, html_soup=soup) - new_tree = BeautifulSoup('', 'html.parser') + new_tree = BeautifulSoup("", "html.parser") for tag in chapter_tags: new_tree.append(tag) self.href_chapter_id2soup_html[( @@ -504,16 +508,30 @@ class EpubConverter: if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - self.build_one_chapter(sub_node) + self.detect_one_chapter(sub_node) def define_chapters_content(self): """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: for point in top_level_nav_points: - self.build_one_chapter(point) + self.detect_one_chapter(point) - def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: + """ + Function prepare style, tags to json structure + Parameters + ---------- + nav_point: NavPoint + + lvl: int + level of chapter + Returns + ------- + ChapterItem + built chapter + + """ title = nav_point.title if nav_point.id: content: BeautifulSoup = self.href_chapter_id2soup_html[( @@ -526,7 +544,7 @@ class EpubConverter: access=self.access, path2aws_path=self.book_image_src_path2aws_path, book_id=self.file_path.stem - if hasattr(self.file_path, 'stem') else 'book_id') + if hasattr(self.file_path, "stem") else "book_id") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS title_preprocessed = prepare_title(title) @@ -534,15 +552,16 @@ class EpubConverter: remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! 
not EpubHtmlItems won't be added to chapter + # if it doesn't have subchapters if self.adjacency_list.get(nav_point): for sub_node in self.adjacency_list[nav_point]: - sub_chapter_item = self.node_to_livecarta_chapter_item( + sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) if self.logger: - indent = ' ' * lvl - self.logger.log(f'{indent}Chapter: {title} is prepared.') + indent = " " * lvl + self.logger.log(f"{indent}Chapter: {title} is prepared.") return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self) -> dict: @@ -550,12 +569,13 @@ class EpubConverter: top_level_nav_points = self.adjacency_list[-1] top_level_chapters = [] - for nav_point in top_level_nav_points: - chapter = self.node_to_livecarta_chapter_item(nav_point) + # loop through to level chapters + for tl_nav_point in top_level_nav_points: + chapter = self.html_node_to_livecarta_chapter_item(tl_nav_point) top_level_chapters.append(chapter) top_level_dict_chapters = [x.to_dict() for x in top_level_chapters] - self.logger.log(f'Anchors found: {len(self.internal_anchors)}.') - self.logger.log('End conversion.') + self.logger.log(f"Anchors found: {len(self.internal_anchors)}.") + self.logger.log("End conversion.") return { "content": top_level_dict_chapters, @@ -564,12 +584,12 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = '../../epub/9781614382264.epub' + epub_file_path = "../../epub/9781614382264.epub" logger_object = BookLogger( - name='epub', book_id=epub_file_path.split('/')[-1]) + name="epub", book_id=epub_file_path.split("/")[-1]) json_converter = EpubConverter(epub_file_path, logger=logger_object) content_dict = json_converter.convert_to_dict() - with codecs.open(epub_file_path.replace('epub', 'json'), 'w', encoding='utf-8') as f_json: + with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: json.dump(content_dict, f_json, 
ensure_ascii=False) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index cb6e080..8e92a40 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -7,7 +7,7 @@ class EpubBook(BookSolver): def __init__(self, book_id=0, access=None, main_logger=None): super().__init__(book_id, access, main_logger) - self.book_type = 'epub' + self.book_type = "epub" def get_converted_book(self): """ diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py new file mode 100644 index 0000000..d9840f3 --- /dev/null +++ b/src/epub_converter/footnotes_processing.py @@ -0,0 +1,87 @@ +from typing import Tuple + +from bs4 import BeautifulSoup, Tag + + +def _replace_with_livecarta_anchor_tag(anchor, i): + """Function replace noteref_tag(anchor) with new livecarta tag""" + new_tag = BeautifulSoup(features="lxml").new_tag("sup") + new_tag["class"] = "footnote-element" + new_tag["data-id"] = i + 1 + new_tag["id"] = f"footnote-{i + 1}" + new_tag.string = "*" + if anchor.parent.name == "sup": + anchor.parent.unwrap() + anchor.replace_with(new_tag) + return new_tag + + +def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name="epub:type") \ + -> Tuple[list, list, list]: + """ + This function preprocessing footnotes + This function should be earlier that adding fonts in pipeline. + +

Here is an example footnote1

+ + + """ + footnotes = [] + noterefs_tags = source_html_tag.find_all( + attrs={noteref_attr_name: "noteref"}) + bad_noterefs_tags = set( + [tag for tag in noterefs_tags if not tag.attrs.get("href")]) + noterefs_tags = [ + tag for tag in noterefs_tags if tag not in bad_noterefs_tags] + new_noterefs_tags = [] + new_footnotes_tags = [] + [tag.decompose() for tag in bad_noterefs_tags] + + def parse_a_tag_href(s: str) -> Tuple[str, str]: + """Returns name of file & id of an anchor""" + assert "#" in s, f"Error. Unexpected href: {s} in a tag. Href must contain an id." + f, id_ = s.split("#") + return f, id_ + + def verify_footnote_tag(tags: list): + """Function verifies is tag - footnote""" + assert len(tags) <= 1, f"Error, Multiple id: {href}.\n{tags}" + if len(tags) == 0: + anchored_tags = list(target_html_tag.find_all(id=element_id)) + if len(anchored_tags): + print( + f"Warning. Href for tag is detected as footnote:\n{noteref_tag}") + return anchored_tags + else: + assert 0, f"Error, No element with id: {href} found." + return tags + + for i, noteref_tag in enumerate(noterefs_tags): + href = noteref_tag.attrs["href"] + file, element_id = parse_a_tag_href(href) + if not file: + target_html_tag = source_html_tag + else: + target_html_tag = href2soup_html.get(file) + if not target_html_tag: + print( + f"Error while footnotes processing. 
For {noteref_tag} invalid path: {file}.") + continue + + possible_footnote = "note|footnote|endnote|rearenote" + expected_footnote_tags = list(target_html_tag.find_all(id=element_id, + attrs={"epub:type": re.compile(possible_footnote)})) + + expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) + footnote_tag = expected_footnote_tags[0] + if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote": + footnote_tag = footnote_tag.parent + new_noterefs_tags.append( + _replace_with_livecarta_anchor_tag(noteref_tag, i)) + content = footnote_tag.text + # footnote_tag.decompose() + footnotes.append(content) + footnote_tag = footnote_tag.find( + attrs={"role": "doc-backlink"}) or footnote_tag + new_footnotes_tags.append(footnote_tag) + return footnotes, new_noterefs_tags, new_footnotes_tags \ No newline at end of file diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index d94c43a..efdba02 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -1,305 +1,107 @@ -import os import re -import pathlib -from typing import Tuple from bs4 import BeautifulSoup, NavigableString, Tag, Comment -from src.access import Access from src.livecarta_config import LiveCartaConfig -def _replace_with_livecarta_anchor_tag(anchor, i): - """Function replace noteref_tag(anchor) with new livecarta tag""" - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' - if anchor.parent.name == 'sup': - anchor.parent.unwrap() - anchor.replace_with(new_tag) - return new_tag - - -def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \ - -> Tuple[list, list, list]: +def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): """ - This 
function preprocessing footnotes - This function should be earlier that adding fonts in pipeline. + Function adds span with id from tag_to_be_removed + because this tag will be removed(unwrapped/extract) + Parameters + ---------- + tag_to_be_removed: Soup object + chapter_tag: BeautifulSoup -

Here is an example footnote1

- + Returns + ------- + None + updated body tag - """ - footnotes = [] - noterefs_tags = source_html_tag.find_all( - attrs={noteref_attr_name: 'noteref'}) - bad_noterefs_tags = set( - [tag for tag in noterefs_tags if not tag.attrs.get('href')]) - noterefs_tags = [ - tag for tag in noterefs_tags if tag not in bad_noterefs_tags] - new_noterefs_tags = [] - new_footnotes_tags = [] - [tag.decompose() for tag in bad_noterefs_tags] + """ + def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list): + """Function inserts span before tag aren't supported by livecarta""" + new_tag = chapter_tag.new_tag("span") + new_tag.attrs["id"] = id_ or "" + new_tag.attrs["class"] = class_ or "" + new_tag.string = "\xa0" + tag_to_be_removed.insert_before(new_tag) - def parse_a_tag_href(s: str) -> Tuple[str, str]: - """Returns name of file & id of an anchor""" - assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' - f, id_ = s.split('#') - return f, id_ - - def verify_footnote_tag(tags: list): - """Function verifies is tag - footnote""" - assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' - if len(tags) == 0: - anchored_tags = list(target_html_tag.find_all(id=element_id)) - if len(anchored_tags): - print( - f'Warning. Href for tag is detected as footnote:\n{noteref_tag}') - return anchored_tags - else: - assert 0, f'Error, No element with id: {href} found.' - - return tags - - for i, noteref_tag in enumerate(noterefs_tags): - href = noteref_tag.attrs['href'] - file, element_id = parse_a_tag_href(href) - if not file: - target_html_tag = source_html_tag - else: - target_html_tag = href2soup_html.get(file) - if not target_html_tag: - print( - f'Error while footnotes processing. 
For {noteref_tag} invalid path: {file}.') - continue - - possible_footnote = 'note|footnote|endnote|rearenote' - expected_footnote_tags = list(target_html_tag.find_all(id=element_id, - attrs={'epub:type': re.compile(possible_footnote)})) - - expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) - footnote_tag = expected_footnote_tags[0] - if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': - footnote_tag = footnote_tag.parent - new_noterefs_tags.append( - _replace_with_livecarta_anchor_tag(noteref_tag, i)) - content = footnote_tag.text - # footnote_tag.decompose() - footnotes.append(content) - footnote_tag = footnote_tag.find( - attrs={'role': 'doc-backlink'}) or footnote_tag - new_footnotes_tags.append(footnote_tag) - - return footnotes, new_noterefs_tags, new_footnotes_tags + if tag_to_be_removed.attrs.get("id"): + _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed, + id_=tag_to_be_removed.attrs["id"], + class_=tag_to_be_removed.attrs.get("class")) -def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup: +def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup: """ Main function that works with structure of html. Make changes inplace. Parameters ---------- - body_tag: Tag, soup object + chapter_tag: Tag, soup object Steps ---------- 1. Extracts tags that are not needed 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. - Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed. - This tag must have a body_tag as a parent. + Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed. + This tag must have a chapter_tag as a parent. Otherwise, it is wrapped with some tags. Like: -

+

3. Headings that are not supported by livecarta converts to

4. Wrapping NavigableString Returns ------- - body_tag: Tag, BeautifulSoup - adjusted body_tag + chapter_tag: Tag, BeautifulSoup + adjusted chapter_tag """ - def _preserve_class_in_aside_tag(tag_): - """to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" - # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance( - tag_.attrs['class'], list) else tag_.attrs['class'][0] - if tag_.parent.name == 'aside': - if not tag_.parent.attrs.get('class'): - tag_.parent.attrs['class'] = tag_class + def _tags_to_correspond_livecarta_tag(chapter_tag): + """Function to replace all tags to correspond livecarta tags""" + for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): + for key in reg_key: + # text = tag if isinstance(tag, NavigableString) else tag.text + tags = chapter_tag.find_all(re.compile(key)) + for tag in tags: + tag.name = to_replace_value - def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: - """ - Function saves css style inherited from class, copies class to child

- returns True, if

could be unwrapped - Parameters - ---------- - tag_: Tag, soup object + def _unwrap_tags(chapter_tag): + """Function unwrap tags and move id to span""" + for tag in LiveCartaConfig. TAGS_TO_UNWRAP: + for s in chapter_tag.find_all(tag): + _add_span_to_save_ids_for_links(s, chapter_tag) + s.unwrap() - Returns - ------- - bool + def _mark_parent_is_body(chapter_tag): + # check marks for chapter starting are on the same level - 1st + marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) - """ - # this is for Wiley books with boxes - tag_class = tag_.attrs['class'] if not isinstance( - tag_.attrs['class'], list) else tag_.attrs['class'][0] - if 'feature' not in tag_class: - return True - child_p_tags = tag_.find_all("p") - if len(child_p_tags) == 1: - child_p_tag = child_p_tags[0] - if not child_p_tag.attrs.get('class'): - child_p_tag.attrs['class'] = tag_class - return True + # fix marks to be on 1 level + for mark in marks: + while mark.parent != chapter_tag: + mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - elif len(child_p_tags) > 1: - tag_.name = 'p' - return False - else: - return True + _tags_to_correspond_livecarta_tag(chapter_tag) - def _add_span_to_save_ids_for_links(tag_to_be_removed): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, - id_=tag_to_be_removed.attrs['id'], - class_=tag_to_be_removed.attrs.get('class')) + _unwrap_tags(chapter_tag) - def _replace_div_tag_with_table(): - """ - Function replace
with : - 1. Convert div with certain classes to tables - 2. Add background color to div with background-color + _mark_parent_is_body(chapter_tag) - """ - for div in body_tag.find_all("div"): - if div.attrs.get('class'): - div_class = div.attrs['class'] if not isinstance( - div.attrs['class'], list) else div.attrs['class'][0] - if div_class in ['C409', 'C409a']: - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9') - - elif div_class in ['C441', 'C816']: - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8') - - if div.attrs.get('style'): - if 'background-color' in div.attrs['style']: - end_index = div.attrs['style'].find( - 'background-color') + len('background-color') - start_index_of_color = end_index + 2 - bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7] - _wrap_block_tag_with_table( - body_tag, old_tag=div, width='100', border='', bg_color=bg_color) - elif div.attrs.get('style') == '': - del div.attrs['style'] - - structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data', - 'figure', 'footer', 'iframe', 'span', 'p' - ] - - if div.contents: - is_not_struct_tag = [ - child.name not in structural_tags_names for child in div.contents] - if all(is_not_struct_tag): - div.name = 'p' - continue - _add_span_to_save_ids_for_links(div) - div.unwrap() - - def _heading_tag_to_p_tag(body_tag): - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - - # comments removal - for tag in body_tag.find_all(): - for element in tag(text=lambda text: isinstance(text, Comment)): - element.extract() - - _replace_div_tag_with_table() - - for s in body_tag.find_all("section"): - could_be_unwrapped = True - if s.attrs.get('class'): - 
could_be_unwrapped = _preserve_class_in_section_tag(s) - _add_span_to_save_ids_for_links(s) - if could_be_unwrapped: - s.unwrap() - - for s in body_tag.find_all("article"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("figure"): - s.name = 'p' - # to center image inside this tag - s.attrs['style'] = "text-align: center;" - - for s in body_tag.find_all("figcaption"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("aside"): - s.name = 'blockquote' - - for s in body_tag.find_all("main"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("body"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("html"): - _add_span_to_save_ids_for_links(s) - s.unwrap() - - for s in body_tag.find_all("header"): - s.name = 'span' - - # check marks for chapter starting are on the same 1 level - marks = body_tag.find_all(attrs={'class': 'converter-chapter-mark'}) - parents_marks_are_body = [x.parent == body_tag for x in marks] - - # fix marks to be on 1 level - if not all(parents_marks_are_body): - for x in marks: - while x.parent != body_tag: - x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - - parents_marks_are_body = [x.parent == body_tag for x in marks] - assert all( - parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' - - _heading_tag_to_p_tag(body_tag) - - # wrap NavigableString with

- for node in body_tag: - if isinstance(node, NavigableString): - content = str(node) - content = re.sub(r'([\n\t\xa0])', ' ', content) - content = content.strip() - if content: - tag = body_tag.new_tag('p') - tag.append(str(node)) - node.replace_with(tag) - return body_tag + return chapter_tag def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: - """After processing on a first_id that corresponds to current chapter, + """ + After processing on a first_id that corresponds to current chapter, from initial html_soup all tags from current chapter are extracted Parameters ---------- - first_id: - Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark' - href: - Name of current chapter's file + first_id: str + Id that point where a chapter starts. A Tag with class: "converter-chapter-mark" + href: str + Name of current chapters file html_soup: Tag Soup object of current file @@ -310,13 +112,13 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu """ marked_tags = html_soup.find( - attrs={'id': first_id, 'class': 'converter-chapter-mark'}) + attrs={"id": first_id, "class": "converter-chapter-mark"}) if marked_tags: next_tag = marked_tags.next_sibling tags = [] while next_tag: - if not isinstance(next_tag, NavigableString) and\ - (next_tag.attrs.get('class') == 'converter-chapter-mark'): + if not isinstance(next_tag, NavigableString) and \ + (next_tag.attrs.get("class") == "converter-chapter-mark"): break tags.append(next_tag) next_tag = next_tag.next_sibling @@ -327,182 +129,119 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu html_soup.smooth() else: - assert 0, f'Warning: no match for {first_id, href}' + assert 0, f"Warning: no match for {first_id, href}" return tags -def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images to Amazon web service""" - link_path = 
access.send_image( - img_file_path, doc_id=book_id, img_content=img_content) - return link_path - - -def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images locally""" - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'../json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - new_img_path = new_path / os.path.basename(img_file_path) - f = open(new_img_path, 'wb+') - f.write(img_content) - f.close() - - return new_img_path - - -def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, - path_to_html: str, - access=None, - path2aws_path: dict = None, - book_id: str = None) -> dict: - """Function makes dictionary image_src_path -> Amazon web service_path""" - img_tags = body_tag.find_all('img') - - for img in img_tags: - path_to_img_from_html = img.attrs.get('src') - html_folder = os.path.dirname(path_to_html) - path_to_img_from_root = os.path.normpath(os.path.join( - html_folder, path_to_img_from_html)).replace('\\', '/') - - assert path_to_img_from_root in href2img_content, \ - f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' 
- - img_content = href2img_content[path_to_img_from_root] - if access is not None: - if path_to_img_from_root in path2aws_path: - new_folder = path2aws_path[path_to_img_from_root] - else: - new_folder = save_image_to_aws( - access, path_to_img_from_root, img_content, book_id) - path2aws_path[path_to_img_from_root] = new_folder - else: - new_folder = save_image_locally( - path_to_img_from_root, img_content, 'book_id') - - img.attrs['src'] = str(new_folder) - if img.attrs.get('width'): - del img.attrs['width'] - if img.attrs.get('height'): - del img.attrs['height'] - if img.attrs.get('style'): - del img.attrs['style'] - return path2aws_path - - -def _clean_title_from_numbering(title: str): - """Function removes numbering from titles""" - title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. 
?)+', '', title) # delete chapter I, (ABC) from the title - return title - - def prepare_title(title_of_chapter: str) -> str: """Function finalise processing/cleaning title""" - title_str = BeautifulSoup(title_of_chapter, features='lxml').string - title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) - title_str = re.sub(r' +', ' ', title_str).rstrip() - title_str = _clean_title_from_numbering(title_str) + title_str = BeautifulSoup(title_of_chapter, features="lxml").string + title_str = re.sub(r"([\n\t\xa0])", " ", title_str) + title_str = re.sub(r" +", " ", title_str).rstrip() + # clean whitespace characters ([\r\n\t\f\v ]) + title_str = re.sub(r"(^\s+)|(\s+$)", "", title_str) return title_str -def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): - """Function inserts span before tag aren't supported by livecarta""" - new_tag = main_tag.new_tag("span") - new_tag.attrs['id'] = id_ or '' - new_tag.attrs['class'] = class_ or '' - new_tag.string = "\xa0" - tag.insert_before(new_tag) +def _remove_comments(chapter_tag): + for tag in chapter_tag.find_all(): + for element in tag(text=lambda text: isinstance(text, Comment)): + element.extract() -def _clean_headings_content(content: BeautifulSoup, title: str): - def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=tag_to_be_removed.attrs.get( - 'id'), - class_=tag_to_be_removed.attrs.get('class')) +def _wrap_strings_with_p(chapter_tag): + # wrap NavigableString with

+ for node in chapter_tag: + if isinstance(node, NavigableString): + content = str(node) + content = re.sub(r"([\n\t\xa0])", " ", content) + # remove spaces at the beginning and at the end of the string: + content = content.strip() + if content: + tag = chapter_tag.new_tag("p") + tag.append(str(node)) + node.replace_with(tag) - for sub_tag in tag_to_be_removed.find_all(): - if sub_tag.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=sub_tag.attrs['id'], - class_=sub_tag.attrs.get('class')) - title = title.lower() - for child in content.contents: - if isinstance(child, NavigableString): - text = child - else: - text = child.text - if text and re.sub(r'([\n\t\xa0])', '', text): - text = re.sub(r'([\n\t\xa0])', ' ', text) - text = re.sub(r' +', ' ', text).strip() - text = text.lower() - if title == text: - add_span_to_save_ids_for_links(child, content) - child.extract() - elif (title in text) and (child.name in ['h1', 'h2', 'h3']): - add_span_to_save_ids_for_links(child, content) - child.extract() +def _remove_headings_content(content_tag, title_of_chapter: str): + """ + Function + clean/remove headings from chapter in order to avoid duplication of chapter titles in the content + add span with id in order to + Parameters + ---------- + content_tag: soup object + Tag of the page + title_of_chapter: str + Chapter title + + Returns + ------- + None + clean/remove headings & add span with id + + """ + title_of_chapter = title_of_chapter.lower() + for tag in content_tag.contents: + text = tag if isinstance(tag, NavigableString) else tag.text + if text: + text = re.sub(r"^[\s\xa0]+|[\s\xa0]+$", " ", text).lower() + if title_of_chapter == text or \ + (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)): + _add_span_to_save_ids_for_links(tag, content_tag) + tag.extract() break -def _process_lists(body_tag: BeautifulSoup): +# todo remove +def _process_lists(chapter_tag: BeautifulSoup): """ Function - process tags

  • . - unwrap

    tags. Parameters ---------- - body_tag: Tag, soup object + chapter_tag: Tag, soup object Returns ------- None """ - li_tags = body_tag.find_all("li") + li_tags = chapter_tag.find_all("li") for li_tag in li_tags: if li_tag.p: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() -def _preprocess_table(body_tag: BeautifulSoup): +def _preprocess_table(chapter_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" - tables = body_tag.find_all("table") + tables = chapter_tag.find_all("table") for table in tables: t_tags = table.find_all(re.compile("td|th|tr")) for t_tag in t_tags: - style = t_tag.get('style') - width = '' + style = t_tag.get("style") + width = "" if style: width_match = re.search( r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) if width_match: size = width_match.group(1) - width = size + 'px' + width = size + "px" - t_tag.attrs['width'] = t_tag.get('width') or width + t_tag.attrs["width"] = t_tag.get("width") or width - if t_tag.attrs.get('style'): - t_tag.attrs['style'] = t_tag.attrs['style'].replace( - 'border:0;', '') + if t_tag.attrs.get("style"): + t_tag.attrs["style"] = t_tag.attrs["style"].replace( + "border:0;", "") - elif t_tag.attrs.get('style') == '': - del t_tag.attrs['style'] + elif t_tag.attrs.get("style") == "": + del t_tag.attrs["style"] - if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']: - table.attrs['border'] = '1' + if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: + table.attrs["border"] = "1" def _preprocess_code_tags(chapter_tag: BeautifulSoup): @@ -523,25 +262,15 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup): if not code.parent.name == "pre": code.name = "span" continue - # if tag isn't in pre and doesn't have style - if not code.attrs.get('style'): - code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;' - - -def _prepare_formatted(text: str) -> str: - """Function replaces special symbols with their 
Unicode representation""" - text = text.replace("<", "\x3C") - text = text.replace(">", "\x3E") - text = text.replace('\t', "\xa0 \xa0 ") #     - text = text.replace(' ', "\xa0") - text = text.replace('𝑓', "\xf0\x9d\x91\x93") - return text + # if tag isn"t in pre and doesn"t have style + if not code.attrs.get("style"): + code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;" def _preprocess_pre_tags(chapter_tag: BeautifulSoup): """ Function preprocessing

     tags
    -    Wrap string of the tag with  if it's necessary
    +    Wrap string of the tag with  if it's necessary
         Parameters
         ----------
         chapter_tag: Tag, soup object
    @@ -564,6 +293,42 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
                 pre.append(code)
     
     
    +# TODO: replace
    +def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None):
    +    """Function wraps  with 
  • """ + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_be_wrapped.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + + +def _preprocess_div_tags(chapter_tag): + """ + Function replace
    with
    : + """ + for div in chapter_tag.find_all("div"): + if div.attrs.get('style'): + _wrap_tag_with_table( + chapter_tag, + tag_to_be_wrapped=div, + width=div.attrs['width'] if div.attrs.get('width') else '100', + border=div.attrs['border'] if div.attrs.get('border') else None, + bg_color=div.attrs['bgcolor'] if div.attrs.get('bgcolor') else None) + else: + div.name = "p" + continue + _add_span_to_save_ids_for_links(div, chapter_tag) + div.unwrap() + + def _clean_wiley_block(block): hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) for hr in hrs: @@ -571,48 +336,30 @@ def _clean_wiley_block(block): h = block.find(re.compile("h[1-9]")) if h: h.name = "p" - h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) + h.insert_before(BeautifulSoup(features="lxml").new_tag("br")) -def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): - """Function wraps with
    """ - table = main_tag.new_tag("table") - table.attrs['border'] = border - table.attrs['align'] = 'center' - table.attrs['style'] = f'width:{width}%;' - tbody = main_tag.new_tag("tbody") - tr = main_tag.new_tag("tr") - td = main_tag.new_tag("td") - # td.attrs['border-radius'] = '8px' - if bg_color: - td.attrs['bgcolor'] = bg_color - old_tag.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features='lxml').new_tag("br")) - return table def _preprocess_block_tags(chapter_tag: Tag): """Function preprocessing tags""" for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): _clean_wiley_block(block) - color = '#DDDDDD' if block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, block, bg_color=color) - block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) + color = "#DDDDDD" if block.attrs.get( + "class") == "feature1" else None + color = "#EEEEEE" if block.attrs.get( + "class") == "feature2" else color + _wrap_tag_with_table(chapter_tag, block, bg_color=color) + block.insert_after(BeautifulSoup(features="lxml").new_tag("br")) block.unwrap() for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): _clean_wiley_block(future_block) - color = '#DDDDDD' if future_block.attrs.get( - 'class') == 'feature1' else None - color = '#EEEEEE' if future_block.attrs.get( - 'class') == 'feature2' else color - _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) + color = "#DDDDDD" if future_block.attrs.get( + "class") == "feature1" else None + color = "#EEEEEE" if future_block.attrs.get( + "class") == "feature2" else color + _wrap_tag_with_table(chapter_tag, future_block, bg_color=color) def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: @@ -628,10 +375,9 @@ def 
prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro Steps ---------- - 1. find \n - 2. heading removal - 3. processing tags - 4. class removal + 1. heading removal + 2. processing tags + 3. class removal Returns ------- @@ -639,28 +385,27 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro prepared content """ - # 1. find \n - to_remove = [] - for child in content_tag.contents: - if isinstance(child, NavigableString): - s = re.sub(r'([\n\t])', '', child.string) - if s == '': - to_remove.append(child) + # 1. remove comments + _remove_comments(content_tag) - # 2. heading removal + # 2. wrap NavigableString with tag

    + _wrap_strings_with_p(content_tag) + + # 3. heading removal if remove_title_from_chapter: - _clean_headings_content(content_tag, title_str) + _remove_headings_content(content_tag, title_str) - # 3. processing tags (

  • ,
  • , ,
    , )
    +    # 4. processing tags (
  • ,
  • , ,
    , 
    , ) _process_lists(content_tag) _preprocess_table(content_tag) _preprocess_code_tags(content_tag) _preprocess_pre_tags(content_tag) + _preprocess_div_tags(content_tag) _preprocess_block_tags(content_tag) - # 4. class removal + # 5. remove classes that were created by converter for tag in content_tag.find_all(recursive=True): - if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', - 'footnote-element']): - del tag.attrs['class'] + if hasattr(tag, "attrs") and tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] return str(content_tag) diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py new file mode 100644 index 0000000..950bbdd --- /dev/null +++ b/src/epub_converter/image_processing.py @@ -0,0 +1,67 @@ +import os +import pathlib + +from bs4 import BeautifulSoup + +from src.access import Access + + +def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images to Amazon web service""" + link_path = access.send_image( + img_file_path, doc_id=book_id, img_content=img_content) + return link_path + + +def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images locally""" + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f"../json/img_{book_id}/")) + new_path.mkdir(exist_ok=True) + + new_img_path = new_path / os.path.basename(img_file_path) + f = open(new_img_path, "wb+") + f.write(img_content) + f.close() + return new_img_path + + +def update_images_src_links(body_tag: BeautifulSoup, + href2img_content: dict, + path_to_html: str, + access=None, + path2aws_path: dict = None, + book_id: str = None) -> dict: + """Function makes dictionary image_src_path -> Amazon web service_path""" + img_tags = body_tag.find_all("img") + 
+ for img in img_tags: + path_to_img_from_html = img.attrs.get("src") + html_folder = os.path.dirname(path_to_html) + path_to_img_from_root = os.path.normpath(os.path.join( + html_folder, path_to_img_from_html)).replace("\\", "/") + + assert path_to_img_from_root in href2img_content, \ + f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest." + + img_content = href2img_content[path_to_img_from_root] + if access is not None: + if path_to_img_from_root in path2aws_path: + new_folder = path2aws_path[path_to_img_from_root] + else: + new_folder = save_image_to_aws( + access, path_to_img_from_root, img_content, book_id) + path2aws_path[path_to_img_from_root] = new_folder + else: + new_folder = save_image_locally( + path_to_img_from_root, img_content, "book_id") + + img.attrs["src"] = str(new_folder) + if img.attrs.get("width"): + del img.attrs["width"] + if img.attrs.get("height"): + del img.attrs["height"] + if img.attrs.get("style"): + del img.attrs["style"] + return path2aws_path \ No newline at end of file diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_css_style_converter.py index 37b2672..269d8ed 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_css_style_converter.py @@ -21,33 +21,33 @@ class TagStyleConverter: @staticmethod def remove_white_if_no_bgcolor(style_, tag): """Function remove text white color if there is no bg color""" - if 'background' in style_: + if "background" in style_: style_ = style_.replace( - 'background:', 'background-color:') + "background:", "background-color:") return style_ # if text color is white, check that we have bg-color - if ('color:#ffffff' in style_) or ('color:#fff' in style_) or ('color:white' in style_): + if ("color:#ffffff" in style_) or ("color:#fff" in style_) or ("color:white" in style_): # if bg color is inherited, just return style as is for parent_tag in tag.parents: - # white bg color not need to be checked as we do 
not write 'white bg color' - tag_with_bg = ['span', 'td', 'tr', 'p'] + # white bg color not need to be checked as we do not write "white bg color" + tag_with_bg = ["span", "td", "tr", "p"] tag_will_be_saved = parent_tag.name in tag_with_bg - has_bg = parent_tag.attrs.get('style') and ( - 'background' in parent_tag.attrs.get('style')) + has_bg = parent_tag.attrs.get("style") and ( + "background" in parent_tag.attrs.get("style")) if has_bg and tag_will_be_saved: return style_ children = tag.find_all() for child in children: - if child.attrs.get('style') and ('background' in child.attrs.get('style')): - tmp_style = child.attrs['style'] + '; color:#fff; ' - child.attrs['style'] = tmp_style + if child.attrs.get("style") and ("background" in child.attrs.get("style")): + tmp_style = child.attrs["style"] + "; color:#fff; " + child.attrs["style"] = tmp_style - # for child with bg color we added white text color, so this tag don't need white color - style_ = style_.replace('color:#fff;', '') - style_ = style_.replace('color:#ffffff;', '') - style_ = style_.replace('color:white;', '') + # for child with bg color we added white text color, so this tag don"t need white color + style_ = style_.replace("color:#fff;", "") + style_ = style_.replace("color:#ffffff;", "") + style_ = style_.replace("color:white;", "") return style_ @staticmethod @@ -68,7 +68,7 @@ class TagStyleConverter: Parameters ---------- split_style: list - list of styles split by ';' + list of styles split by ";" Returns ---------- @@ -79,9 +79,9 @@ class TagStyleConverter: processed_style = ";".join(split_style) margin_left_regexp = re.compile( - r'((margin-left|margin): *(-*\w+);*)') + r"((margin-left|margin): *(-*\w+);*)") text_indent_regexp = re.compile( - r'(text-indent: *(-*\w+);*)') + r"(text-indent: *(-*\w+);*)") has_margin = re.search(margin_left_regexp, processed_style) has_text_indent = re.search(text_indent_regexp, processed_style) @@ -92,21 +92,21 @@ class TagStyleConverter: if has_text_indent: 
num_ti = abs(int("0" + "".join( filter(str.isdigit, str(has_text_indent.group(2)))))) - processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + - str(abs(num_m - num_ti)) + 'px; ') + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + + str(abs(num_m - num_ti)) + "px; ") processed_style = processed_style.replace( - has_margin.group(1), '') + has_margin.group(1), "") return processed_style - processed_style = processed_style.replace(has_margin.group(1), 'text-indent: ' + - str(abs(num_m)) + 'px; ') + processed_style = processed_style.replace(has_margin.group(1), "text-indent: " + + str(abs(num_m)) + "px; ") return processed_style elif has_text_indent: - processed_style = processed_style.replace(has_text_indent.group(1), 'text-indent: ' + + processed_style = processed_style.replace(has_text_indent.group(1), "text-indent: " + str(abs(int("0" + "".join( filter(str.isdigit, str(has_text_indent.group(2))))))) - + 'px; ') + + "px; ") return processed_style return processed_style @@ -126,18 +126,18 @@ class TagStyleConverter: processed inline style """ - inline_style = self.tag_inline_style.attrs.get('style') + ';' - # 1. Remove white color if tag doesn't have background color in style + inline_style = self.tag_inline_style.attrs.get("style") + ";" + # 1. Remove white color if tag doesn"t have background color in style inline_style = self.remove_white_if_no_bgcolor( inline_style, self.tag_inline_style) inline_style = inline_style.replace( - 'list-style-image', 'list-style-type') + "list-style-image", "list-style-type") # 2. 
Create list of styles from inline style - # replace all spaces between '; & letter' to ';' + # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", inline_style) - # when we split style by ';', last element of the list is '' - None (remove it) - split_inline_style: list = list(filter(None, style.split(';'))) + # when we split style by ";", last element of the list is "" - None (remove it) + split_inline_style: list = list(filter(None, style.split(";"))) # 3. Duplicate styles check - if the tag had duplicate styles split_inline_style = self.duplicate_styles_check(split_inline_style) @@ -164,7 +164,7 @@ class TagStyleConverter: """ styles_to_remove = [] for k in LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: - if f'{k[0]}:{k[1]}' in style: + if f"{k[0]}:{k[1]}" in style: styles_to_remove.append(k) return styles_to_remove @@ -172,11 +172,11 @@ class TagStyleConverter: # adds , , instead of styles styles_to_remove = self.check_style_to_be_tag(self.style) for i, (attr, value) in enumerate(styles_to_remove): - self.tag_inline_style.attrs['style'] = self.tag_inline_style.attrs['style']\ - .replace(f'{attr}:{value};', '').strip() + self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\ + .replace(f"{attr}:{value};", "").strip() corr_tag_name = LiveCartaConfig.LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[( attr, value)] - correspond_tag = BeautifulSoup(features='lxml').new_tag(corr_tag_name) + correspond_tag = BeautifulSoup(features="lxml").new_tag(corr_tag_name) for content in reversed(self.tag_inline_style.contents): correspond_tag.insert(0, content.extract()) self.tag_inline_style.append(correspond_tag) @@ -184,34 +184,34 @@ class TagStyleConverter: @staticmethod def wrap_span_in_tag_to_save_style_attrs(initial_tag): """Function designed to save style attrs that cannot be in tag.name -> span""" - dictkeys_pattern = re.compile('|'.join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) - if re.findall(dictkeys_pattern, 
initial_tag.name) and initial_tag.attrs.get('style'): + dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) + if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"): styles_can_be_in_tag = [style for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() if re.match(tag, initial_tag.name) for style in styles] styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS if attr not in styles_can_be_in_tag] - span_style = initial_tag.attrs['style'] + span_style = initial_tag.attrs["style"] # here check that this style is exactly the same. - # Not 'align' when we have 'text-align', or 'border' when we have 'border-top' - styles_to_be_saved_in_span = [((attr + ':') in span_style) & ( - '-' + attr not in span_style) for attr in styles_cant_be_in_tag] + # Not "align" when we have "text-align", or "border" when we have "border-top" + styles_to_be_saved_in_span = [((attr + ":") in span_style) & ( + "-" + attr not in span_style) for attr in styles_cant_be_in_tag] if any(styles_to_be_saved_in_span): # if we find styles that cannot be in -> wrap them in span - tag = BeautifulSoup(features='lxml').new_tag(f'{initial_tag.name}') - style = '' - possible_attrs_regexp = [re.compile(fr'({style}: *(\w+);)') for style in styles_can_be_in_tag] + tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") + style = "" + possible_attrs_regexp = [re.compile(fr"({style}: *(\w+);)") for style in styles_can_be_in_tag] for possible_attr_regexp in possible_attrs_regexp: has_style_attrs = re.search( possible_attr_regexp, span_style) if has_style_attrs and has_style_attrs.group(1): style += has_style_attrs.group(1) span_style = span_style.replace( - has_style_attrs.group(1), '') - tag.attrs['style'] = style - initial_tag.name = 'span' - initial_tag.attrs['style'] = span_style + has_style_attrs.group(1), "") + tag.attrs["style"] = style + initial_tag.name = "span" + initial_tag.attrs["style"] = 
span_style initial_tag.wrap(tag) def convert_initial_tag(self): @@ -246,10 +246,10 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> disable_validation=True, ) # soup with converted styles from css - inline_soup = BeautifulSoup(html_with_css_styles, features='lxml') + inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={'style': re.compile('.*')}) + attrs={"style": re.compile(".*")}) # go through the tags with inline style + style parsed from css file for tag_inline_style in tags_with_inline_style: diff --git a/src/livecarta_config.py b/src/livecarta_config.py index e3e63d4..31b549e 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -9,12 +9,12 @@ class LiveCartaConfig: HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} - DEFAULT_ALIGN_STYLE = 'left' + DEFAULT_ALIGN_STYLE = "left" - ALIGN_STYLES = ['justify', 'right', 'center', 'left'] + ALIGN_STYLES = ["justify", "right", "center", "left"] # Main constant values - DEFAULT_FONT_NAME = 'Times New Roman' + DEFAULT_FONT_NAME = "Times New Roman" WORD_DEFAULT_FONT_SIZE = 11 @@ -38,65 +38,65 @@ class LiveCartaConfig: } COLORS_MAP = { - '#ffff00': 'yellow', - '#00ff00': 'darkYellow', - '#00ffff': 'cyan', - '#ff00ff': 'magenta', - '#0000ff': 'blue', - '#ff0000': 'red', - '#000080': 'darkBlue', - '#008080': 'darkCyan', - '#008000': 'green', - '#800080': 'darkMagenta', - '#808000': 'darkGreen', - '#c0c0c0': 'lightGray', - '#ffffff': 'white', - '#800000': '#800000', - '#808080': '#808080' + "#ffff00": "yellow", + "#00ff00": "darkYellow", + "#00ffff": "cyan", + "#ff00ff": "magenta", + "#0000ff": "blue", + "#ff0000": "red", + "#000080": "darkBlue", + "#008080": "darkCyan", + "#008000": "green", + "#800080": "darkMagenta", + "#808000": "darkGreen", + "#c0c0c0": "lightGray", + "#ffffff": "white", + "#800000": "#800000", + "#808080": 
"#808080" } HTML42LIVECARTA_COLORS = { - 'yellow': 'yellow', - 'lime': 'green', - 'aqua': 'cyan', - 'fuchsia': 'magenta', - 'blue': 'blue', - 'red': 'red', - 'navy': 'darkBlue', - 'teal': 'darkCyan', - 'green': 'darkGreen', - 'purple': 'darkMagenta', - 'olive': 'darkYellow', - 'silver': 'lightGray', - 'white': 'white', - 'maroon': 'darkRed', # '#800000', - 'gray': 'darkGray', - 'grey': 'darkGray', + "yellow": "yellow", + "lime": "green", + "aqua": "cyan", + "fuchsia": "magenta", + "blue": "blue", + "red": "red", + "navy": "darkBlue", + "teal": "darkCyan", + "green": "darkGreen", + "purple": "darkMagenta", + "olive": "darkYellow", + "silver": "lightGray", + "white": "white", + "maroon": "darkRed", # "#800000", + "gray": "darkGray", + "grey": "darkGray", } - INDENT = '30px' + INDENT = "30px" sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] - sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', - '19px', '20px', '21px', '22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', - '30px', '31px', '32px', '33px', '34px', '35px', '36px', '37px', '38px', '39px', '40px', - '41px', '42px', '43px', '44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px'] + sizes_px = ["0px", "10px", "10px", "11px", "12px", "13px", "14px", "15px", "16px", "17px", "18px", + "19px", "20px", "21px", "22px", "23px", "24px", "25px", "26px", "27px", "28px", "29px", + "30px", "31px", "32px", "33px", "34px", "35px", "36px", "37px", "38px", "39px", "40px", + "41px", "42px", "43px", "44px", "45px", "46px", "47px", "48px", "49px", "50px", "64px", "72px"] - list_types = ['circle', 'disc', 'armenian', 'decimal', - 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', - 'lower-roman', 'upper-alpha', 
'upper-latin', 'upper-roman', 'none'] + list_types = ["circle", "disc", "armenian", "decimal", + "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin", + "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"] structural_tags_names = [ - 'div', 'section', 'article', 'main', 'body', 'html', 'aside', - 'canvas', 'data', 'figure', 'footer', 'iframe', 'span', 'p' + "div", "section", "article", "main", "body", "html", "aside", + "canvas", "data", "figure", "footer", "iframe", "span", "p" ] could_have_style_in_livecarta_regexp = re.compile( - '(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)') + "(^div$)|(^p$)|(^span$)|(^code$)|(^kbd$)|(^var$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)") """ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } @@ -104,23 +104,34 @@ class LiveCartaConfig:

    Date: Tue, 21 Jun 2022 16:16:42 +0300 Subject: [PATCH 02/55] Fix style problem with span --- src/epub_converter/tag_css_style_converter.py | 14 +++++++------- src/livecarta_config.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_css_style_converter.py index 269d8ed..eb6bb73 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_css_style_converter.py @@ -16,7 +16,7 @@ class TagStyleConverter: def __init__(self, tag_inline_style): # tag with inline style + style parsed from css file self.tag_inline_style = tag_inline_style - self.style = self.process_inline_style() + self.tag_inline_style.attrs['style'] = self.process_inline_style() @staticmethod def remove_white_if_no_bgcolor(style_, tag): @@ -76,7 +76,7 @@ class TagStyleConverter: processed style with counted indent """ - processed_style = ";".join(split_style) + processed_style = ";".join(split_style)+';' margin_left_regexp = re.compile( r"((margin-left|margin): *(-*\w+);*)") @@ -142,7 +142,7 @@ class TagStyleConverter: # 3. Duplicate styles check - if the tag had duplicate styles split_inline_style = self.duplicate_styles_check(split_inline_style) - # 4. Processing indents# + # 4. 
Processing indents inline_style: str = self.indents_processing(split_inline_style) return inline_style @@ -170,7 +170,7 @@ class TagStyleConverter: def change_attrs_with_corresponding_tags(self): # adds , , instead of styles - styles_to_remove = self.check_style_to_be_tag(self.style) + styles_to_remove = self.check_style_to_be_tag(self.tag_inline_style.attrs['style']) for i, (attr, value) in enumerate(styles_to_remove): self.tag_inline_style.attrs["style"] = self.tag_inline_style.attrs["style"]\ .replace(f"{attr}:{value};", "").strip() @@ -184,10 +184,10 @@ class TagStyleConverter: @staticmethod def wrap_span_in_tag_to_save_style_attrs(initial_tag): """Function designed to save style attrs that cannot be in tag.name -> span""" - dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG)) + dictkeys_pattern = re.compile("|".join(LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG)) if re.findall(dictkeys_pattern, initial_tag.name) and initial_tag.attrs.get("style"): styles_can_be_in_tag = [style - for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CANT_BE_IN_TAG.items() + for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items() if re.match(tag, initial_tag.name) for style in styles] styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS @@ -201,7 +201,7 @@ class TagStyleConverter: # if we find styles that cannot be in -> wrap them in span tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") style = "" - possible_attrs_regexp = [re.compile(fr"({style}: *(\w+);)") for style in styles_can_be_in_tag] + possible_attrs_regexp = [re.compile(fr"{style}: *\w+;") for style in styles_can_be_in_tag] for possible_attr_regexp in possible_attrs_regexp: has_style_attrs = re.search( possible_attr_regexp, span_style) diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 31b549e..2cfafb6 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -117,7 +117,7 @@ class LiveCartaConfig: 
("vertical-align", "super"): "sup" } - LIVECARTA_STYLES_CANT_BE_IN_TAG = { + LIVECARTA_STYLES_CAN_BE_IN_TAG = { "p": ["text-align", "text-indent", "border-bottom", "border-top"], "li": ["text-align", "list-style-type"], "ul": ["list-style-type"], From 5a237c3974624d740423ab9b731861029c496518 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 21 Jun 2022 16:17:05 +0300 Subject: [PATCH 03/55] Fix headings cleaning problem --- src/epub_converter/html_epub_preprocessor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index efdba02..6944caf 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -186,12 +186,13 @@ def _remove_headings_content(content_tag, title_of_chapter: str): for tag in content_tag.contents: text = tag if isinstance(tag, NavigableString) else tag.text if text: - text = re.sub(r"^[\s\xa0]+|[\s\xa0]+$", " ", text).lower() + text = re.sub(r"[\s\xa0]", " ", text).lower() + text = text.strip() # delete extra spaces if title_of_chapter == text or \ (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)): _add_span_to_save_ids_for_links(tag, content_tag) tag.extract() - break + break # todo remove From 32f5a5eb15ccd693a2a523c71fccab7809ef8159 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 21 Jun 2022 16:51:23 +0300 Subject: [PATCH 04/55] Comments fix --- src/epub_converter/html_epub_preprocessor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 6944caf..ed90767 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -62,7 +62,6 @@ def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup: """Function to replace all tags to correspond livecarta tags""" for reg_key, to_replace_value in 
LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): for key in reg_key: - # text = tag if isinstance(tag, NavigableString) else tag.text tags = chapter_tag.find_all(re.compile(key)) for tag in tags: tag.name = to_replace_value @@ -83,6 +82,12 @@ def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup: while mark.parent != chapter_tag: mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases + # 1. remove comments + _remove_comments(chapter_tag) + + # 2. wrap NavigableString with tag

    + _wrap_strings_with_p(chapter_tag) + _tags_to_correspond_livecarta_tag(chapter_tag) _unwrap_tags(chapter_tag) @@ -294,7 +299,6 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup): pre.append(code) -# todo replace def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): """Function wraps with

    """ table = chapter_tag.new_tag("table") @@ -386,11 +390,7 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro prepared content """ - # 1. remove comments - _remove_comments(content_tag) - # 2. wrap NavigableString with tag

    - _wrap_strings_with_p(content_tag) # 3. heading removal if remove_title_from_chapter: From 131fa2642e28974b47d11803a5ef792ffeff0c3e Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 21 Jun 2022 17:07:26 +0300 Subject: [PATCH 05/55] Fix % processing in styles --- src/epub_converter/css_preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index 11e4a16..96959d6 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -47,8 +47,8 @@ def convert_tag_style_values(size_value: str) -> str: has_style_attrs = re.search(font_size_regexp, size_value) if has_style_attrs: if has_style_attrs.group(1): - size_value = float(size_value.replace("%", "")) / 100.0 - return find_closest_size(size_value) + size_value = float(size_value.replace("%", ""))*6 + return str(size_value)+'px' elif has_style_attrs.group(3): size_value = float(size_value.replace("em", "")) return find_closest_size(size_value) From 4b74faef053780ceb26a36f5e244bc0602c74dd8 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 21 Jun 2022 17:10:32 +0300 Subject: [PATCH 06/55] Not found group problem fixed --- src/epub_converter/tag_css_style_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_css_style_converter.py index eb6bb73..1032d49 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_css_style_converter.py @@ -201,7 +201,7 @@ class TagStyleConverter: # if we find styles that cannot be in -> wrap them in span tag = BeautifulSoup(features="lxml").new_tag(f"{initial_tag.name}") style = "" - possible_attrs_regexp = [re.compile(fr"{style}: *\w+;") for style in styles_can_be_in_tag] + possible_attrs_regexp = [re.compile(fr"({style}: *\w+;)") for style in styles_can_be_in_tag] for possible_attr_regexp in possible_attrs_regexp: 
has_style_attrs = re.search( possible_attr_regexp, span_style) From bc4055bdaf1907f0cc3303ee386524e784119de7 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 21 Jun 2022 18:14:26 +0300 Subject: [PATCH 07/55] Remove processing css in lowercase --- src/epub_converter/css_preprocessing.py | 2 +- src/util/color_reader.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index 96959d6..29021ec 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -215,7 +215,7 @@ def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRul def build_css_file_content(css_content: str) -> str: """Build css content with livecarta convention""" - sheet = cssutils.parseString(css_content.lower(), validate=False) + sheet = cssutils.parseString(css_content, validate=False) for css_rule in sheet: if css_rule.type == css_rule.STYLE_RULE: diff --git a/src/util/color_reader.py b/src/util/color_reader.py index fe44758..82fb451 100644 --- a/src/util/color_reader.py +++ b/src/util/color_reader.py @@ -96,13 +96,13 @@ def str2hex(s: str): if '#' in s and (len(s) <= 7): return s.lower() - if ('rgb' in s) and ('%' in s): + if ('rgb' in s.lower()) and ('%' in s): match = re.search(r'rgba*\(((\d+)%, *(\d+)%, *(\d+)%(, \d\.\d+)*)\)', s) if match: r, g, b = int(match.group(2)), int(match.group(3)), int(match.group(4)) return rgb_percent_to_hex((r, g, b)) - if 'rgb' in s: + if 'rgb' in s.lower(): rgba = re.findall('([0-9] *\.?[0-9]+)', s) r, g, b = int(rgba[0]), int(rgba[1]), int(rgba[2]) if len(rgba) == 4: @@ -110,7 +110,7 @@ def str2hex(s: str): r, g, b = rgba2rgb(r, g, b, alpha) return rgb_to_hex((r, g, b)) - if 'hsl' in s: + if 'hsl' in s.lower(): # hsl(hue in {0,360}, saturation [0, 100%], lightness [0, 100%]) match = re.search(r'hsla*\(((\d+), *(\d+)%, *(\d+)%, (\d\.\d+)*)\)', s) if match: From de1246d89096a83dc2a2ed4413abb8ca1536689d 
Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 18:18:58 +0300 Subject: [PATCH 08/55] Improve remove headings content --- src/epub_converter/html_epub_preprocessor.py | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index ed90767..c3ce356 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -190,14 +190,32 @@ def _remove_headings_content(content_tag, title_of_chapter: str): title_of_chapter = title_of_chapter.lower() for tag in content_tag.contents: text = tag if isinstance(tag, NavigableString) else tag.text - if text: + if re.sub(r'([\s\xa0])', '', text): text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces if title_of_chapter == text or \ (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)): _add_span_to_save_ids_for_links(tag, content_tag) tag.extract() - break + elif not isinstance(tag, NavigableString): + _remove_headings_content(tag, title_of_chapter) + break + + +def _tags_to_correspond_livecarta_tag(chapter_tag): + """Function to replace all tags to correspond livecarta tags""" + for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): + for key in reg_key: + tags = chapter_tag.find_all(re.compile(key)) + for tag in tags: + tag.name = to_replace_value + +def _unwrap_tags(chapter_tag): + """Function unwrap tags and move id to span""" + for tag in LiveCartaConfig. 
TAGS_TO_UNWRAP: + for s in chapter_tag.find_all(tag): + _add_span_to_save_ids_for_links(s, chapter_tag) + s.unwrap() # todo remove From 0f53caaffaff2c02efbe2a948ff1ff8f3ab23b69 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 18:20:21 +0300 Subject: [PATCH 09/55] Replace functions working to the 1 html processing --- src/epub_converter/html_epub_preprocessor.py | 72 +++----------------- 1 file changed, 9 insertions(+), 63 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index c3ce356..e46e46d 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -34,69 +34,6 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou class_=tag_to_be_removed.attrs.get("class")) -def process_structural_tags(chapter_tag: BeautifulSoup) -> BeautifulSoup: - """ - Main function that works with structure of html. Make changes inplace. - Parameters - ---------- - chapter_tag: Tag, soup object - - Steps - ---------- - 1. Extracts tags that are not needed - 2. Checks that marks for pointing a start of a chapter are placed on one level in html tree. - Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed. - This tag must have a chapter_tag as a parent. - Otherwise, it is wrapped with some tags. Like: -

    - 3. Headings that are not supported by livecarta converts to

    - 4. Wrapping NavigableString - - Returns - ------- - chapter_tag: Tag, BeautifulSoup - adjusted chapter_tag - - """ - def _tags_to_correspond_livecarta_tag(chapter_tag): - """Function to replace all tags to correspond livecarta tags""" - for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_key: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - tag.name = to_replace_value - - def _unwrap_tags(chapter_tag): - """Function unwrap tags and move id to span""" - for tag in LiveCartaConfig. TAGS_TO_UNWRAP: - for s in chapter_tag.find_all(tag): - _add_span_to_save_ids_for_links(s, chapter_tag) - s.unwrap() - - def _mark_parent_is_body(chapter_tag): - # check marks for chapter starting are on the same level - 1st - marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) - - # fix marks to be on 1 level - for mark in marks: - while mark.parent != chapter_tag: - mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases - - # 1. remove comments - _remove_comments(chapter_tag) - - # 2. wrap NavigableString with tag

    - _wrap_strings_with_p(chapter_tag) - - _tags_to_correspond_livecarta_tag(chapter_tag) - - _unwrap_tags(chapter_tag) - - _mark_parent_is_body(chapter_tag) - - return chapter_tag - - def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: """ After processing on a first_id that corresponds to current chapter, @@ -156,6 +93,7 @@ def _remove_comments(chapter_tag): def _wrap_strings_with_p(chapter_tag): + # Headings that are not supported by livecarta converts to

    # wrap NavigableString with

    for node in chapter_tag: if isinstance(node, NavigableString): @@ -408,7 +346,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro prepared content """ + # 1. remove comments + _remove_comments(content_tag) + # 2. wrap NavigableString with tag

    + _wrap_strings_with_p(content_tag) + + _tags_to_correspond_livecarta_tag(content_tag) + + _unwrap_tags(content_tag) # 3. heading removal if remove_title_from_chapter: From 43e16ed8d22aa32e2688c15616184be9fc1aefc9 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 18:20:57 +0300 Subject: [PATCH 10/55] Convert div to table (width, border, bgcolor in attr) --- src/epub_converter/html_epub_preprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index e46e46d..87c98c9 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -276,7 +276,7 @@ def _preprocess_div_tags(chapter_tag): Function replace

    with
    : """ for div in chapter_tag.find_all("div"): - if div.attrs.get('style'): + if any(attr in ['width', 'border', 'bgcolor'] for attr in div.attrs): _wrap_tag_with_table( chapter_tag, tag_to_be_wrapped=div, From 94b5cb569c5794214f94f67fb149f3712df9d32b Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 18:22:20 +0300 Subject: [PATCH 11/55] Add control on the same level of chapter marks --- src/epub_converter/epub_converter.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 57f2904..ae900ea 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -104,8 +104,8 @@ class EpubConverter: self.add_not_added_files_to_adjacency_list(not_added) self.logger.log(f"Html internal links and structure processing.") self.label_chapters_ids_with_lc_id() + self.chapter_marks_are_same_level() # used only after parsed toc, ids from toc needed - self.process_html_soup_structure_to_line() self.process_internal_links() self.logger.log(f"Define chapters content.") self.define_chapters_content() @@ -316,11 +316,24 @@ class EpubConverter: new_h.attrs["id"] = i tag.insert_before(new_h) - def process_html_soup_structure_to_line(self): - # go to line structure + def chapter_marks_are_same_level(self): + """ + Function checks that marks for pointing a start of a chapter are placed on one level in html tree. + Mark is tag with "class": "converter-chapter-mark". Added while TOC was parsed. + This tag must have a chapter_tag as a parent. + Otherwise, it is wrapped with some tags. Like: +

    + + """ for html_href in self.html_href2html_body_soup: - soup = self.html_href2html_body_soup[html_href] - self.html_href2html_body_soup[html_href] = process_structural_tags(soup) + chapter_tag = self.html_href2html_body_soup[html_href] + # check marks for chapter starting are on the same level - 1st + marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) + + # fix marks to be on 1 level + for mark in marks: + while mark.parent != chapter_tag: + mark.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases @staticmethod def create_unique_id(href, id_): From 2e0d812783822b90c1d4f2711c52a4fa14efd51f Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 19:04:24 +0300 Subject: [PATCH 12/55] Footer -> Span --- src/livecarta_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 2cfafb6..0f6b0a1 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -128,7 +128,7 @@ class LiveCartaConfig: REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = { (r"^h[6-9]$", "figure$", "section$"): "p", ("^aside$",): "blockquote", - ("^header$",): "span", + ("^header$", "^footer$"): "span", ("^b$",): "strong", } From 3899e7f848204f13b394a1401b5b4c001d247d66 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 22 Jun 2022 19:05:13 +0300 Subject: [PATCH 13/55] Remove unuseful function class --- src/epub_converter/epub_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index ae900ea..ca7b69f 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -21,7 +21,7 @@ from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.footnotes_processing import preprocess_footnotes from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content from 
src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import process_structural_tags, get_tags_between_chapter_marks,\ +from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\ prepare_title, prepare_content From 9422182b03d4ed1b382784ece5ee45306ef06f30 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 23 Jun 2022 12:59:37 +0300 Subject: [PATCH 14/55] Fix error with Navigable String doesn't have name --- src/epub_converter/html_epub_preprocessor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 87c98c9..793247c 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -132,7 +132,8 @@ def _remove_headings_content(content_tag, title_of_chapter: str): text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces if title_of_chapter == text or \ - (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name)): + (title_of_chapter in text and + re.findall(r"^h[1-3]$", tag.name or content_tag.name)): _add_span_to_save_ids_for_links(tag, content_tag) tag.extract() elif not isinstance(tag, NavigableString): From 1eb59a66ac4e2e7636d18ab9e8172f46510186ae Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 23 Jun 2022 13:02:23 +0300 Subject: [PATCH 15/55] Add processing of indents in %,em,pt --- src/epub_converter/css_preprocessing.py | 32 +++++++++++-------------- src/livecarta_config.py | 10 -------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index 29021ec..2e65880 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -21,7 +21,7 @@ def get_bg_color(x): return color -def convert_tag_style_values(size_value: str) -> str: +def 
convert_tag_style_values(size_value: str, is_indent: bool = False) -> str: """ Function - converts values of tags from em/%/pt to px @@ -33,27 +33,23 @@ def convert_tag_style_values(size_value: str) -> str: Returns ------- size_value: str - + converted value size """ - def find_closest_size(style_value): - possible_sizes = list( - takewhile(lambda x: style_value >= x, LiveCartaConfig.sizes_pr)) - last_possible_size_index = LiveCartaConfig.sizes_pr.index( - possible_sizes[-1]) - return LiveCartaConfig.sizes_px[last_possible_size_index] - - font_size_regexp = re.compile( + size_regexp = re.compile( r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") - has_style_attrs = re.search(font_size_regexp, size_value) + has_style_attrs = re.search(size_regexp, size_value) if has_style_attrs: if has_style_attrs.group(1): - size_value = float(size_value.replace("%", ""))*6 + multiplier = 5.76 if is_indent else 0.16 + size_value = float(size_value.replace("%", "")) * multiplier return str(size_value)+'px' elif has_style_attrs.group(3): - size_value = float(size_value.replace("em", "")) - return find_closest_size(size_value) + multiplier = 18 if is_indent else 16 + size_value = float(size_value.replace("em", "")) * multiplier + return str(size_value)+'px' elif has_style_attrs.group(5): - return size_value.replace("pt", "px") + size_value = float(size_value.replace("pt", "")) * 4/3 + return str(size_value)+'px' else: return "" return size_value @@ -73,10 +69,10 @@ def convert_indents_tag_values(size_value: str) -> str: """ if len(size_value.split(" ")) == 3: size_value = convert_tag_style_values(size_value.split( - " ")[-2]) # returns middle value + " ")[-2], True) # returns middle value else: size_value = convert_tag_style_values(size_value.split( - " ")[-1]) # returns last value + " ")[-1], True) # returns last value return size_value @@ -146,7 +142,7 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { "list-style-image": lambda x: "disc", "margin-left": convert_indents_tag_values, 
"margin-top": convert_tag_style_values, - "margin": convert_indents_tag_values + "margin": convert_indents_tag_values, } diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 0f6b0a1..9fc8e2e 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -76,16 +76,6 @@ class LiveCartaConfig: INDENT = "30px" - sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, - 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69, - 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, - 2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] - - sizes_px = ["0px", "10px", "10px", "11px", "12px", "13px", "14px", "15px", "16px", "17px", "18px", - "19px", "20px", "21px", "22px", "23px", "24px", "25px", "26px", "27px", "28px", "29px", - "30px", "31px", "32px", "33px", "34px", "35px", "36px", "37px", "38px", "39px", "40px", - "41px", "42px", "43px", "44px", "45px", "46px", "47px", "48px", "49px", "50px", "64px", "72px"] - list_types = ["circle", "disc", "armenian", "decimal", "decimal-leading-zero", "georgian", "lower-alpha", "lower-latin", "lower-roman", "upper-alpha", "upper-latin", "upper-roman", "none"] From 536c1d23e438047d7dd69b6ec132a78e7543ec1f Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 23 Jun 2022 17:43:51 +0300 Subject: [PATCH 16/55] =?UTF-8?q?=D0=A1ut=20preparing=20title=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/epub_converter/html_epub_preprocessor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 793247c..18f8902 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -79,10 +79,8 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu def prepare_title(title_of_chapter: str) -> str: """Function finalise 
processing/cleaning title""" title_str = BeautifulSoup(title_of_chapter, features="lxml").string - title_str = re.sub(r"([\n\t\xa0])", " ", title_str) - title_str = re.sub(r" +", " ", title_str).rstrip() - # clean whitespace characters ([\r\n\t\f\v ]) - title_str = re.sub(r"(^\s+)|(\s+$)", "", title_str) + # clean extra whitespace characters ([\r\n\t\f\v ]) + title_str = re.sub(r"[\s\xa0]", " ", title_str).strip() return title_str From 7e380ef431d46bbbe8acdbb9948493023f36d780 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 23 Jun 2022 17:44:48 +0300 Subject: [PATCH 17/55] Optimize heading cleaning --- src/epub_converter/html_epub_preprocessor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 18f8902..e2fe136 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -126,7 +126,7 @@ def _remove_headings_content(content_tag, title_of_chapter: str): title_of_chapter = title_of_chapter.lower() for tag in content_tag.contents: text = tag if isinstance(tag, NavigableString) else tag.text - if re.sub(r'([\s\xa0])', '', text): + if re.sub(r"[\s\xa0]", "", text): text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces if title_of_chapter == text or \ @@ -134,9 +134,10 @@ def _remove_headings_content(content_tag, title_of_chapter: str): re.findall(r"^h[1-3]$", tag.name or content_tag.name)): _add_span_to_save_ids_for_links(tag, content_tag) tag.extract() + return elif not isinstance(tag, NavigableString): - _remove_headings_content(tag, title_of_chapter) - break + if not _remove_headings_content(tag, title_of_chapter): + break def _tags_to_correspond_livecarta_tag(chapter_tag): @@ -275,13 +276,13 @@ def _preprocess_div_tags(chapter_tag): Function replace
    with
    : """ for div in chapter_tag.find_all("div"): - if any(attr in ['width', 'border', 'bgcolor'] for attr in div.attrs): + if any(attr in ["width", "border", "bgcolor"] for attr in div.attrs): _wrap_tag_with_table( chapter_tag, tag_to_be_wrapped=div, - width=div.attrs['width'] if div.attrs.get('width') else '100', - border=div.attrs['border'] if div.attrs.get('border') else None, - bg_color=div.attrs['bgcolor'] if div.attrs.get('bgcolor') else None) + width=div.attrs["width"] if div.attrs.get("width") else "100", + border=div.attrs["border"] if div.attrs.get("border") else None, + bg_color=div.attrs["bgcolor"] if div.attrs.get("bgcolor") else None) else: div.name = "p" continue From ffa6e90ad55d91a541d2c012422da24ec597867e Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 23 Jun 2022 18:27:23 +0300 Subject: [PATCH 18/55] Take class removing to a function --- src/epub_converter/html_epub_preprocessor.py | 27 ++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index e2fe136..627d1f1 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -96,13 +96,11 @@ def _wrap_strings_with_p(chapter_tag): for node in chapter_tag: if isinstance(node, NavigableString): content = str(node) - content = re.sub(r"([\n\t\xa0])", " ", content) - # remove spaces at the beginning and at the end of the string: - content = content.strip() + content = re.sub(r"([\s\xa0])", " ", content).strip() if content: - tag = chapter_tag.new_tag("p") - tag.append(str(node)) - node.replace_with(tag) + p_tag = chapter_tag.new_tag("p") + p_tag.append(str(node)) + node.replace_with(p_tag) def _remove_headings_content(content_tag, title_of_chapter: str): @@ -146,6 +144,7 @@ def _tags_to_correspond_livecarta_tag(chapter_tag): for key in reg_key: tags = chapter_tag.find_all(re.compile(key)) for tag in tags: + # todo can cause 
appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) tag.name = to_replace_value def _unwrap_tags(chapter_tag): @@ -300,8 +299,6 @@ def _clean_wiley_block(block): h.insert_before(BeautifulSoup(features="lxml").new_tag("br")) - - def _preprocess_block_tags(chapter_tag: Tag): """Function preprocessing tags""" for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): @@ -323,6 +320,13 @@ def _preprocess_block_tags(chapter_tag: Tag): _wrap_tag_with_table(chapter_tag, future_block, bg_color=color) +def _class_removing(chapter_tag): + for tag in chapter_tag.find_all(recursive=True): + if tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] + + def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: """ Function finalise processing/cleaning content @@ -368,9 +372,6 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro _preprocess_div_tags(content_tag) _preprocess_block_tags(content_tag) - # 5. remove classes that were created by converter - for tag in content_tag.find_all(recursive=True): - if hasattr(tag, "attrs") and tag.attrs.get("class") \ - and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): - del tag.attrs["class"] + # 5. 
remove classes that weren't created by converter + _class_removing(content_tag) return str(content_tag) From 66e03c98e39d8e7ac75a64eaa7a3467f51de830e Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 23 Jun 2022 18:48:50 +0300 Subject: [PATCH 19/55] Add style to tags without --- src/epub_converter/html_epub_preprocessor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 627d1f1..9f776c3 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -209,7 +209,7 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup): """ Function - transform , , tags into span - - add code style to this tags + - add code style to this tags (if there is no) Parameters ---------- chapter_tag: Tag, soup object @@ -222,10 +222,10 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup): for code in chapter_tag.find_all(re.compile("code|kbd|var")): if not code.parent.name == "pre": code.name = "span" + if not code.attrs.get("style"): + code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;" continue - # if tag isn"t in pre and doesn"t have style - if not code.attrs.get("style"): - code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;" + def _preprocess_pre_tags(chapter_tag: BeautifulSoup): From 9f067bb93d0b2b4c3217a465954d4c360da2f4f8 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 24 Jun 2022 17:10:43 +0300 Subject: [PATCH 20/55] Make structure of docx c as epub s --- src/docx_converter/footnotes_processing.py | 73 +++++++++ src/docx_converter/html_docx_preprocessor.py | 148 +++---------------- src/docx_converter/image_processing.py | 39 +++++ 3 files changed, 134 insertions(+), 126 deletions(-) create mode 100644 src/docx_converter/footnotes_processing.py create mode 100644 src/docx_converter/image_processing.py diff --git 
a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py new file mode 100644 index 0000000..84861d7 --- /dev/null +++ b/src/docx_converter/footnotes_processing.py @@ -0,0 +1,73 @@ +import re +from bs4 import BeautifulSoup, NavigableString, Tag + +@staticmethod +def _clean_footnote_content(content): + content = content.strip() + return content.strip() + + +def process_footnotes(body_tag): + """Function returns list of footnotes and delete them from html_soup.""" + footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc') + footnote_content = body_tag.find_all( + 'div', id=re.compile(r'^sdfootnote\d+$')) + footnote_amt = len(footnote_anchors) + + assert footnote_amt == len(footnote_content), \ + 'Something went wrong with footnotes after libre conversion' + + footnotes = [] + + for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): + true_a_tag = cont_tag.find_all( + 'a', class_=re.compile(r'^sdfootnote.+$'))[0] + + if true_a_tag.attrs.get('href') is None: + cont_tag.a.decompose() + continue + + assert anc_tag['name'] == true_a_tag['href'][1:], \ + 'Something went wrong with footnotes after libre conversion' + + new_tag = BeautifulSoup(features='lxml').new_tag('sup') + new_tag['class'] = 'footnote-element' + new_tag['data-id'] = i + 1 + new_tag['id'] = f'footnote-{i + 1}' + new_tag.string = '*' + anc_tag.replace_with(new_tag) + + # extra digits in footnotes from documents downloaded from livecarta + a_text = true_a_tag.text + if len(cont_tag.find_all('p')): + sup = cont_tag.find_all('p')[0].find('sup') + if sup and sup.text == a_text: + sup.decompose() + + for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): + tag_a.decompose() + + # remove font-size + for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): + style = span.get('style') + style = re.sub(r"font-size: \d+px", "", style) + if style == '': + del span.attrs['style'] + else: + span.attrs['style'] = style + + 
unicode_string = '' + for child in cont_tag.children: + if type(child) is NavigableString: + continue + if child.name == 'blockquote': + unicode_string += str(child) + else: + unicode_string += child.decode_contents() + + content = _clean_footnote_content(unicode_string) + cont_tag.decompose() + + footnotes.append(content) + + return footnotes diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index 80d96a3..425fa10 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -1,14 +1,13 @@ -import os import re import logging -import pathlib from typing import List -from shutil import copyfile from bs4 import BeautifulSoup, NavigableString, Tag from src.livecarta_config import LiveCartaConfig from src.util.helpers import BookLogger, BookStatusWrapper +from src.docx_converter.footnotes_processing import process_footnotes +from src.docx_converter.image_processing import process_images class HTMLDocxPreprocessor: @@ -22,6 +21,7 @@ class HTMLDocxPreprocessor: self.content = list() def _clean_tag(self, tag: str, attr_name: str, attr_value: re): + # todo regex """ Function to clean tags by its name and attribute value. 
Parameters @@ -44,6 +44,7 @@ class HTMLDocxPreprocessor: tag.unwrap() def _clean_underline_links(self): + # todo regex """Function cleans meaningless tags before links.""" underlines = self.body_tag.find_all("u") for u in underlines: @@ -99,12 +100,10 @@ class HTMLDocxPreprocessor: """ fonts = self.body_tag.find_all("font") for font in fonts: - face = font.get("face") - style = font.get("style") - color = font.get("color") + face, style, color =\ + font.get("face"), font.get("style"), font.get("color") - font.attrs = {} - font.name = "span" + font.attrs, font.name = {}, "span" if style: style = self.convert_font_pt_to_px(style) if style != "": @@ -127,14 +126,8 @@ class HTMLDocxPreprocessor: # on this step there should be no more tags assert len(self.body_tag.find_all("font")) == 0 - def delete_content_before_toc(self): - # remove all tag upper the only in content !!! body tag is not updated - toc_tag = self.html_soup.new_tag('TOC') - if toc_tag in self.content: - ind = self.content.index(toc_tag) + 1 - self.content = self.content[ind:] - def clean_trash(self): + # todo make it regex dict """Function to remove all styles and tags we don't need.""" self._clean_tag('span', 'style', re.compile( r'^background: #[\da-fA-F]{6}$')) @@ -308,115 +301,8 @@ class HTMLDocxPreprocessor: tag.string = tag.text.replace('\u200b', '') # zero-width-space tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') - @staticmethod - def _clean_footnote_content(content): - content = content.strip() - return content.strip() - - def _process_footnotes(self): - """Function returns list of footnotes and delete them from html_soup.""" - footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') - footnote_content = self.body_tag.find_all( - 'div', id=re.compile(r'^sdfootnote\d+$')) - footnote_amt = len(footnote_anchors) - - assert footnote_amt == len(footnote_content), \ - 'Something went wrong with footnotes after libre conversion' - - footnotes = [] - - for i, (anc_tag, 
cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): - true_a_tag = cont_tag.find_all( - 'a', class_=re.compile(r'^sdfootnote.+$'))[0] - - if true_a_tag.attrs.get('href') is None: - cont_tag.a.decompose() - continue - - assert anc_tag['name'] == true_a_tag['href'][1:], \ - 'Something went wrong with footnotes after libre conversion' - - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' - anc_tag.replace_with(new_tag) - - # extra digits in footnotes from documents downloaded from livecarta - a_text = true_a_tag.text - if len(cont_tag.find_all('p')): - sup = cont_tag.find_all('p')[0].find('sup') - if sup and sup.text == a_text: - sup.decompose() - - for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): - tag_a.decompose() - - # remove font-size - for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): - style = span.get('style') - style = re.sub(r"font-size: \d+px", "", style) - if style == '': - del span.attrs['style'] - else: - span.attrs['style'] = style - - unicode_string = '' - for child in cont_tag.children: - if type(child) is NavigableString: - continue - if child.name == 'blockquote': - unicode_string += str(child) - else: - unicode_string += child.decode_contents() - - content = self._clean_footnote_content(unicode_string) - cont_tag.decompose() - - footnotes.append(content) - - self.footnotes = footnotes - - def _process_images(self, access, html_path, book_id): - """ - Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. - For now images are moved to one folder. 
- """ - img_tags = self.body_tag.find_all('img') - - if len(img_tags): - if access is None: - folder_path = os.path.dirname( - os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - for img in img_tags: - img_name = img.attrs.get('src') - # quick fix for bad links - if (len(img_name) >= 3) and img_name[:3] == '../': - img_name = img_name[3:] - - img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') - - if access is not None: - link = access.send_image(img_path, doc_id=book_id) - img.attrs['src'] = link - self.logger_object.log( - f'{img_name} successfully uploaded.') - else: - img_size = os.path.getsize(img_path) - self.logger_object.log( - f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG) - new_img_path = new_path / img_name - copyfile(img_path, new_img_path) - img.attrs["src"] = str(new_img_path) - - self.images = img_tags - def _process_footer(self): + # todo regex """ Function to process
    tags. All the tags will be deleted from file. @@ -426,6 +312,7 @@ class HTMLDocxPreprocessor: div.decompose() def _process_div(self): + # todo regex """Function to process
    tags. All the tags will be deleted from file, all content of the tags will stay.""" divs = self.body_tag.find_all("div") @@ -505,6 +392,7 @@ class HTMLDocxPreprocessor: self.apply_func_to_last_child(children[0], func) def _preprocessing_headings(self): + # todo regex """Function to convert all lower level headings to p tags""" pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = self.body_tag.find_all(re.compile(pattern)) @@ -584,6 +472,7 @@ class HTMLDocxPreprocessor: self.top_level_headers[i]['should_be_numbered'] = True def _process_headings(self): + # todo regex """ Function to process tags . Steps @@ -660,6 +549,7 @@ class HTMLDocxPreprocessor: content[0], self.clean_tag_from_numbering) def _process_lists(self): + # todo regex """ Function - process tags
  • . @@ -678,6 +568,13 @@ class HTMLDocxPreprocessor: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() + def delete_content_before_toc(self): + # remove all tag upper the only in content !!! body tag is not updated + toc_tag = self.html_soup.new_tag('TOC') + if toc_tag in self.content: + ind = self.content.index(toc_tag) + 1 + self.content = self.content[ind:] + def process_html(self, access=None, html_path='', book_id='local'): """Process html code to satisfy LiveCarta formatting.""" self.logger_object.log('Beginning of processing .html file.') @@ -705,13 +602,12 @@ class HTMLDocxPreprocessor: self._process_hrefs() self.logger_object.log('Footnotes processing.') - self._process_footnotes() + self.footnotes = process_footnotes(self.body_tag) self.logger_object.log( f'{len(self.footnotes)} footnotes have been processed.') self.logger_object.log('Image processing.') - self._process_images( - access=access, html_path=html_path, book_id=book_id) + self.images = process_images(self.body_tag, access=access, html_path=html_path, book_id=book_id) self.logger_object.log( f'{len(self.images)} images have been processed.') diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py new file mode 100644 index 0000000..923a274 --- /dev/null +++ b/src/docx_converter/image_processing.py @@ -0,0 +1,39 @@ +import os +import logging +import pathlib +from shutil import copyfile + + +def process_images(body_tag, access, html_path, book_id): + """ + Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. + For now images are moved to one folder. 
+ """ + img_tags = body_tag.find_all('img') + + if len(img_tags): + if access is None: + folder_path = os.path.dirname( + os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f'json/img_{book_id}/')) + new_path.mkdir(exist_ok=True) + + for img in img_tags: + img_name = img.attrs.get('src') + # quick fix for bad links + if (len(img_name) >= 3) and img_name[:3] == '../': + img_name = img_name[3:] + + img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') + + if access is not None: + link = access.send_image(img_path, doc_id=book_id) + img.attrs['src'] = link + else: + img_size = os.path.getsize(img_path) + new_img_path = new_path / img_name + copyfile(img_path, new_img_path) + img.attrs["src"] = str(new_img_path) + + return img_tags \ No newline at end of file From d91f6aba4a4034672750530bb12f18eff1c7a842 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 24 Jun 2022 17:12:00 +0300 Subject: [PATCH 21/55] Create preset for wrapping tags with tables --- src/docx_converter/libre_html2json_converter.py | 5 +---- src/epub_converter/footnotes_processing.py | 2 +- src/epub_converter/image_processing.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py index 45522da..0cd92fa 100644 --- a/src/docx_converter/libre_html2json_converter.py +++ b/src/docx_converter/libre_html2json_converter.py @@ -107,10 +107,7 @@ class LibreHTML2JSONConverter: def convert_to_dict(self): """Function which convert list of html nodes to appropriate json structure.""" - json_strc = [] - ind = 0 - ch_num = 0 - ch_amt = 0 + json_strc, ind, ch_num, ch_amt = [], 0, 0, 0 try: while ind < len(self.content): diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index d9840f3..ef2eac0 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -84,4 
+84,4 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note footnote_tag = footnote_tag.find( attrs={"role": "doc-backlink"}) or footnote_tag new_footnotes_tags.append(footnote_tag) - return footnotes, new_noterefs_tags, new_footnotes_tags \ No newline at end of file + return footnotes, new_noterefs_tags, new_footnotes_tags diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index 950bbdd..aefa24d 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -64,4 +64,4 @@ def update_images_src_links(body_tag: BeautifulSoup, del img.attrs["height"] if img.attrs.get("style"): del img.attrs["style"] - return path2aws_path \ No newline at end of file + return path2aws_path From f690412f5ccd4d663b2550e12931fcd545e7e3d3 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 24 Jun 2022 17:12:21 +0300 Subject: [PATCH 22/55] Create preset for wrapping tags with tables --- src/epub_converter/html_epub_preprocessor.py | 161 ++++++++----------- src/livecarta_config.py | 7 +- 2 files changed, 71 insertions(+), 97 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 9f776c3..fbc45a2 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -103,6 +103,61 @@ def _wrap_strings_with_p(chapter_tag): node.replace_with(p_tag) +def _wrap_tags_with_table(chapter_tag): + """Function wraps with
  • """ + def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_be_wrapped.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + + def process_tag_using_table(tag_to_wrap): + _wrap_tag_with_table( + chapter_tag, + tag_to_be_wrapped=tag_to_wrap, + width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", + border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, + bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) + _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) + tag_to_wrap.unwrap() + + for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items(): + if isinstance(attrs, tuple): + attr, val = attrs[0], attrs[1] + for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}): + process_tag_using_table(tag_to_wrap) + else: + for tag_to_wrap in chapter_tag.find_all(tags_to_wrap): + if any(attr_name in attrs for attr_name in tag_to_wrap.attrs): + process_tag_using_table(tag_to_wrap) + + +def _tags_to_correspond_livecarta_tag(chapter_tag): + """Function to replace all tags to correspond livecarta tags""" + for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): + for key in reg_key: + tags = chapter_tag.find_all(re.compile(key)) + for tag in tags: + # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) + tag.name = to_replace_value + + +def _unwrap_tags(chapter_tag): + """Function unwrap tags and move id to span""" + for tag in LiveCartaConfig.TAGS_TO_UNWRAP: + for s in chapter_tag.find_all(tag): + _add_span_to_save_ids_for_links(s, chapter_tag) + s.unwrap() + + def _remove_headings_content(content_tag, title_of_chapter: str): """ Function @@ -138,23 +193,6 @@ def _remove_headings_content(content_tag, title_of_chapter: str): break -def _tags_to_correspond_livecarta_tag(chapter_tag): - """Function to replace all tags to correspond livecarta tags""" - for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_key: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) - tag.name = to_replace_value - -def _unwrap_tags(chapter_tag): - """Function unwrap tags and move id to span""" - for tag in LiveCartaConfig. TAGS_TO_UNWRAP: - for s in chapter_tag.find_all(tag): - _add_span_to_save_ids_for_links(s, chapter_tag) - s.unwrap() - - # todo remove def _process_lists(chapter_tag: BeautifulSoup): """ @@ -181,13 +219,11 @@ def _preprocess_table(chapter_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" tables = chapter_tag.find_all("table") for table in tables: - t_tags = table.find_all(re.compile("td|th|tr")) - for t_tag in t_tags: - style = t_tag.get("style") + for t_tag in table.find_all(re.compile("td|th|tr")): width = "" - if style: + if t_tag.get("style"): width_match = re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) + r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) if width_match: size = width_match.group(1) width = size + "px" @@ -197,9 +233,8 @@ def _preprocess_table(chapter_tag: BeautifulSoup): if t_tag.attrs.get("style"): t_tag.attrs["style"] = t_tag.attrs["style"].replace( "border:0;", "") - - elif t_tag.attrs.get("style") == "": - del t_tag.attrs["style"] + if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": + del t_tag.attrs["style"] if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: table.attrs["border"] = "1" @@ -254,72 +289,6 @@ def _preprocess_pre_tags(chapter_tag: BeautifulSoup): pre.append(code) -def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): - """Function wraps with
    """ - table = chapter_tag.new_tag("table") - table.attrs["border"], table.attrs["align"], table.attrs["style"] \ - = border, "center", f"width:{width}%;" - tbody, tr, td = \ - chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") - td.attrs["bgcolor"] = bg_color - tag_to_be_wrapped.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - return table - - -def _preprocess_div_tags(chapter_tag): - """ - Function replace
    with
    : - """ - for div in chapter_tag.find_all("div"): - if any(attr in ["width", "border", "bgcolor"] for attr in div.attrs): - _wrap_tag_with_table( - chapter_tag, - tag_to_be_wrapped=div, - width=div.attrs["width"] if div.attrs.get("width") else "100", - border=div.attrs["border"] if div.attrs.get("border") else None, - bg_color=div.attrs["bgcolor"] if div.attrs.get("bgcolor") else None) - else: - div.name = "p" - continue - _add_span_to_save_ids_for_links(div, chapter_tag) - div.unwrap() - - -def _clean_wiley_block(block): - hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) - for hr in hrs: - hr.extract() - h = block.find(re.compile("h[1-9]")) - if h: - h.name = "p" - h.insert_before(BeautifulSoup(features="lxml").new_tag("br")) - - -def _preprocess_block_tags(chapter_tag: Tag): - """Function preprocessing tags""" - for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(block) - color = "#DDDDDD" if block.attrs.get( - "class") == "feature1" else None - color = "#EEEEEE" if block.attrs.get( - "class") == "feature2" else color - _wrap_tag_with_table(chapter_tag, block, bg_color=color) - block.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - block.unwrap() - - for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): - _clean_wiley_block(future_block) - color = "#DDDDDD" if future_block.attrs.get( - "class") == "feature1" else None - color = "#EEEEEE" if future_block.attrs.get( - "class") == "feature2" else color - _wrap_tag_with_table(chapter_tag, future_block, bg_color=color) - - def _class_removing(chapter_tag): for tag in chapter_tag.find_all(recursive=True): if tag.attrs.get("class") \ @@ -356,6 +325,8 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro # 2. wrap NavigableString with tag

    _wrap_strings_with_p(content_tag) + _wrap_tags_with_table(content_tag) + _tags_to_correspond_livecarta_tag(content_tag) _unwrap_tags(content_tag) @@ -365,12 +336,10 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro _remove_headings_content(content_tag, title_str) # 4. processing tags (

  • ,
  • , ,
    , 
    , ) - _process_lists(content_tag) + _process_lists(content_tag) # todo regex _preprocess_table(content_tag) - _preprocess_code_tags(content_tag) - _preprocess_pre_tags(content_tag) - _preprocess_div_tags(content_tag) - _preprocess_block_tags(content_tag) + _preprocess_code_tags(content_tag) # todo regex + _preprocess_pre_tags(content_tag) # todo regex # 5. remove classes that weren't created by converter _class_removing(content_tag) diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 9fc8e2e..a81ffca 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -115,8 +115,13 @@ class LiveCartaConfig: r"(^h[1-9]$)": ["list-style-type"] } + WRAP_TAGS_WITH_TABLE = { + ("div",) :["width", "border", "bgcolor"], + ("section", "blockquote",) : ("class", r"feature[1234]"), + } + REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = { - (r"^h[6-9]$", "figure$", "section$"): "p", + (r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p", ("^aside$",): "blockquote", ("^header$", "^footer$"): "span", ("^b$",): "strong", From 01c2c8b120b381349f3fc92097181ada0af5d64c Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 27 Jun 2022 19:12:02 +0300 Subject: [PATCH 23/55] Add ability to replace tags based on parents/children --- src/epub_converter/html_epub_preprocessor.py | 25 +++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index fbc45a2..042b7b0 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -142,12 +142,25 @@ def _wrap_tags_with_table(chapter_tag): def _tags_to_correspond_livecarta_tag(chapter_tag): """Function to replace all tags to correspond livecarta tags""" - for reg_key, to_replace_value in LiveCartaConfig.REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_key: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - # todo can cause 
appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) - tag.name = to_replace_value + for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items(): + for key in reg_keys: + if isinstance(key, tuple): + replace = key[0] + parent, child = key[1], key[2] + for parent_tag in chapter_tag.select(parent): + if replace == "parent": + parent_tag.name = to_replace_value + elif replace == "child": + for child_tag in parent_tag.select(child): + child_tag.name = to_replace_value + if not child_tag.attrs.get("style"): + child_tag.attrs["style"] =\ + "font-size: 14px; font-family: courier new,courier,monospace;" + else: + tags = chapter_tag.find_all(re.compile(key)) + for tag in tags: + # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) + tag.name = to_replace_value def _unwrap_tags(chapter_tag): From 8e4d4de5bc7e3639ebf93e175d04fbb79f364062 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 27 Jun 2022 19:12:39 +0300 Subject: [PATCH 24/55] Add ability to unwrap tags that are in certain tags --- src/epub_converter/html_epub_preprocessor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 042b7b0..60cc91e 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -165,10 +165,14 @@ def _tags_to_correspond_livecarta_tag(chapter_tag): def _unwrap_tags(chapter_tag): """Function unwrap tags and move id to span""" - for tag in LiveCartaConfig.TAGS_TO_UNWRAP: - for s in chapter_tag.find_all(tag): - _add_span_to_save_ids_for_links(s, chapter_tag) - s.unwrap() + for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP: + for tag in chapter_tag.select(tag_name): + # if tag is a subtag + if ">" in tag_name: + parent = tag.parent + tag.parent.attrs.update(tag.attrs) + _add_span_to_save_ids_for_links(tag, chapter_tag) + tag.unwrap() def _remove_headings_content(content_tag, title_of_chapter: str): From 9b4ecfd63c6c361f3d21f7d3eca41190bbb136c1 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 27 Jun 2022 19:13:39 +0300 Subject: [PATCH 25/55] Add function - insert certain tags in parent tags --- src/epub_converter/html_epub_preprocessor.py | 87 ++++---------------- 1 file changed, 15 insertions(+), 72 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 60cc91e..450d776 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -210,28 +210,6 @@ def _remove_headings_content(content_tag, title_of_chapter: str): break -# todo remove -def _process_lists(chapter_tag: BeautifulSoup): - """ - Function - - process tags
  • . - - unwrap

    tags. - Parameters - ---------- - chapter_tag: Tag, soup object - - Returns - ------- - None - - """ - li_tags = chapter_tag.find_all("li") - for li_tag in li_tags: - if li_tag.p: - li_tag.attrs.update(li_tag.p.attrs) - li_tag.p.unwrap() - - def _preprocess_table(chapter_tag: BeautifulSoup): """Function to preprocess tables and tags(td|th|tr): style""" tables = chapter_tag.find_all("table") @@ -257,53 +235,20 @@ def _preprocess_table(chapter_tag: BeautifulSoup): table.attrs["border"] = "1" -def _preprocess_code_tags(chapter_tag: BeautifulSoup): - """ - Function - - transform , , tags into span - - add code style to this tags (if there is no) - Parameters - ---------- - chapter_tag: Tag, soup object - - Returns - ------- - None - - """ - for code in chapter_tag.find_all(re.compile("code|kbd|var")): - if not code.parent.name == "pre": - code.name = "span" - if not code.attrs.get("style"): - code.attrs["style"] = "font-size: 14px; font-family: courier new,courier,monospace;" - continue - - - -def _preprocess_pre_tags(chapter_tag: BeautifulSoup): - """ - Function preprocessing

     tags
    -    Wrap string of the tag with  if its necessary
    -    Parameters
    -    ----------
    -    chapter_tag: Tag, soup object
    -
    -    Returns
    -    ----------
    -    None
    -        Modified chapter tag
    -
    -    """
    -    for pre in chapter_tag.find_all("pre"):
    -        if pre.find_all("code|kbd|var"):
    -            continue
    -        else:
    -            code = chapter_tag.new_tag("code")
    -            # insert all items that was in pre to code and remove from pre
    -            for content in reversed(pre.contents):
    -                code.insert(0, content.extract())
    -            # wrap code with items
    -            pre.append(code)
    +def _insert_tags_in_parents(chapter_tag):
    +    parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()}
    +    for parent_tag_name, condition in parent_tag2condition.items():
    +        for parent_tag in chapter_tag.select(parent_tag_name):
    +            if parent_tag.select(condition):
    +                continue
    +            else:
    +                tag_to_insert = chapter_tag.new_tag(
    +                    LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)])
    +                # insert all items that was in pre to code and remove from pre
    +                for content in reversed(parent_tag.contents):
    +                    tag_to_insert.insert(0, content.extract())
    +                # wrap code with items
    +                parent_tag.append(tag_to_insert)
     
     
     def _class_removing(chapter_tag):
    @@ -353,10 +298,8 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
             _remove_headings_content(content_tag, title_str)
     
         # 4. processing tags (
  • ,
  • , ,
    , 
    , ) - _process_lists(content_tag) # todo regex _preprocess_table(content_tag) - _preprocess_code_tags(content_tag) # todo regex - _preprocess_pre_tags(content_tag) # todo regex + _insert_tags_in_parents(content_tag) # 5. remove classes that weren't created by converter _class_removing(content_tag) From eab4f0130aa06ffa452a51ecff2f7e7b4768729f Mon Sep 17 00:00:00 2001 From: Kiryl Date: Mon, 27 Jun 2022 19:16:17 +0300 Subject: [PATCH 26/55] Update livecarta_config.py with processing changes --- src/epub_converter/html_epub_preprocessor.py | 2 +- src/livecarta_config.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 450d776..f9c2c06 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -198,7 +198,7 @@ def _remove_headings_content(content_tag, title_of_chapter: str): text = tag if isinstance(tag, NavigableString) else tag.text if re.sub(r"[\s\xa0]", "", text): text = re.sub(r"[\s\xa0]", " ", text).lower() - text = text.strip() # delete extra spaces + text = text.strip() # delete extra spaces if title_of_chapter == text or \ (title_of_chapter in text and re.findall(r"^h[1-3]$", tag.name or content_tag.name)): diff --git a/src/livecarta_config.py b/src/livecarta_config.py index a81ffca..9929cda 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -120,13 +120,20 @@ class LiveCartaConfig: ("section", "blockquote",) : ("class", r"feature[1234]"), } - REPLACE_REGEX_WITH_LIVECARTA_CORRESPOND_TAGS = { + """('what to replace', 'parent tag', 'child tag')""" + REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS = { (r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p", ("^aside$",): "blockquote", - ("^header$", "^footer$"): "span", + ("^header$", "^footer$", ("child", ":not(pre)", "code, kbd, var")): "span", ("^b$",): "strong", + # (("parent", ":not(pre)", "code")): "p", } + """ 
> == in (p in li)""" TAGS_TO_UNWRAP = [ - "section", "article", "figcaption", "main", "body", "html", + "section", "article", "figcaption", "main", "body", "html", "li > p", ] + + INSERT_TAG_IN_PARENT_TAG = { + ("pre", "code, kbd, var"): "code", + } From f01f6ad778373ad1af73dd464ec2bc7891273df6 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 28 Jun 2022 16:38:21 +0300 Subject: [PATCH 27/55] Remove font=family processing --- src/docx_converter/html_docx_preprocessor.py | 7 ---- src/epub_converter/css_preprocessing.py | 39 +++++++++++--------- src/livecarta_config.py | 18 +-------- 3 files changed, 23 insertions(+), 41 deletions(-) diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index 425fa10..c264d17 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -113,13 +113,6 @@ class HTMLDocxPreprocessor: elif color and color in LiveCartaConfig.COLORS_MAP: font.attrs["style"] = f'color: {color};' - if face is not None: - face = re.sub(r",[\w,\- ]*$", "", face) - if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(face): - font.attrs["face"] = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE[face] - else: - font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME - if len(font.attrs) == 0: font.unwrap() diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index 2e65880..0ad0ff7 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -78,7 +78,7 @@ def convert_indents_tag_values(size_value: str) -> str: """ Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } -Style properties that can be used to fit livecarta css style convention. +Style properties that can be used to fit LiveCarta css style convention. If property has empty list, it means that any value can be converted. 
If property has not empty list, it means that only certain property-value combinations can be transformed. """ @@ -88,8 +88,7 @@ LIVECARTA_STYLE_ATTRS = { "text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], "align": [], "font": [], - "font-family": [x for x in LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.keys() - if x != LiveCartaConfig.DEFAULT_FONT_NAME], + "font-family": [], "font-size": [], "font-weight": ["bold", "600", "700", "800", "900"], # "font-style": ["italic"], # @@ -118,15 +117,14 @@ LIVECARTA_STYLE_ATTRS = { Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated -to suit livecarta style convention. +to suit LiveCarta style convention. """ LIVECARTA_STYLE_ATTRS_MAPPING = { "text-indent": convert_indents_tag_values, "font-variant": lambda x: x, "text-align": lambda x: x, "font": lambda x: "", - "font-family": lambda x: LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x.title())) - or LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(re.sub(r"^\s+|\s+$", "", x)), + "font-family": lambda x: x, "font-size": convert_tag_style_values, "color": get_text_color, "background-color": get_bg_color, @@ -146,6 +144,15 @@ LIVECARTA_STYLE_ATTRS_MAPPING = { } +def style_conditions(style_value, style_name): + cleaned_value = style_value.replace("\"", "") + constraints_on_value = LIVECARTA_STYLE_ATTRS.get( + style_name) + value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ + style_name] + return cleaned_value, constraints_on_value, value_not_in_possible_values_list + + def update_inline_styles_to_livecarta_convention(split_style: list): for i, style in enumerate(split_style): style_name, style_value = style.split(":") @@ -154,11 +161,8 @@ def update_inline_styles_to_livecarta_convention(split_style: list): split_style[i] = "" return split_style - cleaned_value = 
style_value.replace("\"", "").split()[-1] - constraints_on_value = LIVECARTA_STYLE_ATTRS.get( - style_name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_name] + cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ + style_conditions(style_value, style_name) if constraints_on_value and value_not_in_possible_values_list: # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file split_style[i] = "" @@ -172,7 +176,7 @@ def update_inline_styles_to_livecarta_convention(split_style: list): def build_inline_style_content(style: str) -> str: - """Build inline style with livecarta convention""" + """Build inline style with LiveCarta convention""" # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", style) # when we split style by ";", last element of the list is "" - None @@ -189,16 +193,15 @@ def build_inline_style_content(style: str) -> str: def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule, style_type: cssutils.css.property.Property): + if style_type.name == "font-family": + pass if style_type.name not in LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = "" return - cleaned_value = style_type.value.replace("\"", "") - constraints_on_value = LIVECARTA_STYLE_ATTRS.get( - style_type.name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_type.name] + cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ + style_conditions(style_type.value, style_type.name) if constraints_on_value and value_not_in_possible_values_list: # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = "" @@ -210,7 +213,7 @@ def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRul def build_css_file_content(css_content: str) -> str: - """Build css 
content with livecarta convention""" + """Build css content with LiveCarta convention""" sheet = cssutils.parseString(css_content, validate=False) for css_rule in sheet: diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 9929cda..9a94545 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -23,20 +23,6 @@ class LiveCartaConfig: FONT_CONVERT_RATIO = LIVECARTA_DEFAULT_FONT_SIZE /\ WORD_DEFAULT_FONT_SIZE - FONT_CORRESPONDANCE_TABLE = { - "Arial": "arial,helvetica,sans-serif", - "Comic Sans MS": "comic sans ms,cursive", - "Courier New": "courier new,courier,monospace", - "Georgia": "georgia,serif", - "Lucida Sans Unicode": "lucida sans unicode,lucida grande,sans-serif", - "Tahoma": "tahoma,geneva,sans-serif", - "Times New Roman": "times new roman,times,serif", - "Trebuchet MS": "trebuchet ms,helvetica,sans-serif", - "Verdana": "verdana,geneva,sans-serif", - "monospace": "courier new,courier,monospace", - "sans-serif": "arial,helvetica,sans-serif" - } - COLORS_MAP = { "#ffff00": "yellow", "#00ff00": "darkYellow", @@ -116,8 +102,8 @@ class LiveCartaConfig: } WRAP_TAGS_WITH_TABLE = { - ("div",) :["width", "border", "bgcolor"], - ("section", "blockquote",) : ("class", r"feature[1234]"), + ("div",): ["width", "border", "bgcolor"], + ("section", "blockquote",): ("class", r"feature[1234]"), } """('what to replace', 'parent tag', 'child tag')""" From 114ac78eb0d6f74163be8b01ea12c9cd138a3a2a Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 28 Jun 2022 16:39:50 +0300 Subject: [PATCH 28/55] refactor with PEP8 --- src/epub_converter/epub_converter.py | 5 +- src/epub_converter/html_epub_preprocessor.py | 169 ++++++++++++++++--- 2 files changed, 144 insertions(+), 30 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index ca7b69f..1ecc7a1 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -497,7 +497,7 @@ class EpubConverter: id wraps chapter"s content + 
subchapters" content id points to the start of title of a chapter - In all cases we know where chapter starts. Therefore, chapter is all tags between chapter"s id + In all cases we know where chapter starts. Therefore, chapter is all tags between chapter's id and id of the next chapter/subchapter Parameters ---------- @@ -539,6 +539,7 @@ class EpubConverter: lvl: int level of chapter + Returns ------- ChapterItem @@ -597,7 +598,7 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9781614382264.epub" + epub_file_path = "../../epub/9781641050234.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index f9c2c06..3f762b4 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -21,7 +21,7 @@ def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSou """ def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list): - """Function inserts span before tag aren't supported by livecarta""" + """Function inserts span before tag aren't supported by LiveCarta""" new_tag = chapter_tag.new_tag("span") new_tag.attrs["id"] = id_ or "" new_tag.attrs["class"] = class_ or "" @@ -77,22 +77,57 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu def prepare_title(title_of_chapter: str) -> str: - """Function finalise processing/cleaning title""" - title_str = BeautifulSoup(title_of_chapter, features="lxml").string + """ + Function finalise processing/cleaning title + Parameters + ---------- + title_of_chapter: str + + Returns + ------- + title: str + cleaned title + + """ + title = BeautifulSoup(title_of_chapter, features="lxml").string # clean extra whitespace characters ([\r\n\t\f\v ]) - title_str = re.sub(r"[\s\xa0]", " ", title_str).strip() - return 
title_str + title = re.sub(r"[\s\xa0]", " ", title).strip() + return title def _remove_comments(chapter_tag): + """ + Function remove comments + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without comments + + """ for tag in chapter_tag.find_all(): for element in tag(text=lambda text: isinstance(text, Comment)): element.extract() def _wrap_strings_with_p(chapter_tag): - # Headings that are not supported by livecarta converts to

    - # wrap NavigableString with

    + """ + Function converts headings that aren't supported by LiveCarta with

    + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with wrapped NavigableStrings + + """ for node in chapter_tag: if isinstance(node, NavigableString): content = str(node) @@ -104,7 +139,19 @@ def _wrap_strings_with_p(chapter_tag): def _wrap_tags_with_table(chapter_tag): - """Function wraps with

    """ + """ + Function wraps with
    + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with wrapped certain tags with
    + + """ def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): table = chapter_tag.new_tag("table") table.attrs["border"], table.attrs["align"], table.attrs["style"] \ @@ -141,7 +188,19 @@ def _wrap_tags_with_table(chapter_tag): def _tags_to_correspond_livecarta_tag(chapter_tag): - """Function to replace all tags to correspond livecarta tags""" + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags + + """ for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items(): for key in reg_keys: if isinstance(key, tuple): @@ -164,12 +223,23 @@ def _tags_to_correspond_livecarta_tag(chapter_tag): def _unwrap_tags(chapter_tag): - """Function unwrap tags and move id to span""" + """ + Function unwrap tags and moves id to span + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with unwrapped certain tags + + """ for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP: for tag in chapter_tag.select(tag_name): # if tag is a subtag if ">" in tag_name: - parent = tag.parent tag.parent.attrs.update(tag.attrs) _add_span_to_save_ids_for_links(tag, chapter_tag) tag.unwrap() @@ -178,8 +248,8 @@ def _unwrap_tags(chapter_tag): def _remove_headings_content(content_tag, title_of_chapter: str): """ Function - clean/remove headings from chapter in order to avoid duplication of chapter titles in the content - add span with id in order to + - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content + - adds span with id in order to Parameters ---------- content_tag: soup object @@ -210,8 +280,20 @@ def _remove_headings_content(content_tag, title_of_chapter: str): break -def _preprocess_table(chapter_tag: 
BeautifulSoup): - """Function to preprocess tables and tags(td|th|tr): style""" +def _process_table(chapter_tag: BeautifulSoup): + """ + Function preprocesses tables and tags(td|th|tr) + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with processed tables + + """ tables = chapter_tag.find_all("table") for table in tables: for t_tag in table.find_all(re.compile("td|th|tr")): @@ -236,6 +318,19 @@ def _preprocess_table(chapter_tag: BeautifulSoup): def _insert_tags_in_parents(chapter_tag): + """ + Function inserts tags into correspond tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with inserted tags + + """ parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()} for parent_tag_name, condition in parent_tag2condition.items(): for parent_tag in chapter_tag.select(parent_tag_name): @@ -252,6 +347,19 @@ def _insert_tags_in_parents(chapter_tag): def _class_removing(chapter_tag): + """ + Function removes classes that aren't created by converter + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without original classes of the book + + """ for tag in chapter_tag.find_all(recursive=True): if tag.attrs.get("class") \ and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): @@ -271,9 +379,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro Steps ---------- - 1. heading removal - 2. processing tags - 3. class removal + 1. comments removal + 2. wrap NavigableString with tag

    + 3. wrap tags with

    + 4. replace tags with correspond LiveCarta tags + 5. unwrap tags + 6. heading removal + 7. process_table + 8. insert tags into correspond tags + 9. class removal Returns ------- @@ -284,23 +398,22 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro # 1. remove comments _remove_comments(content_tag) - # 2. wrap NavigableString with tag

    + # 2. _wrap_strings_with_p(content_tag) - + # 3. _wrap_tags_with_table(content_tag) - + # 4. _tags_to_correspond_livecarta_tag(content_tag) - + # 5. _unwrap_tags(content_tag) - - # 3. heading removal + # 6. if remove_title_from_chapter: _remove_headings_content(content_tag, title_str) - - # 4. processing tags (

  • ,
  • , ,
    , 
    , ) - _preprocess_table(content_tag) + # 7. + _process_table(content_tag) + # 8. _insert_tags_in_parents(content_tag) - # 5. remove classes that weren't created by converter + # 9. remove classes that weren't created by converter _class_removing(content_tag) return str(content_tag) From 687c09417a5cd4359d2348c9c06db2c114470b02 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 7 Jul 2022 19:31:16 +0300 Subject: [PATCH 29/55] css processing formatting --- src/docx_converter/footnotes_processing.py | 2 +- src/epub_converter/css_preprocessing.py | 237 ------------------ src/epub_converter/css_preprocessor.py | 186 ++++++++++++++ src/epub_converter/footnotes_processing.py | 8 +- ...erter.py => tag_inline_style_processor.py} | 44 +--- src/livecarta_config.py | 56 +++-- 6 files changed, 231 insertions(+), 302 deletions(-) delete mode 100644 src/epub_converter/css_preprocessing.py create mode 100644 src/epub_converter/css_preprocessor.py rename src/epub_converter/{tag_css_style_converter.py => tag_inline_style_processor.py} (84%) diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index 84861d7..beb6d15 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -1,5 +1,5 @@ import re -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString @staticmethod def _clean_footnote_content(content): diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py deleted file mode 100644 index 0ad0ff7..0000000 --- a/src/epub_converter/css_preprocessing.py +++ /dev/null @@ -1,237 +0,0 @@ -import re -import cssutils - -from ebooklib import epub -from bs4 import BeautifulSoup -from itertools import takewhile - -from src.util.color_reader import str2hex -from src.livecarta_config import LiveCartaConfig - - -def get_text_color(x): - color = str2hex(x) - color = color if color not in ["#000000", "#000", "black"] else 
"" - return color - - -def get_bg_color(x): - color = str2hex(x) - color = color if color not in ["#ffffff", "#fff", "white"] else "" - return color - - -def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str: - """ - Function - - converts values of tags from em/%/pt to px - - find closest font-size px - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - converted value size - """ - size_regexp = re.compile( - r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") - has_style_attrs = re.search(size_regexp, size_value) - if has_style_attrs: - if has_style_attrs.group(1): - multiplier = 5.76 if is_indent else 0.16 - size_value = float(size_value.replace("%", "")) * multiplier - return str(size_value)+'px' - elif has_style_attrs.group(3): - multiplier = 18 if is_indent else 16 - size_value = float(size_value.replace("em", "")) * multiplier - return str(size_value)+'px' - elif has_style_attrs.group(5): - size_value = float(size_value.replace("pt", "")) * 4/3 - return str(size_value)+'px' - else: - return "" - return size_value - - -def convert_indents_tag_values(size_value: str) -> str: - """ - Function converts values of ["text-indent", "margin-left", "margin"] - Parameters - ---------- - size_value: str - - Returns - ------- - size_value: str - - """ - if len(size_value.split(" ")) == 3: - size_value = convert_tag_style_values(size_value.split( - " ")[-2], True) # returns middle value - else: - size_value = convert_tag_style_values(size_value.split( - " ")[-1], True) # returns last value - return size_value - - -""" -Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } -Style properties that can be used to fit LiveCarta css style convention. -If property has empty list, it means that any value can be converted. -If property has not empty list, it means that only certain property-value combinations can be transformed. 
-""" -LIVECARTA_STYLE_ATTRS = { - "text-indent": [], - "font-variant": ["small-caps"], - "text-align": [x for x in LiveCartaConfig.ALIGN_STYLES if x != LiveCartaConfig.DEFAULT_ALIGN_STYLE], - "align": [], - "font": [], - "font-family": [], - "font-size": [], - "font-weight": ["bold", "600", "700", "800", "900"], # - "font-style": ["italic"], # - "text-decoration": ["underline", "line-through"], # , - "text-decoration-line": ["underline", "line-through"], # , - "vertical-align": ["super"], # - "color": [], - "background-color": [], - "background": [], - "width": [], - "border": [], - "border-top-width": [], - "border-right-width": [], - "border-left-width": [], - "border-bottom-width": [], - "border-top": [], - "border-bottom": [], - "list-style-type": [], - "list-style-image": [], - "margin-left": [], - "margin-top": [], - "margin": [], -} - -""" -Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } - -Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated -to suit LiveCarta style convention. 
-""" -LIVECARTA_STYLE_ATTRS_MAPPING = { - "text-indent": convert_indents_tag_values, - "font-variant": lambda x: x, - "text-align": lambda x: x, - "font": lambda x: "", - "font-family": lambda x: x, - "font-size": convert_tag_style_values, - "color": get_text_color, - "background-color": get_bg_color, - "background": get_bg_color, - "border": lambda x: x if x != "0" else "", - "border-top-width": lambda x: x if x != "0" else "", - "border-right-width": lambda x: x if x != "0" else "", - "border-left-width": lambda x: x if x != "0" else "", - "border-bottom-width": lambda x: x if x != "0" else "", - "border-top": lambda x: x if x != "0" else "", - "border-bottom": lambda x: x if x != "0" else "", - "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", - "list-style-image": lambda x: "disc", - "margin-left": convert_indents_tag_values, - "margin-top": convert_tag_style_values, - "margin": convert_indents_tag_values, -} - - -def style_conditions(style_value, style_name): - cleaned_value = style_value.replace("\"", "") - constraints_on_value = LIVECARTA_STYLE_ATTRS.get( - style_name) - value_not_in_possible_values_list = cleaned_value not in LIVECARTA_STYLE_ATTRS[ - style_name] - return cleaned_value, constraints_on_value, value_not_in_possible_values_list - - -def update_inline_styles_to_livecarta_convention(split_style: list): - for i, style in enumerate(split_style): - style_name, style_value = style.split(":") - if style_name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" - return split_style - - cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ - style_conditions(style_value, style_name) - if constraints_on_value and value_not_in_possible_values_list: - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - split_style[i] = "" - else: - if style_name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data 
- func = LIVECARTA_STYLE_ATTRS_MAPPING[style_name] - style_value = func(cleaned_value) - split_style[i] = style_name + ":" + style_value - return split_style - - -def build_inline_style_content(style: str) -> str: - """Build inline style with LiveCarta convention""" - # replace all spaces between "; & letter" to ";" - style = re.sub(r"; *", ";", style) - # when we split style by ";", last element of the list is "" - None - # remove it - split_style: list = list(filter(None, style.split(";"))) - # replace all spaces between ": & letter" to ":" - split_style = [el.replace( - re.search(r"(:\s*)", el).group(1), ":") for el in split_style] - - split_style = update_inline_styles_to_livecarta_convention(split_style) - style = "; ".join(split_style) - return style - - -def update_css_styles_to_livecarta_convention(css_rule: cssutils.css.CSSStyleRule, - style_type: cssutils.css.property.Property): - if style_type.name == "font-family": - pass - if style_type.name not in LIVECARTA_STYLE_ATTRS: - # property not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = "" - return - - cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ - style_conditions(style_type.value, style_type.name) - if constraints_on_value and value_not_in_possible_values_list: - # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file - css_rule.style[style_type.name] = "" - else: - if style_type.name in LIVECARTA_STYLE_ATTRS_MAPPING: - # function that converts our data - func = LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] - css_rule.style[style_type.name] = func(cleaned_value) - - -def build_css_file_content(css_content: str) -> str: - """Build css content with LiveCarta convention""" - sheet = cssutils.parseString(css_content, validate=False) - - for css_rule in sheet: - if css_rule.type == css_rule.STYLE_RULE: - for style_type in css_rule.style: - update_css_styles_to_livecarta_convention( - css_rule, style_type) - - 
css_text: str = sheet._getCssText().decode() - return css_text - - -if __name__ == "__main__": - file = "../../epub/9781627222174.epub" - ebooklib_book = epub.read_epub(file) - css_ = ebooklib_book.get_item_with_href("css/epub.css") - css_ = css_.get_content().decode() - css_cleaned = build_css_file_content(css_) - html_ = ebooklib_book.get_item_with_href( - "pr01s05.xhtml").get_body_content().decode() - html_soup = BeautifulSoup(html_, features="lxml") diff --git a/src/epub_converter/css_preprocessor.py b/src/epub_converter/css_preprocessor.py new file mode 100644 index 0000000..57c0388 --- /dev/null +++ b/src/epub_converter/css_preprocessor.py @@ -0,0 +1,186 @@ +import re +import cssutils + +from src.util.helpers import BookLogger +from src.util.color_reader import str2hex +from src.livecarta_config import LiveCartaConfig + + +class CSSPreprocessor: + def __init__(self, logger=None): + self.logger: BookLogger = logger + """ + Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } + + Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated + to suit LiveCarta style convention. 
+ """ + self.LIVECARTA_STYLE_ATTRS_MAPPING = { + "text-indent": self.convert_indents_tag_values, + "font-variant": lambda x: x, + "text-align": lambda x: x, + "font": lambda x: "", + "font-family": lambda x: x, + "font-size": self.convert_tag_style_values, + "color": self.get_text_color, + "background-color": self.get_bg_color, + "background": self.get_bg_color, + "border": lambda x: x if x != "0" else "", + "border-top-width": lambda x: x if x != "0" else "", + "border-right-width": lambda x: x if x != "0" else "", + "border-left-width": lambda x: x if x != "0" else "", + "border-bottom-width": lambda x: x if x != "0" else "", + "border-top": lambda x: x if x != "0" else "", + "border-bottom": lambda x: x if x != "0" else "", + "list-style-type": lambda x: x if x in LiveCartaConfig.list_types else "disc", + "list-style-image": lambda x: "disc", + "margin-left": self.convert_indents_tag_values, + "margin-top": self.convert_tag_style_values, + "margin": self.convert_indents_tag_values, + } + + @staticmethod + def get_text_color(x): + color = str2hex(x) + color = color if color not in ["#000000", "#000", "black"] else "" + return color + + @staticmethod + def get_bg_color(x): + color = str2hex(x) + color = color if color not in ["#ffffff", "#fff", "white"] else "" + return color + + @staticmethod + def convert_tag_style_values(size_value: str, is_indent: bool = False) -> str: + """ + Function + - converts values of tags from em/%/pt to px + - find closest font-size px + Parameters + ---------- + size_value: str + + is_indent: bool + + Returns + ------- + size_value: str + converted value size + """ + size_regexp = re.compile( + r"(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)") + has_style_attrs = re.search(size_regexp, size_value) + if has_style_attrs: + if has_style_attrs.group(1): + multiplier = 5.76 if is_indent else 0.16 + size_value = float(size_value.replace("%", "")) * multiplier + return str(size_value)+'px' + elif has_style_attrs.group(3): + 
multiplier = 18 if is_indent else 16 + size_value = float(size_value.replace("em", "")) * multiplier + return str(size_value)+'px' + elif has_style_attrs.group(5): + size_value = float(size_value.replace("pt", "")) * 4/3 + return str(size_value)+'px' + else: + return "" + return size_value + + def convert_indents_tag_values(self, size_value: str) -> str: + """ + Function converts values of ["text-indent", "margin-left", "margin"] + Parameters + ---------- + size_value: str + + Returns + ------- + size_value: str + + """ + if len(size_value.split(" ")) == 3: + size_value = self.convert_tag_style_values(size_value.split( + " ")[-2], True) # returns middle value + else: + size_value = self.convert_tag_style_values(size_value.split( + " ")[-1], True) # returns last value + return size_value + + @staticmethod + def style_conditions(style_value, style_name): + cleaned_value = style_value.replace("\"", "") + constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( + style_name) + value_not_in_possible_values_list = cleaned_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ + style_name] + return cleaned_value, constraints_on_value, value_not_in_possible_values_list + + def update_inline_styles_to_livecarta_convention(self, split_style: list): + for i, style in enumerate(split_style): + style_name, style_value = style.split(":") + if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = "" + return split_style + + cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ + self.style_conditions(style_value, style_name) + if constraints_on_value and value_not_in_possible_values_list: + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + split_style[i] = "" + else: + if style_name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_name] + style_value = 
func(cleaned_value) + split_style[i] = style_name + ":" + style_value + return split_style + + def build_inline_style_content(self, style: str) -> str: + """Build inline style with LiveCarta convention""" + # replace all spaces between "; & letter" to ";" + style = re.sub(r"; *", ";", style) + # when we split style by ";", last element of the list is "" - None + # remove it + split_style: list = list(filter(None, style.split(";"))) + # replace all spaces between ": & letter" to ":" + split_style = [el.replace( + re.search(r"(:\s*)", el).group(1), ":") for el in split_style] + + split_style = self.update_inline_styles_to_livecarta_convention(split_style) + style = "; ".join(split_style) + return style + + def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, + style_type: cssutils.css.property.Property): + if style_type.name == "font-family": + pass + if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: + # property not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = "" + return + + cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ + self.style_conditions(style_type.value, style_type.name) + if constraints_on_value and value_not_in_possible_values_list: + # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file + css_rule.style[style_type.name] = "" + else: + if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING: + # function that converts our data + func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] + css_rule.style[style_type.name] = func(cleaned_value) + + def build_css_file_content(self, css_content: str) -> str: + """Build css content with LiveCarta convention""" + sheet = cssutils.parseString(css_content, validate=False) + + for css_rule in sheet: + if css_rule.type == css_rule.STYLE_RULE: + for style_type in css_rule.style: + self.update_css_styles_to_livecarta_convention( + css_rule, style_type) + + css_text: str = 
sheet._getCssText().decode() + return css_text diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index ef2eac0..ae568e0 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -1,5 +1,5 @@ +import re from typing import Tuple - from bs4 import BeautifulSoup, Tag @@ -84,4 +84,10 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note footnote_tag = footnote_tag.find( attrs={"role": "doc-backlink"}) or footnote_tag new_footnotes_tags.append(footnote_tag) + + for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)): + noteref.attrs["data-id"] = i + 1 + noteref.attrs["id"] = f"footnote-{i + 1}" + footnote.attrs["href"] = f"#footnote-{i + 1}" + return footnotes, new_noterefs_tags, new_footnotes_tags diff --git a/src/epub_converter/tag_css_style_converter.py b/src/epub_converter/tag_inline_style_processor.py similarity index 84% rename from src/epub_converter/tag_css_style_converter.py rename to src/epub_converter/tag_inline_style_processor.py index 1032d49..c4e0b45 100644 --- a/src/epub_converter/tag_css_style_converter.py +++ b/src/epub_converter/tag_inline_style_processor.py @@ -4,15 +4,13 @@ from typing import List from logging import CRITICAL from bs4 import BeautifulSoup -from premailer import transform from src.livecarta_config import LiveCartaConfig -from src.epub_converter.css_preprocessing import LIVECARTA_STYLE_ATTRS cssutils.log.setLevel(CRITICAL) -class TagStyleConverter: +class TagInlineStyleProcessor: def __init__(self, tag_inline_style): # tag with inline style + style parsed from css file self.tag_inline_style = tag_inline_style @@ -190,7 +188,7 @@ class TagStyleConverter: for tag, styles in LiveCartaConfig.LIVECARTA_STYLES_CAN_BE_IN_TAG.items() if re.match(tag, initial_tag.name) for style in styles] - styles_cant_be_in_tag = [attr for attr in LIVECARTA_STYLE_ATTRS + styles_cant_be_in_tag = [attr for 
attr in LiveCartaConfig.LIVECARTA_STYLE_ATTRS if attr not in styles_can_be_in_tag] span_style = initial_tag.attrs["style"] # here check that this style is exactly the same. @@ -218,41 +216,3 @@ class TagStyleConverter: self.change_attrs_with_corresponding_tags() self.wrap_span_in_tag_to_save_style_attrs(self.tag_inline_style) return self.tag_inline_style - - -def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: - """ - Function adds styles from .css to inline style. - Parameters - ---------- - html_soup: BeautifulSoup - html page with inline style - css_text: str - css content from css file - Returns - ------- - inline_soup: BeautifulSoup - soup with styles from css - - """ - # remove this specification because it causes problems - css_text = css_text.replace( - '@namespace epub "http://www.idpf.org/2007/ops";', '') - # here we add css styles to inline style - html_with_css_styles: str = transform(str(html_soup), css_text=css_text, - remove_classes=False, - external_styles=False, - allow_network=False, - disable_validation=True, - ) - # soup with converted styles from css - inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") - - tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={"style": re.compile(".*")}) - - # go through the tags with inline style + style parsed from css file - for tag_inline_style in tags_with_inline_style: - style_converter = TagStyleConverter(tag_inline_style) - style_converter.convert_initial_tag() - return inline_soup diff --git a/src/livecarta_config.py b/src/livecarta_config.py index 9a94545..9ae2d40 100644 --- a/src/livecarta_config.py +++ b/src/livecarta_config.py @@ -101,25 +101,39 @@ class LiveCartaConfig: r"(^h[1-9]$)": ["list-style-type"] } - WRAP_TAGS_WITH_TABLE = { - ("div",): ["width", "border", "bgcolor"], - ("section", "blockquote",): ("class", r"feature[1234]"), - } - - """('what to replace', 'parent tag', 'child 
tag')""" - REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS = { - (r"^h[6-9]$", "^figure$", "^section$", "^div$"): "p", - ("^aside$",): "blockquote", - ("^header$", "^footer$", ("child", ":not(pre)", "code, kbd, var")): "span", - ("^b$",): "strong", - # (("parent", ":not(pre)", "code")): "p", - } - - """ > == in (p in li)""" - TAGS_TO_UNWRAP = [ - "section", "article", "figcaption", "main", "body", "html", "li > p", - ] - - INSERT_TAG_IN_PARENT_TAG = { - ("pre", "code, kbd, var"): "code", + """ + Dictionary LIVECARTA_STYLE_ATTRS = { css property: value } + Style properties that can be used to fit LiveCarta css style convention. + If property has empty list, it means that any value can be converted. + If property has not empty list, it means that only certain property-value combinations can be transformed. + """ + LIVECARTA_STYLE_ATTRS = { + "text-indent": [], + "font-variant": ["small-caps"], + "text-align": [x for x in ["justify", "right", "center", "left"] if x != "left"], + "align": [], + "font": [], + "font-family": [], + "font-size": [], + "font-weight": ["bold", "600", "700", "800", "900"], # + "font-style": ["italic"], # + "text-decoration": ["underline", "line-through"], # , + "text-decoration-line": ["underline", "line-through"], # , + "vertical-align": ["super"], # + "color": [], + "background-color": [], + "background": [], + "width": [], + "border": [], + "border-top-width": [], + "border-right-width": [], + "border-left-width": [], + "border-bottom-width": [], + "border-top": [], + "border-bottom": [], + "list-style-type": [], + "list-style-image": [], + "margin-left": [], + "margin-top": [], + "margin": [], } From c4752a19db765a81b0ae8557663ed883aabd8438 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 7 Jul 2022 19:32:24 +0300 Subject: [PATCH 30/55] add processing of JSON presets --- src/docx_converter/image_processing.py | 1 - src/epub_converter/epub_converter.py | 149 +++- src/epub_converter/epub_solver.py | 16 +- 
src/epub_converter/html_epub_preprocessor.py | 733 +++++++++---------- src/preset_processor.py | 15 + 5 files changed, 497 insertions(+), 417 deletions(-) create mode 100644 src/preset_processor.py diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py index 923a274..e593312 100644 --- a/src/docx_converter/image_processing.py +++ b/src/docx_converter/image_processing.py @@ -1,5 +1,4 @@ import os -import logging import pathlib from shutil import copyfile diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 1ecc7a1..525fad3 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -4,33 +4,34 @@ import codecs import os from os.path import dirname, normpath, join from itertools import chain +from premailer import transform from collections import defaultdict from typing import Dict, Union, List - import ebooklib from ebooklib import epub from ebooklib.epub import Link, Section -from bs4 import BeautifulSoup, Tag - +from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger +from src.preset_processor import PresetProcessor +from src.epub_converter.css_preprocessor import CSSPreprocessor +from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.image_processing import update_images_src_links from src.epub_converter.footnotes_processing import preprocess_footnotes -from src.epub_converter.css_preprocessing import build_inline_style_content, build_css_file_content -from src.epub_converter.tag_css_style_converter import convert_html_soup_with_css_style -from src.epub_converter.html_epub_preprocessor import get_tags_between_chapter_marks,\ - prepare_title, prepare_content +from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcessor class EpubConverter: - def 
__init__(self, file_path, access=None, logger=None): + def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None): self.file_path = file_path self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file_path) + self.css_processor = css_preprocessor + self.html_preprocessor = html_processor # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} @@ -74,25 +75,15 @@ class EpubConverter: self.process_inline_styles_in_html_soup() self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log("CSS styles adding.") + self.logger.log("CSS styles adding.") self.add_css_styles_to_html_soup() - # todo presets - self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: - content, noterefs, footnotes_tags = preprocess_footnotes(self.html_href2html_body_soup[href], - self.html_href2html_body_soup) - self.footnotes_contents.extend(content) - self.noterefs.extend(noterefs) - self.footnotes.extend(footnotes_tags) - - for i, (noteref, footnote) in enumerate(zip(self.noterefs, self.footnotes)): - noteref.attrs["data-id"] = i + 1 - noteref.attrs["id"] = f"footnote-{i + 1}" - footnote.attrs["href"] = f"#footnote-{i + 1}" - + self.footnotes_contents, self.noterefs, self.footnotes =\ + preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") + self.logger.log("TOC processing.") self.build_adjacency_list_from_toc(self.ebooklib_book.toc) # build simple toc from spine if needed @@ -101,6 +92,7 @@ class EpubConverter: not_added = [ x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc] self.logger.log(f"Html documents not added to TOC: {not_added}.") + self.logger.log(f"Add documents not added to TOC.") 
self.add_not_added_files_to_adjacency_list(not_added) self.logger.log(f"Html internal links and structure processing.") self.label_chapters_ids_with_lc_id() @@ -149,7 +141,7 @@ class EpubConverter: for tag_initial_inline_style in tags_with_inline_style: inline_style = tag_initial_inline_style.attrs["style"] tag_initial_inline_style.attrs["style"] = \ - build_inline_style_content(inline_style) + self.css_processor.build_inline_style_content(inline_style) def build_html_and_css_relations(self) -> tuple[dict, dict]: """ @@ -181,16 +173,53 @@ class EpubConverter: html_href2css_href[html_href].append(css_href) if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict - css_href2css_content[css_href] = build_css_file_content( + css_href2css_content[css_href] = self.css_processor.build_css_file_content( self.get_css_content(css_href, html_href)) for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string html_href2css_href[html_href].append(f"href{i}") - css_href2css_content[f"href{i}"] = build_css_file_content( + css_href2css_content[f"href{i}"] = self.css_processor.build_css_file_content( css_content) return html_href2css_href, css_href2css_content + def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: + """ + Function adds styles from .css to inline style. 
+ Parameters + ---------- + html_soup: BeautifulSoup + html page with inline style + css_text: str + css content from css file + Returns + ------- + inline_soup: BeautifulSoup + soup with styles from css + + """ + # remove this specification because it causes problems + css_text = css_text.replace( + '@namespace epub "http://www.idpf.org/2007/ops";', '') + # here we add css styles to inline style + html_with_css_styles: str = transform(str(html_soup), css_text=css_text, + remove_classes=False, + external_styles=False, + allow_network=False, + disable_validation=True, + ) + # soup with converted styles from css + inline_soup = BeautifulSoup(html_with_css_styles, features="lxml") + + tags_with_inline_style = inline_soup.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + # go through the tags with inline style + style parsed from css file + for tag_inline_style in tags_with_inline_style: + style_converter = TagInlineStyleProcessor(tag_inline_style) + style_converter.convert_initial_tag() + return inline_soup + def add_css_styles_to_html_soup(self): """ This function is designed to update html_href2html_body_soup @@ -203,7 +232,7 @@ class EpubConverter: for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - html_content = convert_html_soup_with_css_style(html_content, css) + html_content = self.convert_html_soup_with_css_style(html_content, css) self.html_href2html_body_soup[html_href] = html_content def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): @@ -488,6 +517,48 @@ class EpubConverter: f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." 
f" Old id={a_tag_id}") + @staticmethod + def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: + """ + After processing on a first_id that corresponds to current chapter, + from initial html_soup all tags from current chapter are extracted + Parameters + ---------- + first_id: str + Id that point where a chapter starts. A Tag with class: "converter-chapter-mark" + href: str + Name of current chapters file + html_soup: Tag + Soup object of current file + + Returns + ------- + tags: list [Tag, NavigableString] + Chapter's tags + + """ + marked_tags = html_soup.find( + attrs={"id": first_id, "class": "converter-chapter-mark"}) + if marked_tags: + next_tag = marked_tags.next_sibling + tags = [] + while next_tag: + if not isinstance(next_tag, NavigableString) and \ + (next_tag.attrs.get("class") == "converter-chapter-mark"): + break + tags.append(next_tag) + next_tag = next_tag.next_sibling + + # remove tags between first_id and next found id + # save them in list for next steps + tags = [tag.extract() for tag in tags] + html_soup.smooth() + + else: + assert 0, f"Warning: no match for {first_id, href}" + + return tags + def detect_one_chapter(self, nav_point: NavPoint): """ Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) @@ -511,11 +582,11 @@ class EpubConverter: """ if nav_point.id: soup = self.html_href2html_body_soup[nav_point.href] - chapter_tags = get_tags_between_chapter_marks( + subchapter_tags = self.get_tags_between_chapter_marks( first_id=nav_point.id, href=nav_point.href, html_soup=soup) new_tree = BeautifulSoup("", "html.parser") - for tag in chapter_tags: - new_tree.append(tag) + for subchapter_tag in subchapter_tags: + new_tree.append(subchapter_tag) self.href_chapter_id2soup_html[( nav_point.href, nav_point.id)] = new_tree @@ -527,8 +598,8 @@ class EpubConverter: """Function build chapters content, starts from top level chapters""" top_level_nav_points = 
self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: - for point in top_level_nav_points: - self.detect_one_chapter(point) + for tl_nav_point in top_level_nav_points: + self.detect_one_chapter(tl_nav_point) def html_node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem: """ @@ -561,9 +632,9 @@ class EpubConverter: if hasattr(self.file_path, "stem") else "book_id") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS - title_preprocessed = prepare_title(title) - content_preprocessed = prepare_content(title_preprocessed, content, - remove_title_from_chapter=is_chapter) + title_preprocessed = self.html_preprocessor.prepare_title(title) + content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content, + remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -598,11 +669,17 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9781641050234.epub" + epub_file_path = "../../epub/Modern_Java_in_Action.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) - json_converter = EpubConverter(epub_file_path, logger=logger_object) + preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\ + .get_preset_json() + css_preprocessor = CSSPreprocessor(logger=logger_object) + html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object) + + json_converter = EpubConverter(epub_file_path, logger=logger_object, + css_preprocessor=css_preprocessor, html_processor=html_preprocessor) content_dict = json_converter.convert_to_dict() with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index 8e92a40..c1bb800 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,4 +1,7 @@ from 
src.book_solver import BookSolver +from src.preset_processor import PresetProcessor +from src.epub_converter.css_preprocessor import CSSPreprocessor +from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor from src.epub_converter.epub_converter import EpubConverter @@ -14,8 +17,10 @@ class EpubBook(BookSolver): Function Steps ---------- - 1. Converts .epub to .html - 2. Parses from line structure to nested structure + 1. Gets data from preset structure + 2. Add preset to html preprocessor + 3. Converts .epub to .html + 4. Parses from line structure to nested structure Returns ---------- @@ -23,7 +28,12 @@ class EpubBook(BookSolver): json for LiveCarta platform """ + preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\ + .get_preset_json() + css_preprocessor = CSSPreprocessor(logger=self.logger_object) + html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) json_converter = EpubConverter( - self.file_path, access=self.access, logger=self.logger_object) + self.file_path, access=self.access, logger=self.logger_object, + css_preprocessor=css_preprocessor, html_processor=html_preprocessor) content_dict = json_converter.convert_to_dict() return content_dict diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 3f762b4..3ddc532 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -1,419 +1,398 @@ import re +from bs4 import BeautifulSoup, NavigableString, Comment, Tag -from bs4 import BeautifulSoup, NavigableString, Tag, Comment - -from src.livecarta_config import LiveCartaConfig +from src.util.helpers import BookLogger -def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): - """ - Function adds span with id from tag_to_be_removed - because this tag will be removed(unwrapped/extract) - Parameters - ---------- - tag_to_be_removed: Soup object - 
chapter_tag: BeautifulSoup +class HtmlEpubPreprocessor: + def __init__(self, preset, logger=None): + self.preset = preset + self.logger: BookLogger = logger + self.name2function = { + "table_wrapper": self._wrap_tags_with_table, + "replacer": self._tags_to_correspond_livecarta_tag, + "unwrapper": self._unwrap_tags, + "inserter": self._insert_tags_into_correspond_tags + } - Returns - ------- - None - updated body tag + @staticmethod + def _add_span_to_save_ids_for_links(tag_to_be_removed, chapter_tag: BeautifulSoup): + """ + Function adds span with id from tag_to_be_removed + because this tag will be removed(unwrapped/extract) + Parameters + ---------- + tag_to_be_removed: Soup object + chapter_tag: BeautifulSoup - """ - def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, class_: list): - """Function inserts span before tag aren't supported by LiveCarta""" - new_tag = chapter_tag.new_tag("span") - new_tag.attrs["id"] = id_ or "" - new_tag.attrs["class"] = class_ or "" - new_tag.string = "\xa0" - tag_to_be_removed.insert_before(new_tag) + Returns + ------- + None + updated body tag - if tag_to_be_removed.attrs.get("id"): - _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed, - id_=tag_to_be_removed.attrs["id"], - class_=tag_to_be_removed.attrs.get("class")) + """ + def _insert_span_with_attrs_before_tag(chapter_tag: BeautifulSoup, tag_to_be_removed: Tag, id_: str, + class_: list): + """Function inserts span before tag aren't supported by LiveCarta""" + new_tag = chapter_tag.new_tag("span") + new_tag.attrs["id"] = id_ or "" + new_tag.attrs["class"] = class_ or "" + new_tag.string = "\xa0" + tag_to_be_removed.insert_before(new_tag) -def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: - """ - After processing on a first_id that corresponds to current chapter, - from initial html_soup all tags from current chapter are extracted - 
Parameters - ---------- - first_id: str - Id that point where a chapter starts. A Tag with class: "converter-chapter-mark" - href: str - Name of current chapters file - html_soup: Tag - Soup object of current file + if tag_to_be_removed.attrs.get("id"): + _insert_span_with_attrs_before_tag(chapter_tag=chapter_tag, tag_to_be_removed=tag_to_be_removed, + id_=tag_to_be_removed.attrs["id"], + class_=tag_to_be_removed.attrs.get("class")) - Returns - ------- - tags: list [Tag, NavigableString] - Chapter's tags + @staticmethod + def prepare_title(title_of_chapter: str) -> str: + """ + Function finalise processing/cleaning title + Parameters + ---------- + title_of_chapter: str - """ - marked_tags = html_soup.find( - attrs={"id": first_id, "class": "converter-chapter-mark"}) - if marked_tags: - next_tag = marked_tags.next_sibling - tags = [] - while next_tag: - if not isinstance(next_tag, NavigableString) and \ - (next_tag.attrs.get("class") == "converter-chapter-mark"): - break - tags.append(next_tag) - next_tag = next_tag.next_sibling + Returns + ------- + title: str + cleaned title - # remove tags between first_id and next found id - # save them in list for next steps - tags = [tag.extract() for tag in tags] - html_soup.smooth() + """ + title = BeautifulSoup(title_of_chapter, features="lxml").string + # clean extra whitespace characters ([\r\n\t\f\v ]) + title = re.sub(r"[\s\xa0]", " ", title).strip() + return title - else: - assert 0, f"Warning: no match for {first_id, href}" + @staticmethod + def _remove_comments(chapter_tag): + """ + Function remove comments + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag - return tags + Returns + ------- + None + Chapter Tag without comments + """ + for tag in chapter_tag.find_all(): + for element in tag(text=lambda text: isinstance(text, Comment)): + element.extract() -def prepare_title(title_of_chapter: str) -> str: - """ - Function finalise processing/cleaning title - Parameters - 
---------- - title_of_chapter: str + @staticmethod + def _wrap_strings_with_p(chapter_tag): + """ + Function converts headings that aren't supported by LiveCarta with

    + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag - Returns - ------- - title: str - cleaned title + Returns + ------- + None + Chapter Tag with wrapped NavigableStrings - """ - title = BeautifulSoup(title_of_chapter, features="lxml").string - # clean extra whitespace characters ([\r\n\t\f\v ]) - title = re.sub(r"[\s\xa0]", " ", title).strip() - return title + """ + for node in chapter_tag: + if isinstance(node, NavigableString): + content = str(node) + content = re.sub(r"([\s\xa0])", " ", content).strip() + if content: + p_tag = chapter_tag.new_tag("p") + p_tag.append(str(node)) + node.replace_with(p_tag) + def _wrap_tags_with_table(self, chapter_tag, rules: list): + """ + Function wraps with

    + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag -def _remove_comments(chapter_tag): - """ - Function remove comments - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with wrapped certain tags with
    - Returns - ------- - None - Chapter Tag without comments + """ - """ - for tag in chapter_tag.find_all(): - for element in tag(text=lambda text: isinstance(text, Comment)): - element.extract() + def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): + table = chapter_tag.new_tag("table") + table.attrs["border"], table.attrs["align"], table.attrs["style"] \ + = border, "center", f"width:{width}%;" + tbody, tr, td = \ + chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") + td.attrs["bgcolor"] = bg_color + tag_to_be_wrapped.wrap(td) + td.wrap(tr) + tr.wrap(tbody) + tbody.wrap(table) + table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) + return table + def process_tag_using_table(tag_to_wrap): + _wrap_tag_with_table( + chapter_tag, + tag_to_be_wrapped=tag_to_wrap, + width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", + border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, + bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) + self._add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) + tag_to_wrap.unwrap() -def _wrap_strings_with_p(chapter_tag): - """ - Function converts headings that aren't supported by LiveCarta with

    - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with wrapped NavigableStrings - - """ - for node in chapter_tag: - if isinstance(node, NavigableString): - content = str(node) - content = re.sub(r"([\s\xa0])", " ", content).strip() - if content: - p_tag = chapter_tag.new_tag("p") - p_tag.append(str(node)) - node.replace_with(p_tag) - - -def _wrap_tags_with_table(chapter_tag): - """ - Function wraps with

    - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with wrapped certain tags with
    - - """ - def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): - table = chapter_tag.new_tag("table") - table.attrs["border"], table.attrs["align"], table.attrs["style"] \ - = border, "center", f"width:{width}%;" - tbody, tr, td = \ - chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") - td.attrs["bgcolor"] = bg_color - tag_to_be_wrapped.wrap(td) - td.wrap(tr) - tr.wrap(tbody) - tbody.wrap(table) - table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) - return table - - def process_tag_using_table(tag_to_wrap): - _wrap_tag_with_table( - chapter_tag, - tag_to_be_wrapped=tag_to_wrap, - width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", - border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, - bg_color=tag_to_wrap.attrs["bgcolor"] if tag_to_wrap.attrs.get("bgcolor") else None) - _add_span_to_save_ids_for_links(tag_to_wrap, chapter_tag) - tag_to_wrap.unwrap() - - for tags_to_wrap, attrs in LiveCartaConfig.WRAP_TAGS_WITH_TABLE.items(): - if isinstance(attrs, tuple): - attr, val = attrs[0], attrs[1] - for tag_to_wrap in chapter_tag.find_all(tags_to_wrap, {attr: re.compile(fr"{val}")}): - process_tag_using_table(tag_to_wrap) - else: - for tag_to_wrap in chapter_tag.find_all(tags_to_wrap): - if any(attr_name in attrs for attr_name in tag_to_wrap.attrs): + for rule in rules: + tags = rule["tags"] + for attr in rule["attrs"]: + for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): process_tag_using_table(tag_to_wrap) + @staticmethod + def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list): + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag -def _tags_to_correspond_livecarta_tag(chapter_tag): - """ - Function to replace all tags to correspond 
LiveCarta tags - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags - Returns - ------- - None - Chapter Tag with all tags replaced with LiveCarta tags - - """ - for reg_keys, to_replace_value in LiveCartaConfig.REPLACE_TAG_WITH_LIVECARTA_CORRESPOND_TAGS.items(): - for key in reg_keys: - if isinstance(key, tuple): - replace = key[0] - parent, child = key[1], key[2] - for parent_tag in chapter_tag.select(parent): - if replace == "parent": - parent_tag.name = to_replace_value - elif replace == "child": - for child_tag in parent_tag.select(child): - child_tag.name = to_replace_value - if not child_tag.attrs.get("style"): - child_tag.attrs["style"] =\ - "font-size: 14px; font-family: courier new,courier,monospace;" - else: - tags = chapter_tag.find_all(re.compile(key)) - for tag in tags: - # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) - tag.name = to_replace_value - - -def _unwrap_tags(chapter_tag): - """ - Function unwrap tags and moves id to span - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with unwrapped certain tags - - """ - for tag_name in LiveCartaConfig.TAGS_TO_UNWRAP: - for tag in chapter_tag.select(tag_name): - # if tag is a subtag - if ">" in tag_name: - tag.parent.attrs.update(tag.attrs) - _add_span_to_save_ids_for_links(tag, chapter_tag) - tag.unwrap() - - -def _remove_headings_content(content_tag, title_of_chapter: str): - """ - Function - - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content - - adds span with id in order to - Parameters - ---------- - content_tag: soup object - Tag of the page - title_of_chapter: str - Chapter title - - Returns - ------- - None - clean/remove headings & add span with id - - """ - title_of_chapter = title_of_chapter.lower() - for tag in content_tag.contents: - text = tag if isinstance(tag, NavigableString) else tag.text - if re.sub(r"[\s\xa0]", "", text): - text = re.sub(r"[\s\xa0]", " ", text).lower() - text = text.strip() # delete extra spaces - if title_of_chapter == text or \ - (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or content_tag.name)): - _add_span_to_save_ids_for_links(tag, content_tag) - tag.extract() - return - elif not isinstance(tag, NavigableString): - if not _remove_headings_content(tag, title_of_chapter): - break - - -def _process_table(chapter_tag: BeautifulSoup): - """ - Function preprocesses tables and tags(td|th|tr) - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with processed tables - - """ - tables = chapter_tag.find_all("table") - for table in tables: - for t_tag in table.find_all(re.compile("td|th|tr")): - width = "" - if t_tag.get("style"): - width_match = 
re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) - if width_match: - size = width_match.group(1) - width = size + "px" - - t_tag.attrs["width"] = t_tag.get("width") or width - - if t_tag.attrs.get("style"): - t_tag.attrs["style"] = t_tag.attrs["style"].replace( - "border:0;", "") - if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": - del t_tag.attrs["style"] - - if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: - table.attrs["border"] = "1" - - -def _insert_tags_in_parents(chapter_tag): - """ - Function inserts tags into correspond tags - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag - - Returns - ------- - None - Chapter Tag with inserted tags - - """ - parent_tag2condition = {parent[0]: parent[1] for parent in LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG.keys()} - for parent_tag_name, condition in parent_tag2condition.items(): - for parent_tag in chapter_tag.select(parent_tag_name): - if parent_tag.select(condition): - continue + """ + for rule in rules: + tags = rule["tags"] + tag_to_replace = rule["tag_to_replace"] + if rule["condition"]: + for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): + if condition_on_tag[0] == 'parent_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if tag.parent.select(condition_on_tag[1]): + tag.name = tag_to_replace + elif condition_on_tag[0] == 'child_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): + tag.name = tag_to_replace + elif condition_on_tag[0] == "attrs": + for attr in rule["condition"]["attrs"]: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + tag.name = tag_to_replace else: - tag_to_insert = chapter_tag.new_tag( - LiveCartaConfig.INSERT_TAG_IN_PARENT_TAG[(parent_tag_name, condition)]) - # insert all items that 
was in pre to code and remove from pre - for content in reversed(parent_tag.contents): - tag_to_insert.insert(0, content.extract()) - # wrap code with items - parent_tag.append(tag_to_insert) + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) + tag.name = tag_to_replace + def _unwrap_tags(self, chapter_tag, rules: dict): + """ + Function unwrap tags and moves id to span + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag -def _class_removing(chapter_tag): - """ - Function removes classes that aren't created by converter - Parameters - ---------- - chapter_tag: BeautifulSoup - Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with unwrapped certain tags - Returns - ------- - None - Chapter Tag without original classes of the book + """ + for tag_name in rules["tags"]: + for tag in chapter_tag.select(tag_name): + # if tag is a subtag + if ">" in tag_name: + tag.parent.attrs.update(tag.attrs) + self._add_span_to_save_ids_for_links(tag, chapter_tag) + tag.unwrap() - """ - for tag in chapter_tag.find_all(recursive=True): - if tag.attrs.get("class") \ - and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): - del tag.attrs["class"] + @staticmethod + def _insert_tags_into_correspond_tags(chapter_tag, rules: list): + """ + Function inserts tags into correspond tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + Returns + ------- + None + Chapter Tag with inserted tags -def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: - """ - Function finalise processing/cleaning content - Parameters - ---------- - title_str: str + """ - content_tag: Tag, soup object + def insert(tag, tag_to_insert): + # insert all items that was in tag to subtag and remove from tag + for content in reversed(tag.contents): + tag_to_insert.insert(0, content.extract()) + # wrap subtag with items + tag.append(tag_to_insert) - remove_title_from_chapter: bool + for rule in rules: + tags = rule["tags"] + tag_to_insert = \ + chapter_tag.new_tag(rule["tag_to_insert"]) + if rule["condition"]: + for condition_on_tag in ((k, v) for k, v in 
rule["condition"].items() if v): + if condition_on_tag[0] == 'parent_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if tag.parent.select(condition_on_tag[1]): + insert(tag, tag_to_insert) + elif condition_on_tag[0] == 'child_tags': + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): + insert(tag, tag_to_insert) + elif condition_on_tag[0] == "attrs": + for attr in rule["condition"]["attrs"]: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr["name"]: re.compile(fr"{attr['value']}")}): + insert(tag, tag_to_insert) + else: + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): + insert(tag, tag_to_insert) - Steps - ---------- - 1. comments removal - 2. wrap NavigableString with tag

    - 3. wrap tags with

    - 4. replace tags with correspond LiveCarta tags - 5. unwrap tags - 6. heading removal - 7. process_table - 8. insert tags into correspond tags - 9. class removal + def _remove_headings_content(self, content_tag, title_of_chapter: str): + """ + Function + - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content + - adds span with id in order to + Parameters + ---------- + content_tag: soup object + Tag of the page + title_of_chapter: str + Chapter title - Returns - ------- - content_tag: str - prepared content + Returns + ------- + None + clean/remove headings & add span with id - """ - # 1. remove comments - _remove_comments(content_tag) + """ + title_of_chapter = title_of_chapter.lower() + for tag in content_tag.contents: + text = tag if isinstance(tag, NavigableString) else tag.text + if re.sub(r"[\s\xa0]", "", text): + text = re.sub(r"[\s\xa0]", " ", text).lower() + text = text.strip() # delete extra spaces + if title_of_chapter == text or \ + (title_of_chapter in text and + re.findall(r"^h[1-3]$", tag.name or content_tag.name)): + self._add_span_to_save_ids_for_links(tag, content_tag) + tag.extract() + return + elif not isinstance(tag, NavigableString): + if not self._remove_headings_content(tag, title_of_chapter): + break - # 2. - _wrap_strings_with_p(content_tag) - # 3. - _wrap_tags_with_table(content_tag) - # 4. - _tags_to_correspond_livecarta_tag(content_tag) - # 5. - _unwrap_tags(content_tag) - # 6. - if remove_title_from_chapter: - _remove_headings_content(content_tag, title_str) - # 7. - _process_table(content_tag) - # 8. - _insert_tags_in_parents(content_tag) + @staticmethod + def _process_tables(chapter_tag: BeautifulSoup): + """ + Function preprocesses tables and tags(td|th|tr) + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag - # 9. 
remove classes that weren't created by converter - _class_removing(content_tag) - return str(content_tag) + Returns + ------- + None + Chapter Tag with processed tables + + """ + tables = chapter_tag.find_all("table") + for table in tables: + for t_tag in table.find_all(re.compile("td|th|tr")): + width = "" + if t_tag.get("style"): + width_match = re.search( + r"[^-]width: ?(\d+\.?\d*)(p[tx])", t_tag["style"]) + if width_match: + size = width_match.group(1) + width = size + "px" + + t_tag.attrs["width"] = t_tag.get("width") or width + + if t_tag.attrs.get("style"): + t_tag.attrs["style"] = t_tag.attrs["style"].replace( + "border:0;", "") + if re.sub(r"[\s\xa0]", "", t_tag.attrs.get("style")) == "": + del t_tag.attrs["style"] + + if not table.attrs.get("border") or table.attrs.get("border") in ["0", "0px"]: + table.attrs["border"] = "1" + + @staticmethod + def _class_removing(chapter_tag): + """ + Function removes classes that aren't created by converter + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag without original classes of the book + + """ + for tag in chapter_tag.find_all(recursive=True): + if tag.attrs.get("class") \ + and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): + del tag.attrs["class"] + + def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: + """ + Function finalise processing/cleaning content + Parameters + ---------- + title_str: str + + content_tag: Tag, soup object + + remove_title_from_chapter: bool + + Steps + ---------- + 1. comments removal + 2. wrap NavigableString with tag

    + 3-6. wrap tags with

    + replace tags with correspond LiveCarta tags + unwrap tags + insert tags into correspond tags + 7. heading removal + 8. process_tables + 9. class removal + + Returns + ------- + content_tag: str + prepared content + + """ + # 1. remove comments + self._remove_comments(content_tag) + # 2. + self._wrap_strings_with_p(content_tag) + # 3-6. + for dict in self.preset: + func = self.name2function[dict["preset_name"]] + func(content_tag, dict['rules']) + # 7. + if remove_title_from_chapter: + self._remove_headings_content(content_tag, title_str) + # 8. + self._process_tables(content_tag) + # 9. remove classes that weren't created by converter + self._class_removing(content_tag) + return str(content_tag) diff --git a/src/preset_processor.py b/src/preset_processor.py new file mode 100644 index 0000000..a1cbb93 --- /dev/null +++ b/src/preset_processor.py @@ -0,0 +1,15 @@ +import json + + +from src.util.helpers import BookLogger + + +class PresetProcessor: + def __init__(self, preset_path="config/presets.json", logger=None): + self.preset_path = preset_path + self.logger: BookLogger = logger + + def get_preset_json(self): + f = open(self.preset_path) + data = json.load(f) + return data From 1926377a34939cc4de649cb56e36f0cd066cd7e8 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 8 Jul 2022 18:35:34 +0300 Subject: [PATCH 31/55] rewrite css_processor.py --- .../{css_preprocessor.py => css_processor.py} | 45 ++++++-- src/epub_converter/epub_converter.py | 103 +++++++----------- src/epub_converter/epub_solver.py | 10 +- 3 files changed, 78 insertions(+), 80 deletions(-) rename src/epub_converter/{css_preprocessor.py => css_processor.py} (79%) diff --git a/src/epub_converter/css_preprocessor.py b/src/epub_converter/css_processor.py similarity index 79% rename from src/epub_converter/css_preprocessor.py rename to src/epub_converter/css_processor.py index 57c0388..0caad25 100644 --- a/src/epub_converter/css_preprocessor.py +++ b/src/epub_converter/css_processor.py @@ -1,14 +1,14 
@@ import re import cssutils +from bs4 import BeautifulSoup +from os.path import dirname, normpath, join -from src.util.helpers import BookLogger from src.util.color_reader import str2hex from src.livecarta_config import LiveCartaConfig class CSSPreprocessor: - def __init__(self, logger=None): - self.logger: BookLogger = logger + def __init__(self): """ Dictionary LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function } @@ -99,12 +99,8 @@ class CSSPreprocessor: size_value: str """ - if len(size_value.split(" ")) == 3: - size_value = self.convert_tag_style_values(size_value.split( - " ")[-2], True) # returns middle value - else: - size_value = self.convert_tag_style_values(size_value.split( - " ")[-1], True) # returns last value + size_value = self.convert_tag_style_values(size_value.split(" ")[-2], True) if len(size_value.split(" ")) == 3\ + else self.convert_tag_style_values(size_value.split(" ")[-1], True) return size_value @staticmethod @@ -152,10 +148,37 @@ class CSSPreprocessor: style = "; ".join(split_style) return style + def process_inline_styles_in_html_soup(self, html_href2html_body_soup): + """This function is designed to convert inline html styles""" + for html_href in html_href2html_body_soup: + html_content: BeautifulSoup = html_href2html_body_soup[html_href] + tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, + attrs={"style": re.compile(".*")}) + + for tag_initial_inline_style in tags_with_inline_style: + inline_style = tag_initial_inline_style.attrs["style"] + tag_initial_inline_style.attrs["style"] = \ + self.build_inline_style_content(inline_style) + + @staticmethod + def get_css_content(css_href, html_href, ebooklib_book): + path_to_css_from_html = css_href + html_folder = dirname(html_href) + path_to_css_from_root = normpath( + join(html_folder, path_to_css_from_html)).replace("\\", "/") + css_obj = ebooklib_book.get_item_with_href(path_to_css_from_root) + # if in css file we import 
another css + if "@import" in str(css_obj.content): + path_to_css_from_root = "css/" + \ + re.search("'(.*)'", str(css_obj.content)).group(1) + css_obj = ebooklib_book.get_item_with_href( + path_to_css_from_root) + assert css_obj, f"Css style {css_href} was not in manifest." + css_content: str = css_obj.get_content().decode() + return css_content + def update_css_styles_to_livecarta_convention(self, css_rule: cssutils.css.CSSStyleRule, style_type: cssutils.css.property.Property): - if style_type.name == "font-family": - pass if style_type.name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: # property not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = "" diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 525fad3..a301b5b 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -2,7 +2,6 @@ import re import json import codecs import os -from os.path import dirname, normpath, join from itertools import chain from premailer import transform from collections import defaultdict @@ -15,8 +14,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger from src.preset_processor import PresetProcessor -from src.epub_converter.css_preprocessor import CSSPreprocessor -from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor +from src.epub_converter.css_processor import CSSPreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.livecarta_config import LiveCartaConfig from src.data_objects import ChapterItem, NavPoint from src.epub_converter.image_processing import update_images_src_links @@ -25,18 +24,18 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso class EpubConverter: - def __init__(self, file_path, access=None, logger=None, css_preprocessor=None, html_processor=None): + def __init__(self, file_path, access=None, logger=None, 
css_processor=None, html_processor=None): self.file_path = file_path self.access = access self.logger: BookLogger = logger self.ebooklib_book = epub.read_epub(file_path) - self.css_processor = css_preprocessor - self.html_preprocessor = html_processor + self.css_processor = css_processor + self.html_processor = html_processor # main container for all epub .xhtml files self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # enumerate all subchapter id for each file - self.html_href2subchapter_ids = defaultdict(list) + self.html_href2subchapters_ids = defaultdict(list) self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC # toc tree structure stored as adj.list (NavPoint to list of NavPoints) @@ -71,17 +70,18 @@ class EpubConverter: self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content() - self.logger.log("Process CSS inline styles.") - self.process_inline_styles_in_html_soup() + self.logger.log("CSS inline style processing.") + self.css_processor.process_inline_styles_in_html_soup(self.html_href2html_body_soup) self.logger.log("CSS files processing.") self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations() - self.logger.log("CSS styles adding.") + self.logger.log("CSS styles fusion(inline+file).") self.add_css_styles_to_html_soup() self.logger.log("Footnotes processing.") for href in self.html_href2html_body_soup: self.footnotes_contents, self.noterefs, self.footnotes =\ - preprocess_footnotes(self.html_href2html_body_soup[href], self.html_href2html_body_soup) + preprocess_footnotes( + self.html_href2html_body_soup[href], self.html_href2html_body_soup) self.logger.log(f"Added {len(self.footnotes_contents)} footnotes.") self.logger.log("TOC processing.") @@ -115,34 +115,6 @@ class EpubConverter: nodes[item.file_name] = soup return nodes - def get_css_content(self, css_href, html_href): - path_to_css_from_html = css_href - html_folder = dirname(html_href) - 
path_to_css_from_root = normpath( - join(html_folder, path_to_css_from_html)).replace("\\", "/") - css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root) - # if in css file we import another css - if "@import" in str(css_obj.content): - path_to_css_from_root = "css/" + \ - re.search("'(.*)'", str(css_obj.content)).group(1) - css_obj = self.ebooklib_book.get_item_with_href( - path_to_css_from_root) - assert css_obj, f"Css style {css_href} was not in manifest." - css_content: str = css_obj.get_content().decode() - return css_content - - def process_inline_styles_in_html_soup(self): - """This function is designed to convert inline html styles""" - for html_href in self.html_href2html_body_soup: - html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - tags_with_inline_style = html_content.find_all(LiveCartaConfig.could_have_style_in_livecarta_regexp, - attrs={"style": re.compile(".*")}) - - for tag_initial_inline_style in tags_with_inline_style: - inline_style = tag_initial_inline_style.attrs["style"] - tag_initial_inline_style.attrs["style"] = \ - self.css_processor.build_inline_style_content(inline_style) - def build_html_and_css_relations(self) -> tuple[dict, dict]: """ Function is designed to get 2 dictionaries: @@ -174,7 +146,7 @@ class EpubConverter: if css_href not in css_href2css_content: # css_href not in css_href2css_content, add to this dict css_href2css_content[css_href] = self.css_processor.build_css_file_content( - self.get_css_content(css_href, html_href)) + self.css_processor.get_css_content(css_href, html_href, self.ebooklib_book)) for i, tag in enumerate(soup_html_content.find_all("style")): css_content = tag.string @@ -183,7 +155,8 @@ class EpubConverter: css_content) return html_href2css_href, css_href2css_content - def convert_html_soup_with_css_style(self, html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup: + @staticmethod + def modify_html_soup_with_css_styles(html_soup: BeautifulSoup, css_text: str) -> 
BeautifulSoup: """ Function adds styles from .css to inline style. Parameters @@ -224,7 +197,10 @@ class EpubConverter: """ This function is designed to update html_href2html_body_soup - add to html_inline_style css_style_content - + Returns + ------- + None + updated soups with styles from css """ for html_href in self.html_href2html_body_soup: if self.html_href2css_href.get(html_href): @@ -232,7 +208,8 @@ class EpubConverter: for css_href in self.html_href2css_href[html_href]: css += self.css_href2css_content[css_href] html_content: BeautifulSoup = self.html_href2html_body_soup[html_href] - html_content = self.convert_html_soup_with_css_style(html_content, css) + html_content = self.modify_html_soup_with_css_styles( + html_content, css) self.html_href2html_body_soup[html_href] = html_content def build_adjacency_list_from_toc(self, element: [Link, tuple, list], lvl=0): @@ -259,7 +236,7 @@ class EpubConverter: nav_point = NavPoint(element) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.html_href2subchapter_ids[nav_point.href].append( + self.html_href2subchapters_ids[nav_point.href].append( nav_point.id) self.adjacency_list[nav_point] = None self.hrefs_added_to_toc.add(nav_point.href) @@ -271,7 +248,7 @@ class EpubConverter: nav_point = NavPoint(first) if nav_point.id: self.id_anchor_exist_in_nav_points = True - self.html_href2subchapter_ids[nav_point.href].append( + self.html_href2subchapters_ids[nav_point.href].append( nav_point.id) sub_nodes = [] @@ -357,25 +334,19 @@ class EpubConverter: for html_href in self.html_href2html_body_soup: chapter_tag = self.html_href2html_body_soup[html_href] # check marks for chapter starting are on the same level - 1st - marks = chapter_tag.find_all(attrs={"class": "converter-chapter-mark"}) + marks = chapter_tag.find_all( + attrs={"class": "converter-chapter-mark"}) # fix marks to be on 1 level for mark in marks: while mark.parent != chapter_tag: - mark.parent.unwrap() # todo warning! 
could reflect on formatting/internal links in some cases + # todo warning! could reflect on formatting/internal links in some cases + mark.parent.unwrap() @staticmethod def create_unique_id(href, id_): return re.sub(r"([^\w\s])|_|-", "", href) + re.sub(r"[_-]", "0", id_) - @staticmethod - def create_new_anchor_span(soup, id_): - new_anchor_span = soup.new_tag("span") - new_anchor_span.attrs["id"] = id_ - new_anchor_span.attrs["class"] = "link-anchor" - new_anchor_span.string = "\xa0" - return new_anchor_span - def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> [None, str]: """ Function used to find full path to file that is parsed from tag link @@ -414,6 +385,14 @@ class EpubConverter: return full_path[0] + @staticmethod + def create_new_anchor_span(soup, id_): + new_anchor_span = soup.new_tag("span") + new_anchor_span.attrs["id"] = id_ + new_anchor_span.attrs["class"] = "link-anchor" + new_anchor_span.string = "\xa0" + return new_anchor_span + def process_internal_links(self): """ Function @@ -520,8 +499,7 @@ class EpubConverter: @staticmethod def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: """ - After processing on a first_id that corresponds to current chapter, - from initial html_soup all tags from current chapter are extracted + Get tags between LiveCarta chapter marks Parameters ---------- first_id: str @@ -553,7 +531,6 @@ class EpubConverter: # save them in list for next steps tags = [tag.extract() for tag in tags] html_soup.smooth() - else: assert 0, f"Warning: no match for {first_id, href}" @@ -594,7 +571,7 @@ class EpubConverter: for sub_node in self.adjacency_list[nav_point]: self.detect_one_chapter(sub_node) - def define_chapters_content(self): + def define_chapters_with_content(self): """Function build chapters content, starts from top level chapters""" top_level_nav_points = self.adjacency_list[-1] if self.id_anchor_exist_in_nav_points: @@ -618,11 
+595,9 @@ class EpubConverter: """ title = nav_point.title - if nav_point.id: - content: BeautifulSoup = self.href_chapter_id2soup_html[( - nav_point.href, nav_point.id)] - else: - content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] + content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \ + if nav_point.id else self.html_href2html_body_soup[nav_point.href] + self.book_image_src_path2aws_path = update_images_src_links(content, self.img_href2img_bytes, path_to_html=nav_point.href, diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index c1bb800..e0cfef6 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,7 +1,7 @@ from src.book_solver import BookSolver from src.preset_processor import PresetProcessor -from src.epub_converter.css_preprocessor import CSSPreprocessor -from src.epub_converter.html_epub_preprocessor import HtmlEpubPreprocessor +from src.epub_converter.css_processor import CSSPreprocessor +from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.epub_converter.epub_converter import EpubConverter @@ -30,10 +30,10 @@ class EpubBook(BookSolver): """ preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\ .get_preset_json() - css_preprocessor = CSSPreprocessor(logger=self.logger_object) - html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) + css_processor = CSSPreprocessor() + html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) json_converter = EpubConverter( self.file_path, access=self.access, logger=self.logger_object, - css_preprocessor=css_preprocessor, html_processor=html_preprocessor) + css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() return content_dict From 7d5c1bfdf2b08dc52ec0c5c84007b96707168eb1 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 8 Jul 2022 
18:36:09 +0300 Subject: [PATCH 32/55] comment duplicate_styles_check cause of transform --- ...preprocessor.py => html_epub_processor.py} | 1 - .../tag_inline_style_processor.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) rename src/epub_converter/{html_epub_preprocessor.py => html_epub_processor.py} (99%) diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_processor.py similarity index 99% rename from src/epub_converter/html_epub_preprocessor.py rename to src/epub_converter/html_epub_processor.py index 3ddc532..8193285 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_processor.py @@ -229,7 +229,6 @@ class HtmlEpubPreprocessor: Chapter Tag with inserted tags """ - def insert(tag, tag_to_insert): # insert all items that was in tag to subtag and remove from tag for content in reversed(tag.contents): diff --git a/src/epub_converter/tag_inline_style_processor.py b/src/epub_converter/tag_inline_style_processor.py index c4e0b45..30d7e50 100644 --- a/src/epub_converter/tag_inline_style_processor.py +++ b/src/epub_converter/tag_inline_style_processor.py @@ -48,15 +48,18 @@ class TagInlineStyleProcessor: style_ = style_.replace("color:white;", "") return style_ - @staticmethod - def duplicate_styles_check(split_style: list) -> list: - style_name2style_value = {} - for list_item in split_style: - key, val = list_item.split(":") - if val not in style_name2style_value.keys(): - style_name2style_value[key] = val - split_style = [k + ":" + v for k, v in style_name2style_value.items()] - return split_style + # @staticmethod + # def duplicate_styles_check(split_style: list) -> list: + # style_name2style_value = {} + # # {key: val for for list_item in split_style} + # splitstrs = (list_item.split(":") for list_item in split_style) + # d = {key: val for key, val in splitstrs} + # for list_item in split_style: + # key, val = list_item.split(":") + # if key not in 
style_name2style_value.keys(): + # style_name2style_value[key] = val + # split_style = [k + ":" + v for k, v in style_name2style_value.items()] + # return split_style @staticmethod def indents_processing(split_style: list) -> str: @@ -130,16 +133,13 @@ class TagInlineStyleProcessor: inline_style, self.tag_inline_style) inline_style = inline_style.replace( "list-style-image", "list-style-type") - # 2. Create list of styles from inline style # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", inline_style) # when we split style by ";", last element of the list is "" - None (remove it) split_inline_style: list = list(filter(None, style.split(";"))) - # 3. Duplicate styles check - if the tag had duplicate styles - split_inline_style = self.duplicate_styles_check(split_inline_style) - + # split_inline_style = self.duplicate_styles_check(split_inline_style) # 4. Processing indents inline_style: str = self.indents_processing(split_inline_style) return inline_style From 5036445c057b1e7d0fdebc3e10f78844065768f9 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 8 Jul 2022 18:37:17 +0300 Subject: [PATCH 33/55] rewrite process internal links --- src/epub_converter/epub_converter.py | 218 +++++++++++++-------------- 1 file changed, 105 insertions(+), 113 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index a301b5b..d3a623a 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -94,13 +94,15 @@ class EpubConverter: self.logger.log(f"Html documents not added to TOC: {not_added}.") self.logger.log(f"Add documents not added to TOC.") self.add_not_added_files_to_adjacency_list(not_added) - self.logger.log(f"Html internal links and structure processing.") - self.label_chapters_ids_with_lc_id() - self.chapter_marks_are_same_level() - # used only after parsed toc, ids from toc needed + self.logger.log(f"Label subchapters with converter tag.") + 
self.label_subchapters_with_lc_tag() + self.logger.log(f"Process html internal links.") self.process_internal_links() + self.logger.log( + f"Check if converter-chapter-marks are on the same level.") + self.chapter_marks_are_same_level() self.logger.log(f"Define chapters content.") - self.define_chapters_content() + self.define_chapters_with_content() self.logger.log(f"Converting html_nodes to LiveCarta chapter items.") def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: @@ -286,14 +288,14 @@ class EpubConverter: return True return False - def build_manifest_id2html_href(self) -> dict: - links = dict() - for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): - links[item.id] = item.file_name - return links - def build_adjacency_list_from_spine(self): - manifest_id2html_href = self.build_manifest_id2html_href() + def build_manifest_id2html_href() -> dict: + links = dict() + for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + links[item.id] = item.file_name + return links + + manifest_id2html_href = build_manifest_id2html_href() self.adjacency_list = { -1: [] } @@ -311,16 +313,16 @@ class EpubConverter: self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(file) - def label_chapters_ids_with_lc_id(self): + def label_subchapters_with_lc_tag(self): for html_href in self.html_href2html_body_soup: - ids = self.html_href2subchapter_ids[html_href] + ids, soup = self.html_href2subchapters_ids[html_href], \ + self.html_href2html_body_soup[html_href] for i in ids: - soup = self.html_href2html_body_soup[html_href] tag = soup.find(id=i) - new_h = soup.new_tag("tmp") - new_h.attrs["class"] = "converter-chapter-mark" - new_h.attrs["id"] = i - tag.insert_before(new_h) + tmp_tag = soup.new_tag("lc_tmp") + tmp_tag.attrs["class"] = "converter-chapter-mark" + tmp_tag.attrs["id"] = i + tag.insert_before(tmp_tag) def chapter_marks_are_same_level(self): """ @@ -401,8 +403,8 @@ class EpubConverter: Steps ---------- 1. 
rebuild ids to be unique in all documents - 2a. process anchor which is a whole xhtml file - 2b. process anchor which is an element in xhtml file + 2a. process anchor which is a whole htm|html|xhtml file + 2b. process anchor which is an element in htm|html|xhtml file Returns ------- @@ -410,91 +412,80 @@ class EpubConverter: process links in html """ - # 1. rebuild ids to be unique in all documents - for toc_href in self.hrefs_added_to_toc: - for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): - if tag.attrs.get("class") == "converter-chapter-mark": - continue + def make_ids_unique(): + for toc_href in self.hrefs_added_to_toc: + for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): + if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]: + new_id = self.create_unique_id(toc_href, tag.attrs["id"]) + tag.attrs["id"] = new_id - if tag.attrs.get("class") == "footnote-element": - continue + def process_file_anchor(): + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] + for internal_link_tag in soup.find_all("a", + {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}): + a_tag_href = internal_link_tag.attrs["href"] + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) + if a_tag_href_matched_to_toc: + new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + if new_id not in self.internal_anchors: + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] + new_anchor_span = self.create_new_anchor_span(soup, new_id) + # insert a new span to the beginning of the file + anchor_soup.insert(0, new_anchor_span) + self.internal_anchors.add(new_id) + del internal_link_tag.attrs["href"] - new_id = self.create_unique_id(toc_href, tag.attrs["id"]) - tag.attrs["id"] = 
new_id + def process_file_element_anchor(): + for toc_href in self.hrefs_added_to_toc: + soup = self.html_href2html_body_soup[toc_href] + # process_file_element_anchor + for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}): + a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#") + a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( + toc_href, a_tag_href, internal_link_tag) if a_tag_href \ + else os.path.normpath(toc_href).replace("\\", "/") + if a_tag_href_matched_to_toc: + new_id = self.create_unique_id( + a_tag_href_matched_to_toc, a_tag_id) - # 2a. process anchor which is a whole xhtml file - internal_link_reg1 = re.compile( - r"(^(?!https?://).+\.(htm|html|xhtml)$)") - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all("a", {"href": internal_link_reg1}): - a_tag_href = internal_link_tag.attrs["href"] - # find full path - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( - toc_href, a_tag_href, internal_link_tag) - if not a_tag_href_matched_to_toc: - continue - new_id = self.create_unique_id(a_tag_href_matched_to_toc, "") - internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" - if new_id not in self.internal_anchors: - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - new_anchor_span = self.create_new_anchor_span(soup, new_id) - # insert a new span to the beginning of the file - anchor_soup.insert(0, new_anchor_span) - self.internal_anchors.add(new_id) + anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] + anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \ + anchor_soup.find_all(attrs={"id": a_tag_id}) # if link is a footnote + if anchor_tags: + if len(anchor_tags) > 1: + self.logger.log(f"Warning in {toc_href}: multiple anchors:" + f"{len(anchor_tags)} found.\n" + f"{anchor_tags}\n" + f"While processing 
{internal_link_tag}") - del internal_link_tag.attrs["href"] - - # 2b. process anchor which is an element in xhtml file - internal_link_reg2 = re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)") - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all("a", {"href": internal_link_reg2}): - a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split( - "#") - # find full path - if a_tag_href: - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, - internal_link_tag) - else: - a_tag_href_matched_to_toc = os.path.normpath( - toc_href).replace("\\", "/") - - if not a_tag_href_matched_to_toc: - continue - - new_id = self.create_unique_id( - a_tag_href_matched_to_toc, a_tag_id) - - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={"id": new_id, }) - anchor_tags = anchor_tags or anchor_soup.find_all( - attrs={"id": a_tag_id}) # if link is a footnote - - if anchor_tags: - if len(anchor_tags) > 1: - self.logger.log(f"Warning in {toc_href}: multiple anchors: {len(anchor_tags)} found.\n" - f"{anchor_tags}\n" - f" While processing {internal_link_tag}") - - anchor_tag = anchor_tags[0] - assert anchor_tag.attrs["id"] in [new_id, a_tag_id] - # if anchor is found we could add placeholder for link creation on server side. - internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" - # create span to have cyclic links, link has 1 type of class, anchor another - if anchor_tag.attrs["id"] not in self.internal_anchors: - new_anchor_span = self.create_new_anchor_span( - soup, new_id) - anchor_tag.insert_before(new_anchor_span) - self.internal_anchors.add(new_id) - del anchor_tag.attrs["id"] - del internal_link_tag.attrs["href"] - - else: - internal_link_tag.attrs["converter-mark"] = "bad-link" - self.logger.log(f"Error in {toc_href}. While processing {internal_link_tag} no anchor found." 
- f" Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file." - f" Old id={a_tag_id}") + anchor_tag = anchor_tags[0] + assert anchor_tag.attrs["id"] in [new_id, a_tag_id] + # if anchor is found we could add placeholder for link creation on server side. + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + # create span to have cyclic links, link has 1 type of class, anchor another + if anchor_tag.attrs["id"] not in self.internal_anchors: + new_anchor_span = self.create_new_anchor_span( + soup, new_id) + anchor_tag.insert_before(new_anchor_span) + self.internal_anchors.add(new_id) + del anchor_tag.attrs["id"] + del internal_link_tag.attrs["href"] + else: + internal_link_tag.attrs["converter-mark"] = "bad-link" + self.logger.log(f"Error in {toc_href}." + f" While processing {internal_link_tag} no anchor found." + f" Should be anchor with new id={new_id} in" + f" {a_tag_href_matched_to_toc} file." + f" Old id={a_tag_id}") + # 1. make ids to be unique in all documents + make_ids_unique() + # 2a. process anchor which is a whole htm|html|xhtml file + process_file_anchor() + # 2b. 
process anchor which is an element in htm|html|xhtml file + process_file_element_anchor() @staticmethod def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list: @@ -606,10 +597,14 @@ class EpubConverter: book_id=self.file_path.stem if hasattr(self.file_path, "stem") else "book_id") + indent = " " * lvl + self.logger.log(indent + f"Chapter: {title} is processing.") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS - title_preprocessed = self.html_preprocessor.prepare_title(title) - content_preprocessed = self.html_preprocessor.prepare_content(title_preprocessed, content, - remove_title_from_chapter=is_chapter) + self.logger.log(indent + "Process title.") + title_preprocessed = self.html_processor.prepare_title(title) + self.logger.log(indent + "Process content.") + content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content, + remove_title_from_chapter=is_chapter) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -618,10 +613,6 @@ class EpubConverter: sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) - - if self.logger: - indent = " " * lvl - self.logger.log(f"{indent}Chapter: {title} is prepared.") return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) def convert_to_dict(self) -> dict: @@ -644,17 +635,18 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/Modern_Java_in_Action.epub" + epub_file_path = "../../epub/9781641050234.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\ .get_preset_json() - css_preprocessor = CSSPreprocessor(logger=logger_object) - html_preprocessor = HtmlEpubPreprocessor(preset=preset, logger=logger_object) + css_processor = CSSPreprocessor() + html_processor = HtmlEpubPreprocessor( + 
preset=preset, logger=logger_object) json_converter = EpubConverter(epub_file_path, logger=logger_object, - css_preprocessor=css_preprocessor, html_processor=html_preprocessor) + css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() with codecs.open(epub_file_path.replace("epub", "json"), "w", encoding="utf-8") as f_json: From a5f7a9b36cb91ca6c27c6f670b084ba4afdfb8c6 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Tue, 12 Jul 2022 11:08:45 +0300 Subject: [PATCH 34/55] add presets to git --- config/.gitignore | 1 + config/presets.json | 95 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 config/presets.json diff --git a/config/.gitignore b/config/.gitignore index d6b7ef3..3208467 100644 --- a/config/.gitignore +++ b/config/.gitignore @@ -1,2 +1,3 @@ * +!presets.json !.gitignore diff --git a/config/presets.json b/config/presets.json new file mode 100644 index 0000000..497d29a --- /dev/null +++ b/config/presets.json @@ -0,0 +1,95 @@ +[ + { + "preset_name": "table_wrapper", + "rules": [ + { + "tags": ["div"], + "attrs": [ + { + "name": "width", + "value": ".*" + }, + { + "name": "border", + "value": ".*" + }, + { + "name": "bgcolor", + "value": ".*" + } + ] + }, + { + "tags": ["section", "blockquote"], + "attrs": [ + { + "name": "class", + "value": "feature[1234]" + } + ] + } + ] + }, + { + "preset_name": "replacer", + "rules": [ + { + "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"], + "condition": null, + "tag_to_replace": "p" + }, + { + "tags": ["^aside$"], + "condition": null, + "tag_to_replace": "blockquote" + }, + { + "tags": ["^header$", "^footer$"], + "condition": null, + "tag_to_replace": "span" + }, + { + "tags": ["^code$", "^kbd$", "^var$"], + "condition": { + "parent_tags": ":not(pre)", + "child_tags": null, + "attrs": null + }, + "tag_to_replace": "span" + }, + { + "tags": ["^b$"], + "condition": null, + "tag_to_replace": "strong" + } + ] + }, + { + 
"preset_name": "unwrapper", + "rules": { + "tags": [ + "section", + "article", + "figcaption", + "main", + "body", + "html", + "li > p" + ] + } + }, + { + "preset_name": "inserter", + "rules": [ + { + "tags": ["pre"], + "condition": { + "parent_tags": null, + "child_tags": ":not(code, kbd, var)", + "attrs": null + }, + "tag_to_insert": "code" + } + ] + } +] \ No newline at end of file From e1f06ba8844605df4ef3494300f62f8f506c276b Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 12:39:48 +0300 Subject: [PATCH 35/55] Add concrete book_id for imgs on Local --- src/epub_converter/epub_converter.py | 21 ++++++++++----------- src/epub_converter/image_processing.py | 4 +--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index d3a623a..e050791 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -1,15 +1,15 @@ import re import json import codecs -import os +import ebooklib +from ebooklib import epub +from ebooklib.epub import Link, Section +from os import path +from pathlib import Path from itertools import chain from premailer import transform from collections import defaultdict from typing import Dict, Union, List - -import ebooklib -from ebooklib import epub -from ebooklib.epub import Link, Section from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger @@ -370,8 +370,8 @@ class EpubConverter: prepared content """ - dir_name = os.path.dirname(cur_file_path) - normed_path = os.path.normpath(os.path.join( + dir_name = path.dirname(cur_file_path) + normed_path = path.normpath(path.join( dir_name, href_in_link)).replace("\\", "/") full_path = [ path for path in self.hrefs_added_to_toc if normed_path in path] @@ -446,7 +446,7 @@ class EpubConverter: a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split("#") a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( toc_href, 
a_tag_href, internal_link_tag) if a_tag_href \ - else os.path.normpath(toc_href).replace("\\", "/") + else path.normpath(toc_href).replace("\\", "/") if a_tag_href_matched_to_toc: new_id = self.create_unique_id( a_tag_href_matched_to_toc, a_tag_id) @@ -594,8 +594,7 @@ class EpubConverter: path_to_html=nav_point.href, access=self.access, path2aws_path=self.book_image_src_path2aws_path, - book_id=self.file_path.stem - if hasattr(self.file_path, "stem") else "book_id") + book_id=Path(self.file_path).stem) indent = " " * lvl self.logger.log(indent + f"Chapter: {title} is processing.") @@ -635,7 +634,7 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9781641050234.epub" + epub_file_path = "../../epub/9781614382264.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index aefa24d..be0246e 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -1,6 +1,5 @@ import os import pathlib - from bs4 import BeautifulSoup from src.access import Access @@ -35,7 +34,6 @@ def update_images_src_links(body_tag: BeautifulSoup, book_id: str = None) -> dict: """Function makes dictionary image_src_path -> Amazon web service_path""" img_tags = body_tag.find_all("img") - for img in img_tags: path_to_img_from_html = img.attrs.get("src") html_folder = os.path.dirname(path_to_html) @@ -55,7 +53,7 @@ def update_images_src_links(body_tag: BeautifulSoup, path2aws_path[path_to_img_from_root] = new_folder else: new_folder = save_image_locally( - path_to_img_from_root, img_content, "book_id") + path_to_img_from_root, img_content, book_id) img.attrs["src"] = str(new_folder) if img.attrs.get("width"): From 55372561cd6d2ba79160abad30dcf83090b2354b Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:09:33 +0300 Subject: [PATCH 36/55] Change processing of Section|Part Chapters --- 
src/epub_converter/epub_converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index e050791..f779f98 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -255,7 +255,7 @@ class EpubConverter: sub_nodes = [] for elem in second: - if ("section" in first.title.lower() or "part" in first.title.lower()) and lvl == 1: + if (bool(re.search('^section$|^part$', first.title.lower()))) and lvl == 1: self.offset_sub_nodes.append( self.build_adjacency_list_from_toc(elem, lvl)) else: @@ -634,7 +634,7 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9781614382264.epub" + epub_file_path = "../../epub/9780763774134.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) From 8624a0a77684958474775df1e60cf07eec57c37b Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:10:23 +0300 Subject: [PATCH 37/55] ' -> " in import css file --- src/epub_converter/css_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/epub_converter/css_processor.py b/src/epub_converter/css_processor.py index 0caad25..fbb2f99 100644 --- a/src/epub_converter/css_processor.py +++ b/src/epub_converter/css_processor.py @@ -106,6 +106,7 @@ class CSSPreprocessor: @staticmethod def style_conditions(style_value, style_name): cleaned_value = style_value.replace("\"", "") + # cleaned_value = style_value.replace("+", "%2B") constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( style_name) value_not_in_possible_values_list = cleaned_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ @@ -170,7 +171,7 @@ class CSSPreprocessor: # if in css file we import another css if "@import" in str(css_obj.content): path_to_css_from_root = "css/" + \ - re.search("'(.*)'", str(css_obj.content)).group(1) + re.search('"(.*)"', str(css_obj.content)).group(1) css_obj = 
ebooklib_book.get_item_with_href( path_to_css_from_root) assert css_obj, f"Css style {css_href} was not in manifest." @@ -193,6 +194,7 @@ class CSSPreprocessor: if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] + print(cleaned_value) css_rule.style[style_type.name] = func(cleaned_value) def build_css_file_content(self, css_content: str) -> str: From ca229dc6b792a4db5ebeb45303287fabbbe5f29a Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:11:33 +0300 Subject: [PATCH 38/55] Rewrite _insert_tags cause of mutable tag --- src/epub_converter/html_epub_processor.py | 24 +++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index 8193285..7a6e476 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -123,24 +123,22 @@ class HtmlEpubPreprocessor: """ - def _wrap_tag_with_table(chapter_tag, tag_to_be_wrapped, width="100", border="", bg_color=None): + def _wrap_tag_with_table(width="100", border="", bg_color=None): table = chapter_tag.new_tag("table") table.attrs["border"], table.attrs["align"], table.attrs["style"] \ = border, "center", f"width:{width}%;" tbody, tr, td = \ chapter_tag.new_tag("tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") td.attrs["bgcolor"] = bg_color - tag_to_be_wrapped.wrap(td) + tag_to_wrap.wrap(td) td.wrap(tr) tr.wrap(tbody) tbody.wrap(table) table.insert_after(BeautifulSoup(features="lxml").new_tag("br")) return table - def process_tag_using_table(tag_to_wrap): + def process_tag_using_table(): _wrap_tag_with_table( - chapter_tag, - tag_to_be_wrapped=tag_to_wrap, width=tag_to_wrap.attrs["width"] if tag_to_wrap.attrs.get("width") else "100", border=tag_to_wrap.attrs["border"] if tag_to_wrap.attrs.get("border") else None, bg_color=tag_to_wrap.attrs["bgcolor"] 
if tag_to_wrap.attrs.get("bgcolor") else None) @@ -152,7 +150,7 @@ class HtmlEpubPreprocessor: for attr in rule["attrs"]: for tag_to_wrap in chapter_tag.find_all([re.compile(tag) for tag in tags], {attr["name"]: re.compile(fr"{attr['value']}")}): - process_tag_using_table(tag_to_wrap) + process_tag_using_table() @staticmethod def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list): @@ -229,7 +227,9 @@ class HtmlEpubPreprocessor: Chapter Tag with inserted tags """ - def insert(tag, tag_to_insert): + def insert(tag): + tag_to_insert = \ + chapter_tag.new_tag(rule["tag_to_insert"]) # insert all items that was in tag to subtag and remove from tag for content in reversed(tag.contents): tag_to_insert.insert(0, content.extract()) @@ -238,26 +238,24 @@ class HtmlEpubPreprocessor: for rule in rules: tags = rule["tags"] - tag_to_insert = \ - chapter_tag.new_tag(rule["tag_to_insert"]) if rule["condition"]: for condition_on_tag in ((k, v) for k, v in rule["condition"].items() if v): if condition_on_tag[0] == 'parent_tags': for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): if tag.parent.select(condition_on_tag[1]): - insert(tag, tag_to_insert) + insert(tag) elif condition_on_tag[0] == 'child_tags': for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): if not tag.select(re.sub('[():]|not', '', condition_on_tag[1])): - insert(tag, tag_to_insert) + insert(tag) elif condition_on_tag[0] == "attrs": for attr in rule["condition"]["attrs"]: for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], {attr["name"]: re.compile(fr"{attr['value']}")}): - insert(tag, tag_to_insert) + insert(tag) else: for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): - insert(tag, tag_to_insert) + insert(tag) def _remove_headings_content(self, content_tag, title_of_chapter: str): """ From 7b35d8a7c2f63e07f3873d05e351f2b51b67020e Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:12:22 +0300 Subject: [PATCH 39/55] Optimize speed of 
image_processing.py --- src/docx_converter/image_processing.py | 52 ++++++++++++-------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py index e593312..dfd413b 100644 --- a/src/docx_converter/image_processing.py +++ b/src/docx_converter/image_processing.py @@ -3,36 +3,32 @@ import pathlib from shutil import copyfile -def process_images(body_tag, access, html_path, book_id): +def process_images(access, html_path, book_id, body_tag): """ - Function to process tag. Img should be sent Amazon S3 and then return new tag with valid link. + Function to process tag. + Img should be sent Amazon S3 and then return new tag with valid link. For now images are moved to one folder. + """ img_tags = body_tag.find_all('img') + for img in img_tags: + img_name = img.attrs.get('src') + # quick fix for bad links + if (len(img_name) >= 3) and img_name[:3] == '../': + img_name = img_name[3:] + img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') - if len(img_tags): - if access is None: - folder_path = os.path.dirname( - os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - for img in img_tags: - img_name = img.attrs.get('src') - # quick fix for bad links - if (len(img_name) >= 3) and img_name[:3] == '../': - img_name = img_name[3:] - - img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') - - if access is not None: - link = access.send_image(img_path, doc_id=book_id) - img.attrs['src'] = link - else: - img_size = os.path.getsize(img_path) - new_img_path = new_path / img_name - copyfile(img_path, new_img_path) - img.attrs["src"] = str(new_img_path) - - return img_tags \ No newline at end of file + if access is not None: + link = access.send_image(img_path, doc_id=book_id) + img.attrs['src'] = link + else: + if img_tags.index(img) == 0: + folder_path = os.path.dirname( 
+ os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f'../json/img_{book_id}/')) + new_path.mkdir(exist_ok=True) + new_img_path = new_path / img_name + copyfile(img_path, new_img_path) + img.attrs["src"] = str(new_img_path) + return img_tags From b1ccd796c9cbd6f580ac4e9cf50ac679f757b3dc Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:13:34 +0300 Subject: [PATCH 40/55] Set up local docx_converter --- src/docx_converter/docx_solver.py | 11 +++++++---- src/epub_converter/footnotes_processing.py | 4 +--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index b4aa9b3..9f1735b 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -35,7 +35,7 @@ class DocxBook(BookSolver): """ # 1. Converts docx to html with LibreOffice html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, - self.logger_object, self.status_wrapper, self.libre_locker) + self.logger_object, self.libre_locker) # TODO presets # 2. Parses and cleans html, gets list of tags, gets footnotes @@ -46,7 +46,7 @@ class DocxBook(BookSolver): # 3. 
Parses from line structure to nested structure with JSONConverter json_converter = LibreHTML2JSONConverter(bs_tags, footnotes, top_level_headers, - self.logger_object, self.status_wrapper) + self.logger_object) content_dict = json_converter.convert_to_dict() return content_dict @@ -56,12 +56,15 @@ if __name__ == "__main__": docx_file_path = '../../docx/music_inquiry.docx' logger_object = BookLogger( name='docx', book_id=docx_file_path.split('/')[-1]) + locker = Event() + locker.set() - html_converter = Docx2LibreHTML(file_path=docx_file_path) + html_converter = Docx2LibreHTML(file_path=docx_file_path, + logger=logger_object, libre_locker=locker) parser = HTMLDocxPreprocessor(html_converter.html_soup, logger_object) content, footnotes, top_level_headers = parser.process_html( - html_converter.html_path) + html_path=html_converter.html_path, book_id=html_converter.book_id) json_converter = LibreHTML2JSONConverter( content, footnotes, top_level_headers, logger_object) diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index ae568e0..f82f073 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -26,15 +26,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note """ - footnotes = [] + footnotes, new_noterefs_tags, new_footnotes_tags = [], [], [] noterefs_tags = source_html_tag.find_all( attrs={noteref_attr_name: "noteref"}) bad_noterefs_tags = set( [tag for tag in noterefs_tags if not tag.attrs.get("href")]) noterefs_tags = [ tag for tag in noterefs_tags if tag not in bad_noterefs_tags] - new_noterefs_tags = [] - new_footnotes_tags = [] [tag.decompose() for tag in bad_noterefs_tags] def parse_a_tag_href(s: str) -> Tuple[str, str]: From a6a54abb0a595b0bd698e296a44d1fc94a3b7fe0 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:13:59 +0300 Subject: [PATCH 41/55] Optimize docx2libre_html.py --- 
src/docx_converter/docx2libre_html.py | 102 ++++++++++++-------------- 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py index 889aa25..fbb24fe 100644 --- a/src/docx_converter/docx2libre_html.py +++ b/src/docx_converter/docx2libre_html.py @@ -10,12 +10,12 @@ from src.util.helpers import BookLogger class Docx2LibreHTML: - def __init__(self, book_id=0, file_path=None, access=None, logger=None, status_wrapper=None, libre_locker=None): - self.book_id = book_id + def __init__(self, book_id=0, file_path=None, access=None, logger=None, libre_locker=None): + self.book_id = book_id if book_id != 0 else pathlib.Path( + file_path).stem self.file_path = file_path self.access = access self.logger_object: BookLogger = logger - self.status_wrapper: status_wrapper = status_wrapper # critical section for occupying libreoffice by one thread self.libre_locker: Event() = libre_locker @@ -24,15 +24,15 @@ class Docx2LibreHTML: self.html_soup = self.read_html(self.html_path) def _libre_run(self, out_dir_path): - command = ['libreoffice', '--headless', - '--convert-to', 'html', f'{str(self.file_path)}', - '--outdir', f'{out_dir_path}'] + command = ["libreoffice", "--headless", + "--convert-to", "html", f"{str(self.file_path)}", + "--outdir", f"{out_dir_path}"] print(command) result = subprocess.run(command, stdout=PIPE, stderr=PIPE) - self.logger_object.log(f'Result of libre conversion for book_{self.book_id}:' - f' {result.returncode}, {result.stdout}', logging.DEBUG) - self.logger_object.log(f'Any error while libre conversion for book_' - f'{self.book_id}: {result.stderr}', logging.DEBUG) + self.logger_object.log(f"Result of libre conversion for book_{self.book_id}:" + f" {result.returncode}, {result.stdout}", logging.DEBUG) + self.logger_object.log(f"Any error while libre conversion for book_" + f"{self.book_id}: {result.stderr}", logging.DEBUG) def convert_docx_to_html(self): """ @@ -48,82 
+48,74 @@ class Docx2LibreHTML: path to html file, file appears after libre-conversion """ - self.logger_object.log(f'File - {self.file_path}.') - print(f'{self.file_path}') - self.logger_object.log('Beginning of conversion from .docx to .html.') + def get_and_clear_flag(out_dir_path: str): + self.libre_locker.clear() + self.logger_object.log(f"Got flag!", logging.DEBUG) + self._libre_run(out_dir_path) + self.libre_locker.set() + self.logger_object.log("Cleared flag...", logging.DEBUG) - try: - f = open(self.file_path) - f.close() - except FileNotFoundError as error: - self.logger_object.log( - 'Invalid path to input data.', logging.ERROR) - self.status_wrapper.set_error() - raise error + def check_file_exists(path, error_string: str): + try: + f = open(path) + f.close() + except FileNotFoundError as error: + self.logger_object.log( + error_string, logging.ERROR) + self.logger_object.log_error_to_main_log() + raise error + + self.logger_object.log(f"File - {self.file_path}.") + print(f"{self.file_path}") + self.logger_object.log("Beginning of conversion from .docx to .html.") + + check_file_exists( + self.file_path, error_string="Invalid path to input data.") folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - out_dir_path = os.path.join(folder_path, f'../html/{self.book_id}') + out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}") pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) - is_book_converted = False try: if self.libre_locker.isSet(): - self.libre_locker.clear() - self.logger_object.log('Got flag...', logging.DEBUG) - self._libre_run(out_dir_path) - self.libre_locker.set() - self.logger_object.log('Cleared flag...', logging.DEBUG) - + get_and_clear_flag(out_dir_path) else: - while not self.libre_locker.isSet() and not is_book_converted: + while not self.libre_locker.isSet(): self.logger_object.log( - 'Waiting for libre...', logging.DEBUG) + "Waiting for libre...", logging.DEBUG) flag = 
self.libre_locker.wait(50) if flag: if self.libre_locker.isSet(): - self.libre_locker.clear() - self.logger_object.log(f'Got flag!', logging.DEBUG) - self._libre_run(out_dir_path) - self.libre_locker.set() + get_and_clear_flag(out_dir_path) break - except Exception as exc: self.logger_object.log( "Conversion has gone wrong. Libreoffice is not installed.", logging.ERROR) self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() raise exc - out_dir_path = os.path.join(out_dir_path, f'{self.book_id}.html') + out_dir_path = os.path.join(out_dir_path, f"{self.book_id}.html") html_path = pathlib.Path(out_dir_path) - try: - f = open(html_path) - f.close() - except FileNotFoundError as exc: - self.logger_object.log( - "Conversion has gone wrong. HTML file doesn't exist.", logging.ERROR) - self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() - raise exc + check_file_exists( + html_path, error_string="Conversion has gone wrong. HTML file doesn't exist.") - self.logger_object.log('End of conversion from .docx to .html.') + self.logger_object.log("End of conversion from .docx to .html.") self.logger_object.log( - f'Input file path after conversion: {html_path}.') + f"Input file path after conversion: {html_path}.") return html_path def read_html(self, html_path): """Method for reading .html file into beautiful soup tag.""" try: - html_text = open(html_path, 'r', encoding='utf8').read() - self.logger_object.log('HTML for book has been loaded.') + html_text = open(html_path, "r", encoding="utf8").read() + self.logger_object.log("HTML for book has been loaded.") except FileNotFoundError as exc: - self.logger_object.log('There is no html to process.' - 'Conversion went wrong or you specified wrong paths.', logging.ERROR) + self.logger_object.log("There is no html to process." 
+ "Conversion went wrong or you specified wrong paths.", logging.ERROR) self.logger_object.log_error_to_main_log() - self.status_wrapper.set_error() raise exc - html_soup = BeautifulSoup(html_text, features='lxml') + html_soup = BeautifulSoup(html_text, features="lxml") return html_soup From 16a86907386e137e1aa1d75682426be7a8b33c19 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 14 Jul 2022 19:14:49 +0300 Subject: [PATCH 42/55] small refactoring --- src/docx_converter/html_docx_preprocessor.py | 30 +++++++++----------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index c264d17..046166f 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -173,11 +173,8 @@ class HTMLDocxPreprocessor: margin_bottom = re.search( r'margin-bottom: ([\d.]{1,4})in', style) else: - indent = None - margin_left = None - margin_right = None - margin_top = None - margin_bottom = None + indent = margin_left = margin_right = \ + margin_top = margin_bottom = None if margin_left and margin_right and margin_top and margin_bottom and \ margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ @@ -328,23 +325,23 @@ class HTMLDocxPreprocessor: "a", {'name': re.compile(r'^_Toc\d+')}) headers = [link.parent for link in toc_links] outline_level = "1" # All the unknown outlines will be predicted as

    - for tag in headers: - if re.search(r"^h\d$", tag.name): - tag.a.unwrap() + for h_tag in headers: + if re.search(r"^h\d$", h_tag.name): + h_tag.a.unwrap() # outline_level = tag.name[-1] # TODO: add prediction of the outline level - elif tag.name == "p": - exist_in_toc = self._check_parent_link_exist_in_toc(tag) - if tag in self.body_tag.find_all("p") and exist_in_toc: + elif h_tag.name == "p": + exist_in_toc = self._check_parent_link_exist_in_toc(h_tag) + if h_tag in self.body_tag.find_all("p") and exist_in_toc: new_tag = BeautifulSoup( features="lxml").new_tag("h" + outline_level) - text = tag.text - tag.replaceWith(new_tag) + text = h_tag.text + h_tag.replaceWith(new_tag) new_tag.string = text else: # rethink document structure when you have toc_links, other cases? self.logger_object.log(f'Something went wrong in processing toc_links.' f' Check the structure of the file. ' - f'Tag name: {tag.name}') + f'Tag name: {h_tag.name}') @staticmethod def clean_title_from_numbering(title: str): @@ -568,7 +565,7 @@ class HTMLDocxPreprocessor: ind = self.content.index(toc_tag) + 1 self.content = self.content[ind:] - def process_html(self, access=None, html_path='', book_id='local'): + def process_html(self, access=None, html_path='', book_id=0): """Process html code to satisfy LiveCarta formatting.""" self.logger_object.log('Beginning of processing .html file.') try: @@ -600,7 +597,8 @@ class HTMLDocxPreprocessor: f'{len(self.footnotes)} footnotes have been processed.') self.logger_object.log('Image processing.') - self.images = process_images(self.body_tag, access=access, html_path=html_path, book_id=book_id) + self.images = process_images(access=access, html_path=html_path, + book_id=book_id, body_tag=self.body_tag) self.logger_object.log( f'{len(self.images)} images have been processed.') From 20fa1bfa86f4b70e62775a517fb6f37ef4055ee0 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Fri, 15 Jul 2022 14:18:53 +0300 Subject: [PATCH 43/55] Clear font-family value of regex 
characters --- src/epub_converter/css_processor.py | 37 +++++++++++++---------- src/epub_converter/epub_converter.py | 2 +- src/epub_converter/html_epub_processor.py | 24 +++++++-------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/epub_converter/css_processor.py b/src/epub_converter/css_processor.py index fbb2f99..2be0dab 100644 --- a/src/epub_converter/css_processor.py +++ b/src/epub_converter/css_processor.py @@ -37,6 +37,7 @@ class CSSPreprocessor: "margin-left": self.convert_indents_tag_values, "margin-top": self.convert_tag_style_values, "margin": self.convert_indents_tag_values, + "width": self.convert_tag_style_values, } @staticmethod @@ -104,16 +105,23 @@ class CSSPreprocessor: return size_value @staticmethod - def style_conditions(style_value, style_name): + def clean_value(style_value: str, style_name: str): cleaned_value = style_value.replace("\"", "") - # cleaned_value = style_value.replace("+", "%2B") + if style_name == 'font-family': + for symbol in ["+", "*", ".", "%", "?", "$", "^", "[", "]"]: + cleaned_value = re.sub( + re.escape(f"{symbol}"), rf"\\{symbol}", cleaned_value) + return cleaned_value + + @staticmethod + def style_conditions(style_value: str, style_name: str) -> tuple[bool, bool]: constraints_on_value = LiveCartaConfig.LIVECARTA_STYLE_ATTRS.get( style_name) - value_not_in_possible_values_list = cleaned_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ + value_not_in_possible_values_list = style_value not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS[ style_name] - return cleaned_value, constraints_on_value, value_not_in_possible_values_list + return constraints_on_value, value_not_in_possible_values_list - def update_inline_styles_to_livecarta_convention(self, split_style: list): + def update_inline_styles_to_livecarta_convention(self, split_style: list) -> list: for i, style in enumerate(split_style): style_name, style_value = style.split(":") if style_name not in LiveCartaConfig.LIVECARTA_STYLE_ATTRS: @@ -121,9 
+129,8 @@ class CSSPreprocessor: split_style[i] = "" return split_style - cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ - self.style_conditions(style_value, style_name) - if constraints_on_value and value_not_in_possible_values_list: + cleaned_value = self.clean_value(style_value, style_name) + if all(self.style_conditions(cleaned_value, style_name)): # there are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file split_style[i] = "" else: @@ -138,18 +145,18 @@ class CSSPreprocessor: """Build inline style with LiveCarta convention""" # replace all spaces between "; & letter" to ";" style = re.sub(r"; *", ";", style) - # when we split style by ";", last element of the list is "" - None - # remove it + # when we split style by ";", last element of the list is "" - None (we remove it) split_style: list = list(filter(None, style.split(";"))) # replace all spaces between ": & letter" to ":" split_style = [el.replace( re.search(r"(:\s*)", el).group(1), ":") for el in split_style] - split_style = self.update_inline_styles_to_livecarta_convention(split_style) + split_style = self.update_inline_styles_to_livecarta_convention( + split_style) style = "; ".join(split_style) return style - def process_inline_styles_in_html_soup(self, html_href2html_body_soup): + def process_inline_styles_in_html_soup(self, html_href2html_body_soup: dict): """This function is designed to convert inline html styles""" for html_href in html_href2html_body_soup: html_content: BeautifulSoup = html_href2html_body_soup[html_href] @@ -185,16 +192,14 @@ class CSSPreprocessor: css_rule.style[style_type.name] = "" return - cleaned_value, constraints_on_value, value_not_in_possible_values_list =\ - self.style_conditions(style_type.value, style_type.name) - if constraints_on_value and value_not_in_possible_values_list: + cleaned_value = self.clean_value(style_type.value, style_type.name) + if all(self.style_conditions(cleaned_value, style_type.name)): # there 
are constraints + value not in LIVECARTA_STYLE_ATTRS, remove from css file css_rule.style[style_type.name] = "" else: if style_type.name in self.LIVECARTA_STYLE_ATTRS_MAPPING: # function that converts our data func = self.LIVECARTA_STYLE_ATTRS_MAPPING[style_type.name] - print(cleaned_value) css_rule.style[style_type.name] = func(cleaned_value) def build_css_file_content(self, css_content: str) -> str: diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index f779f98..7e8ab8a 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -305,7 +305,7 @@ class EpubConverter: self.adjacency_list[-1].append(nav_point) self.hrefs_added_to_toc.add(nav_point.href) - def add_not_added_files_to_adjacency_list(self, not_added): + def add_not_added_files_to_adjacency_list(self, not_added: list): """Function add files that not added to adjacency list""" for i, file in enumerate(not_added): nav_point = NavPoint( diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index 7a6e476..aba8811 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -66,7 +66,7 @@ class HtmlEpubPreprocessor: return title @staticmethod - def _remove_comments(chapter_tag): + def _remove_comments(chapter_tag: BeautifulSoup): """ Function remove comments Parameters @@ -85,7 +85,7 @@ class HtmlEpubPreprocessor: element.extract() @staticmethod - def _wrap_strings_with_p(chapter_tag): + def _wrap_strings_with_p(chapter_tag: BeautifulSoup): """ Function converts headings that aren't supported by LiveCarta with

    Parameters @@ -108,7 +108,7 @@ class HtmlEpubPreprocessor: p_tag.append(str(node)) node.replace_with(p_tag) - def _wrap_tags_with_table(self, chapter_tag, rules: list): + def _wrap_tags_with_table(self, chapter_tag: BeautifulSoup, rules: list): """ Function wraps with

    Parameters @@ -153,7 +153,7 @@ class HtmlEpubPreprocessor: process_tag_using_table() @staticmethod - def _tags_to_correspond_livecarta_tag(chapter_tag, rules: list): + def _tags_to_correspond_livecarta_tag(chapter_tag: BeautifulSoup, rules: list): """ Function to replace all tags to correspond LiveCarta tags Parameters @@ -190,7 +190,7 @@ class HtmlEpubPreprocessor: # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) tag.name = tag_to_replace - def _unwrap_tags(self, chapter_tag, rules: dict): + def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict): """ Function unwrap tags and moves id to span Parameters @@ -213,7 +213,7 @@ class HtmlEpubPreprocessor: tag.unwrap() @staticmethod - def _insert_tags_into_correspond_tags(chapter_tag, rules: list): + def _insert_tags_into_correspond_tags(chapter_tag: BeautifulSoup, rules: list): """ Function inserts tags into correspond tags Parameters @@ -257,14 +257,14 @@ class HtmlEpubPreprocessor: for tag in chapter_tag.find_all([re.compile(tag) for tag in tags]): insert(tag) - def _remove_headings_content(self, content_tag, title_of_chapter: str): + def _remove_headings_content(self, chapter_tag, title_of_chapter: str): """ Function - cleans/removes headings from chapter in order to avoid duplication of chapter titles in the content - adds span with id in order to Parameters ---------- - content_tag: soup object + chapter_tag: soup object Tag of the page title_of_chapter: str Chapter title @@ -276,15 +276,15 @@ class HtmlEpubPreprocessor: """ title_of_chapter = title_of_chapter.lower() - for tag in content_tag.contents: + for tag in chapter_tag.contents: text = tag if isinstance(tag, NavigableString) else tag.text if re.sub(r"[\s\xa0]", "", text): text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces if title_of_chapter == text or \ (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or content_tag.name)): - self._add_span_to_save_ids_for_links(tag, content_tag) + re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): + self._add_span_to_save_ids_for_links(tag, chapter_tag) tag.extract() return elif not isinstance(tag, NavigableString): @@ -329,7 +329,7 @@ class HtmlEpubPreprocessor: table.attrs["border"] = "1" @staticmethod - def _class_removing(chapter_tag): + def _class_removing(chapter_tag: BeautifulSoup): """ Function removes classes that aren't created by converter 
Parameters From 4f7aa69ab3f445abb5b171fc926eeb24e4a06958 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 20 Jul 2022 15:44:28 +0300 Subject: [PATCH 44/55] Heading removal fix --- src/epub_converter/html_epub_processor.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index aba8811..752c4ac 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -276,20 +276,25 @@ class HtmlEpubPreprocessor: """ title_of_chapter = title_of_chapter.lower() + if title_of_chapter == "chapter 1": + pass for tag in chapter_tag.contents: text = tag if isinstance(tag, NavigableString) else tag.text if re.sub(r"[\s\xa0]", "", text): text = re.sub(r"[\s\xa0]", " ", text).lower() text = text.strip() # delete extra spaces - if title_of_chapter == text or \ - (title_of_chapter in text and - re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): - self._add_span_to_save_ids_for_links(tag, chapter_tag) + if not isinstance(tag, NavigableString): + if title_of_chapter == text or \ + (title_of_chapter in text and + re.findall(r"^h[1-3]$", tag.name or chapter_tag.name)): + self._add_span_to_save_ids_for_links(tag, chapter_tag) + tag.extract() + return + elif not self._remove_headings_content(tag, title_of_chapter): + break + else: tag.extract() return - elif not isinstance(tag, NavigableString): - if not self._remove_headings_content(tag, title_of_chapter): - break @staticmethod def _process_tables(chapter_tag: BeautifulSoup): From ea4dd77155131c5dcf75e92d251c96ece8cd507f Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 20 Jul 2022 15:45:44 +0300 Subject: [PATCH 45/55] Add attr replacer & svg -> img --- src/epub_converter/epub_converter.py | 16 ++++++------ src/epub_converter/html_epub_processor.py | 31 ++++++++++++++++++++--- src/epub_converter/image_processing.py | 6 ++--- 3 files changed, 39 insertions(+), 14 deletions(-) 
diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 7e8ab8a..f2c3232 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -589,13 +589,6 @@ class EpubConverter: content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] \ if nav_point.id else self.html_href2html_body_soup[nav_point.href] - self.book_image_src_path2aws_path = update_images_src_links(content, - self.img_href2img_bytes, - path_to_html=nav_point.href, - access=self.access, - path2aws_path=self.book_image_src_path2aws_path, - book_id=Path(self.file_path).stem) - indent = " " * lvl self.logger.log(indent + f"Chapter: {title} is processing.") is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS @@ -604,6 +597,13 @@ class EpubConverter: self.logger.log(indent + "Process content.") content_preprocessed = self.html_processor.prepare_content(title_preprocessed, content, remove_title_from_chapter=is_chapter) + + self.book_image_src_path2aws_path = update_images_src_links(content_preprocessed, + self.img_href2img_bytes, + path_to_html=nav_point.href, + access=self.access, + path2aws_path=self.book_image_src_path2aws_path, + book_id=Path(self.file_path).stem) sub_nodes = [] # warning! 
not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -612,7 +612,7 @@ class EpubConverter: sub_chapter_item = self.html_node_to_livecarta_chapter_item( sub_node, lvl + 1) sub_nodes.append(sub_chapter_item) - return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) + return ChapterItem(title_preprocessed, str(content_preprocessed), sub_nodes) def convert_to_dict(self) -> dict: """Function which convert list of html nodes to appropriate json structure""" diff --git a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index 752c4ac..0df4908 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -11,6 +11,7 @@ class HtmlEpubPreprocessor: self.name2function = { "table_wrapper": self._wrap_tags_with_table, "replacer": self._tags_to_correspond_livecarta_tag, + "attr_replacer": self._replace_attrs_in_tags, "unwrapper": self._unwrap_tags, "inserter": self._insert_tags_into_correspond_tags } @@ -190,6 +191,30 @@ class HtmlEpubPreprocessor: # todo can cause appearance of \n

    ...

    ->

    \n

    ...

    \n

    (section) tag.name = tag_to_replace + @staticmethod + def _replace_attrs_in_tags(chapter_tag: BeautifulSoup, rules: list): + """ + Function to replace all tags to correspond LiveCarta tags + Parameters + ---------- + chapter_tag: BeautifulSoup + Tag & contents of the chapter tag + + Returns + ------- + None + Chapter Tag with all tags replaced with LiveCarta tags + + """ + for rule in rules: + attr = rule["attr"] + tags = rule["condition"]["tags"] + attr_to_replace = rule["attr_to_replace"] + for tag in chapter_tag.find_all([re.compile(tag) for tag in tags], + {attr: re.compile(r".*")}): + tag[attr_to_replace] = tag[attr] + del tag[attr] + def _unwrap_tags(self, chapter_tag: BeautifulSoup, rules: dict): """ Function unwrap tags and moves id to span @@ -353,7 +378,7 @@ class HtmlEpubPreprocessor: and (tag.attrs.get("class") not in ["link-anchor", "footnote-element"]): del tag.attrs["class"] - def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: + def prepare_content(self, title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> Tag: """ Function finalise processing/cleaning content Parameters @@ -378,7 +403,7 @@ class HtmlEpubPreprocessor: Returns ------- - content_tag: str + content_tag: Tag prepared content """ @@ -397,4 +422,4 @@ class HtmlEpubPreprocessor: self._process_tables(content_tag) # 9. 
remove classes that weren't created by converter self._class_removing(content_tag) - return str(content_tag) + return content_tag diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index be0246e..e568aaa 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -27,7 +27,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, + img_href2img_content: dict, path_to_html: str, access=None, path2aws_path: dict = None, @@ -40,10 +40,10 @@ def update_images_src_links(body_tag: BeautifulSoup, path_to_img_from_root = os.path.normpath(os.path.join( html_folder, path_to_img_from_html)).replace("\\", "/") - assert path_to_img_from_root in href2img_content, \ + assert path_to_img_from_root in img_href2img_content, \ f"Image {path_to_img_from_html} in file {path_to_html} was not added to manifest." - img_content = href2img_content[path_to_img_from_root] + img_content = img_href2img_content[path_to_img_from_root] if access is not None: if path_to_img_from_root in path2aws_path: new_folder = path2aws_path[path_to_img_from_root] From 618d57767d4e39763d69045241a51f1a38b8e71b Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 20 Jul 2022 15:47:37 +0300 Subject: [PATCH 46/55] Change presets.json (add attr_replacer) --- config/presets.json | 162 ++++++++++++++++++++++++-------------------- 1 file changed, 90 insertions(+), 72 deletions(-) diff --git a/config/presets.json b/config/presets.json index 497d29a..7272038 100644 --- a/config/presets.json +++ b/config/presets.json @@ -2,94 +2,112 @@ { "preset_name": "table_wrapper", "rules": [ - { - "tags": ["div"], - "attrs": [ - { - "name": "width", - "value": ".*" - }, - { - "name": "border", - "value": ".*" - }, - { - "name": "bgcolor", - "value": ".*" - } - ] - }, - { - "tags": ["section", "blockquote"], - "attrs": [ - { - "name": "class", - "value": 
"feature[1234]" - } - ] - } - ] + { + "tags": ["div"], + "attrs": [ + { + "name": "width", + "value": ".*" + }, + { + "name": "border", + "value": ".*" + }, + { + "name": "bgcolor", + "value": ".*" + } + ] + }, + { + "tags": ["section", "blockquote"], + "attrs": [ + { + "name": "class", + "value": "feature[1234]" + } + ] + } + ] }, { "preset_name": "replacer", "rules": [ - { - "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"], - "condition": null, - "tag_to_replace": "p" - }, - { - "tags": ["^aside$"], - "condition": null, - "tag_to_replace": "blockquote" - }, - { - "tags": ["^header$", "^footer$"], - "condition": null, - "tag_to_replace": "span" - }, - { - "tags": ["^code$", "^kbd$", "^var$"], - "condition": { - "parent_tags": ":not(pre)", - "child_tags": null, - "attrs": null + { + "tags": ["^h[6-9]$", "^figure$", "^section$", "^div$"], + "condition": null, + "tag_to_replace": "p" }, - "tag_to_replace": "span" - }, - { - "tags": ["^b$"], - "condition": null, - "tag_to_replace": "strong" - } - ] + { + "tags": ["^aside$"], + "condition": null, + "tag_to_replace": "blockquote" + }, + { + "tags": ["^header$", "^footer$"], + "condition": null, + "tag_to_replace": "span" + }, + { + "tags": ["^code$", "^kbd$", "^var$"], + "condition": { + "parent_tags": ":not(pre)", + "child_tags": null, + "attrs": null + }, + "tag_to_replace": "span" + }, + { + "tags": ["^b$"], + "condition": null, + "tag_to_replace": "strong" + }, + { + "tags": ["^image$"], + "condition": null, + "tag_to_replace": "img" + } + ] + }, + { + "preset_name": "attr_replacer", + "rules": [ + { + "attr": "xlink:href", + "condition": { + "tags": ["img"] + }, + "attr_to_replace": "src" + } + ] }, { "preset_name": "unwrapper", "rules": { "tags": [ - "section", - "article", - "figcaption", - "main", - "body", - "html", - "li > p" + "section", + "article", + "figcaption", + "main", + "body", + "html", + "svg", + "li > p" ] } }, { "preset_name": "inserter", "rules": [ - { - "tags": ["pre"], - "condition": { - 
"parent_tags": null, - "child_tags": ":not(code, kbd, var)", - "attrs": null - }, - "tag_to_insert": "code" - } + { + "tags": ["pre"], + "condition": { + "parent_tags": null, + "child_tags": ":not(code, kbd, var)", + "attrs": null + }, + "tag_to_insert": "code" + } ] } ] \ No newline at end of file From 32a54f0e4e0987d3029362103796b663cc25fd5a Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:12:50 +0300 Subject: [PATCH 47/55] Changes in gitignores --- config/.gitignore | 1 - docx/.gitignore | 2 -- epub/.gitignore | 2 -- html/.gitignore | 2 -- json/.gitignore | 2 -- 5 files changed, 9 deletions(-) delete mode 100644 docx/.gitignore delete mode 100644 epub/.gitignore delete mode 100644 html/.gitignore delete mode 100644 json/.gitignore diff --git a/config/.gitignore b/config/.gitignore index 3208467..d6b7ef3 100644 --- a/config/.gitignore +++ b/config/.gitignore @@ -1,3 +1,2 @@ * -!presets.json !.gitignore diff --git a/docx/.gitignore b/docx/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/docx/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/epub/.gitignore b/epub/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/epub/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/html/.gitignore b/html/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/html/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/json/.gitignore b/json/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/json/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore From 84b692d39b8741dcc100bfd3a56bd75890746db2 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:19:48 +0300 Subject: [PATCH 48/55] Add preset processing from backend --- {config => presets}/presets.json | 0 src/book_solver.py | 94 ++++++++++++++--------- src/docx_converter/image_processing.py | 2 +- src/epub_converter/epub_converter.py | 14 ++-- src/epub_converter/epub_solver.py | 7 +- 
src/epub_converter/html_epub_processor.py | 5 +- src/preset_processor.py | 15 ---- 7 files changed, 69 insertions(+), 68 deletions(-) rename {config => presets}/presets.json (100%) delete mode 100644 src/preset_processor.py diff --git a/config/presets.json b/presets/presets.json similarity index 100% rename from config/presets.json rename to presets/presets.json diff --git a/src/book_solver.py b/src/book_solver.py index c45af0f..10af671 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -24,9 +24,10 @@ class BookSolver: self.book_type = None self.book_id = book_id self.access = access - self.file_path = None # path to book file, appears after downloading from server - self.output_path = None # path to json file - self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', + self.preset_path = None + self.book_path = None # path to book file, appears after downloading from server + self.book_output_path = None # path to json file + self.logger_object = BookLogger(name=f"{__name__}_{self.book_id}", book_id=book_id, main_logger=main_logger) self.status_wrapper = BookStatusWrapper( @@ -35,9 +36,9 @@ class BookSolver: assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." 
- def save_book_file(self, content: bytes): + def save_file(self, content: bytes, path_to_save, file_type): """ - Function saves binary content of file to .docx/.epub + Function saves binary content of file to folder(path_to_save) Parameters ---------- content: bytes str @@ -47,80 +48,100 @@ class BookSolver: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.join( - folder_path, f'{self.book_type}/{self.book_id}') + folder_path, path_to_save) pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) file_path = os.path.join( - folder_path, f'{self.book_id}.{self.book_type}') + folder_path, f"{self.book_id}.{file_type}") try: - with open(file_path, 'wb+') as file: + with open(file_path, "wb+") as file: file.write(content) - self.logger_object.log(f'File was saved to folder: {folder_path}.') + self.logger_object.log( + f"Preset file was saved to folder: {folder_path}.") except Exception as exc: self.logger_object.log( f"Error in writing {self.book_type} file.", logging.ERROR) self.logger_object.log_error_to_main_log() raise exc + return file_path - self.file_path = pathlib.Path(file_path) + def get_preset_file(self): + """Method for getting and saving preset from server""" + try: + self.logger_object.log(f"Start receiving preset file from server. 
URL:" + f" {self.access.url}/doc-convert/{self.book_id}/presets") + content = self.access.get_file( + file_path=f"{self.access.url}/doc-convert/{self.book_id}/presets") + self.logger_object.log("Preset file was received from server.") + self.preset_path = pathlib.Path( + str(self.save_file(content, path_to_save="presets", file_type="json"))) + except FileNotFoundError as f_err: + self.logger_object.log( + "Can't get preset file from server.", logging.ERROR) + self.logger_object.log_error_to_main_log() + raise f_err + except Exception as exc: + raise exc def get_book_file(self): """Method for getting and saving book from server""" try: - self.logger_object.log(f'Start receiving file from server. URL:' - f' {self.access.url}/doc-convert/{self.book_id}/file') - content = self.access.get_book(self.book_id) - self.logger_object.log('File was received from server.') - self.save_book_file(content) + self.logger_object.log(f"Start receiving book file from server. URL:" + f" {self.access.url}/doc-convert/{self.book_id}/file") + content = self.access.get_file( + file_path=f"{self.access.url}/doc-convert/{self.book_id}/file") + self.logger_object.log("Book file was received from server.") + self.book_path = pathlib.Path(self.save_file( + content, path_to_save=f"books/{self.book_type}", file_type=self.book_type)) except FileNotFoundError as f_err: self.logger_object.log( - "Can't get file from server.", logging.ERROR) + "Can't get book file from server.", logging.ERROR) self.logger_object.log_error_to_main_log() raise f_err except Exception as exc: raise exc def check_output_directory(self): - if self.output_path is None: + if self.book_output_path is None: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) output_path = os.path.join( - folder_path, f'json/{self.book_id}.json') - self.output_path = output_path + folder_path, f"books/json/{self.book_id}.json") + self.book_output_path = output_path - self.output_path = pathlib.Path(self.output_path) - 
self.logger_object.log(f'Output file path: {self.output_path}') + self.book_output_path = pathlib.Path(self.book_output_path) + self.logger_object.log(f"Output file path: {self.book_output_path}") - pathlib.Path(self.output_path).parent.mkdir( + pathlib.Path(self.book_output_path).parent.mkdir( parents=True, exist_ok=True) - self.output_path.touch(exist_ok=True) + self.book_output_path.touch(exist_ok=True) def write_to_json(self, content: dict): self.check_output_directory() try: - with codecs.open(self.output_path, 'w', encoding='utf-8') as f: + with codecs.open(self.book_output_path, "w", encoding="utf-8") as f: json.dump(content, f, ensure_ascii=False) self.logger_object.log( - f'Data has been saved to .json file: {self.output_path}') + f"Data has been saved to .json file: {self.book_output_path}") except Exception as exc: self.logger_object.log( - 'Error has occurred while writing .json file.' + str(exc), logging.ERROR) + "Error has occurred while writing .json file." + str(exc), logging.ERROR) def send_json_content_to_server(self, content: dict): """Function sends json_content to site""" try: self.access.send_book(self.book_id, content) - self.logger_object.log(f'JSON data has been sent to server.') + self.logger_object.log(f"JSON data has been sent to server.") except Exception as exc: self.logger_object.log( - 'Error has occurred while sending json content.', logging.ERROR) + "Error has occurred while sending json content.", logging.ERROR) self.logger_object.log_error_to_main_log() self.status_wrapper.set_error() raise exc @abstractmethod def get_converted_book(self): - self.logger_object.log('Beginning of processing .json output.') + self.logger_object.log("Beginning of processing .json output.") self.status_wrapper.set_generating() return {} @@ -133,7 +154,8 @@ class BookSolver: """ try: self.logger_object.log( - f'Beginning of conversion from .{self.book_type} to .json.') + f"Beginning of conversion from .{self.book_type} to .json.") + 
self.get_preset_file() self.get_book_file() self.status_wrapper.set_processing() content_dict = self.get_converted_book() @@ -141,11 +163,11 @@ class BookSolver: self.write_to_json(content_dict) self.send_json_content_to_server(content_dict) self.logger_object.log( - f'End of the conversion to LiveCarta format. Check {self.output_path}.') + f"End of the conversion to LiveCarta format. Check {self.book_output_path}.") except Exception as exc: self.status_wrapper.set_error() self.logger_object.log( - 'Error has occurred while conversion.', logging.ERROR) + "Error has occurred while conversion.", logging.ERROR) self.logger_object.log_error_to_main_log(str(exc)) raise exc @@ -158,15 +180,15 @@ class BookSolver: """ try: self.logger_object.log( - f'Data has been downloaded from {file_path} file') + f"Data has been downloaded from {file_path} file") self.status_wrapper.set_processing() - with codecs.open(file_path, 'r', encoding='utf-8') as f_json: + with codecs.open(file_path, "r", encoding="utf-8") as f_json: content_dict = json.load(f_json) self.status_wrapper.set_generating() self.send_json_content_to_server(content_dict) - self.logger_object.log(f'Sent a file to server. Check LiveCarta.') + self.logger_object.log(f"Sent a file to server. Check LiveCarta.") except Exception as exc: self.status_wrapper.set_error() self.logger_object.log( - 'Error has occurred while reading json file.' + str(exc), logging.ERROR) + "Error has occurred while reading json file." 
+ str(exc), logging.ERROR) self.logger_object.log_error_to_main_log(str(exc)) diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py index dfd413b..0eab671 100644 --- a/src/docx_converter/image_processing.py +++ b/src/docx_converter/image_processing.py @@ -26,7 +26,7 @@ def process_images(access, html_path, book_id, body_tag): folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( - folder_path, f'../json/img_{book_id}/')) + folder_path, f'../books/json/img_{book_id}/')) new_path.mkdir(exist_ok=True) new_img_path = new_path / img_name copyfile(img_path, new_img_path) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index f2c3232..4a09481 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -13,7 +13,6 @@ from typing import Dict, Union, List from bs4 import BeautifulSoup, NavigableString, Tag from src.util.helpers import BookLogger -from src.preset_processor import PresetProcessor from src.epub_converter.css_processor import CSSPreprocessor from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.livecarta_config import LiveCartaConfig @@ -24,11 +23,11 @@ from src.epub_converter.tag_inline_style_processor import TagInlineStyleProcesso class EpubConverter: - def __init__(self, file_path, access=None, logger=None, css_processor=None, html_processor=None): - self.file_path = file_path + def __init__(self, book_path, access=None, logger=None, css_processor=None, html_processor=None): + self.book_path = book_path self.access = access self.logger: BookLogger = logger - self.ebooklib_book = epub.read_epub(file_path) + self.ebooklib_book = epub.read_epub(book_path) self.css_processor = css_processor self.html_processor = html_processor @@ -603,7 +602,7 @@ class EpubConverter: path_to_html=nav_point.href, access=self.access, path2aws_path=self.book_image_src_path2aws_path, - 
book_id=Path(self.file_path).stem) + book_id=Path(self.book_path).stem) sub_nodes = [] # warning! not EpubHtmlItems won't be added to chapter # if it doesn't have subchapters @@ -638,11 +637,8 @@ if __name__ == "__main__": logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) - preset = PresetProcessor(preset_path="../../config/presets.json", logger=logger_object)\ - .get_preset_json() css_processor = CSSPreprocessor() - html_processor = HtmlEpubPreprocessor( - preset=preset, logger=logger_object) + html_processor = HtmlEpubPreprocessor("../../presets/presets.json", logger=logger_object) json_converter = EpubConverter(epub_file_path, logger=logger_object, css_processor=css_processor, html_processor=html_processor) diff --git a/src/epub_converter/epub_solver.py b/src/epub_converter/epub_solver.py index e0cfef6..9131eda 100644 --- a/src/epub_converter/epub_solver.py +++ b/src/epub_converter/epub_solver.py @@ -1,5 +1,4 @@ from src.book_solver import BookSolver -from src.preset_processor import PresetProcessor from src.epub_converter.css_processor import CSSPreprocessor from src.epub_converter.html_epub_processor import HtmlEpubPreprocessor from src.epub_converter.epub_converter import EpubConverter @@ -28,12 +27,10 @@ class EpubBook(BookSolver): json for LiveCarta platform """ - preset = PresetProcessor(preset_path="config/presets.json", logger=self.logger_object)\ - .get_preset_json() css_processor = CSSPreprocessor() - html_processor = HtmlEpubPreprocessor(preset=preset, logger=self.logger_object) + html_processor = HtmlEpubPreprocessor(self.preset_path, logger=self.logger_object) json_converter = EpubConverter( - self.file_path, access=self.access, logger=self.logger_object, + self.book_path, access=self.access, logger=self.logger_object, css_processor=css_processor, html_processor=html_processor) content_dict = json_converter.convert_to_dict() return content_dict diff --git a/src/epub_converter/html_epub_processor.py 
b/src/epub_converter/html_epub_processor.py index 0df4908..d8403d1 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -1,12 +1,13 @@ import re +import json from bs4 import BeautifulSoup, NavigableString, Comment, Tag from src.util.helpers import BookLogger class HtmlEpubPreprocessor: - def __init__(self, preset, logger=None): - self.preset = preset + def __init__(self, preset_path, logger=None): + self.preset = json.load(open(preset_path)) self.logger: BookLogger = logger self.name2function = { "table_wrapper": self._wrap_tags_with_table, diff --git a/src/preset_processor.py b/src/preset_processor.py deleted file mode 100644 index a1cbb93..0000000 --- a/src/preset_processor.py +++ /dev/null @@ -1,15 +0,0 @@ -import json - - -from src.util.helpers import BookLogger - - -class PresetProcessor: - def __init__(self, preset_path="config/presets.json", logger=None): - self.preset_path = preset_path - self.logger: BookLogger = logger - - def get_preset_json(self): - f = open(self.preset_path) - data = json.load(f) - return data From 617d4fcaef1f3f8fad7f96cd10880c6e24534dde Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:20:28 +0300 Subject: [PATCH 49/55] Small changes (to work with preset) --- consumer.py | 13 +++++++------ src/access.py | 45 ++++++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/consumer.py b/consumer.py index 2ea307c..095facf 100644 --- a/consumer.py +++ b/consumer.py @@ -77,7 +77,6 @@ def callback(ch, method, properties, body, logger, libre_locker): thread.start() logging.log(logging.INFO, f"Active threads: {active_count()}.") # print(f"Active threads: {active_count()}.") - except Exception as exc: if hasattr(exc, "message"): logger.error(f"{sys.exc_info()[0]}: {exc.message}") @@ -90,15 +89,18 @@ def callback(ch, method, properties, body, logger, libre_locker): def server_run(): logger = configure_file_logger("consumer") + 
channel = None try: folder_path = os.path.dirname(os.path.abspath(__file__)) - config_path = Path(os.path.join(folder_path, "config/queue_config.json")) + config_path = Path(os.path.join( + folder_path, "config/queue_config.json")) with open(config_path, "r") as f: conf_param = json.load(f) - host = conf_param.get("host") or pika.ConnectionParameters().DEFAULT_HOST - port = conf_param.get("port") or pika.ConnectionParameters().DEFAULT_PORT - channel = None + host = conf_param.get( + "host") or pika.ConnectionParameters().DEFAULT_HOST + port = conf_param.get( + "port") or pika.ConnectionParameters().DEFAULT_PORT credentials = pika.PlainCredentials( username=conf_param["username"], password=conf_param["password"]) parameters = pika.ConnectionParameters( @@ -113,7 +115,6 @@ def server_run(): logger.log(logging.ERROR, f"Queue {conf_param['queue']} is not declared.") raise exc - locker = Event() locker.set() channel.basic_consume(queue=conf_param["queue"], diff --git a/src/access.py b/src/access.py index 4367c33..ba8ddb6 100644 --- a/src/access.py +++ b/src/access.py @@ -35,22 +35,6 @@ class Access: self.get_token() self.refreshing.set() - def sleep(timeout: float, retry=3): - def decorator(function): - """Decorator sleeping timeout sec and makes 3 retries""" - def wrapper(*args, **kwargs): - retries = 0 - while retries < retry: - try: - value = function(*args, **kwargs) - if value is not None: - return value - except: - time.sleep(timeout) - retries += 1 - return wrapper - return decorator - def set_credentials(self, url): folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) @@ -60,8 +44,8 @@ class Access: self.refreshing.clear() self.url = url - self.username = params['username'] - self.password = params['password'] + self.username = params["username"] + self.password = params["password"] self.refreshing.set() def format_header(self): @@ -123,14 +107,14 @@ class Access: else: raise Exception(f'{response.status_code}') - def get_book(self, 
book_id): - """Function downloads the book from site""" + def get_file(self, file_path): + """Function downloads the file[book, preset] from site""" if self.is_time_for_refreshing(): self.refresh_token() self.refreshing.wait() response = requests.get( - f'{self.url}/doc-convert/{book_id}/file', headers=self.headers, + file_path, headers=self.headers, # auth=('kiryl.miatselitsa', 'iK4yXCvdyHFEEOvG2v3F') ) @@ -139,11 +123,26 @@ class Access: elif response.status_code == 200: content = response.content else: - raise Exception(f'Error in getting doc from url: {self.url}/doc-convert/{book_id}/file, ' + raise Exception(f'Error in getting preset from url: {file_path}, ' f'status code:{response.status_code}') - return content + def sleep(timeout: float, retry=3): + def decorator(function): + """Decorator sleeping timeout sec and makes 3 retries""" + def wrapper(*args, **kwargs): + retries = 0 + while retries < retry: + try: + value = function(*args, **kwargs) + if value is not None: + return value + except: + time.sleep(timeout) + retries += 1 + return wrapper + return decorator + @sleep(3) def send_image(self, img_path, doc_id, img_content: bytes = None): """Function sends images to site""" From 290ffa346a5d6c732d10930041805821c4bc9fc0 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:20:52 +0300 Subject: [PATCH 50/55] Docx refactoring --- src/docx_converter/docx2libre_html.py | 3 +- src/docx_converter/docx_solver.py | 6 +- src/docx_converter/footnotes_processing.py | 2 +- src/docx_converter/html_docx_preprocessor.py | 397 +++++++++---------- 4 files changed, 182 insertions(+), 226 deletions(-) diff --git a/src/docx_converter/docx2libre_html.py b/src/docx_converter/docx2libre_html.py index fbb24fe..56fe2f7 100644 --- a/src/docx_converter/docx2libre_html.py +++ b/src/docx_converter/docx2libre_html.py @@ -66,7 +66,6 @@ class Docx2LibreHTML: raise error self.logger_object.log(f"File - {self.file_path}.") - print(f"{self.file_path}") 
self.logger_object.log("Beginning of conversion from .docx to .html.") check_file_exists( @@ -74,7 +73,7 @@ class Docx2LibreHTML: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - out_dir_path = os.path.join(folder_path, f"../html/{self.book_id}") + out_dir_path = os.path.join(folder_path, f"../books/html/{self.book_id}") pathlib.Path(out_dir_path).mkdir(parents=True, exist_ok=True) try: diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 9f1735b..6260edb 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -34,9 +34,9 @@ class DocxBook(BookSolver): """ # 1. Converts docx to html with LibreOffice - html_converter = Docx2LibreHTML(self.book_id, self.file_path, self.access, + html_converter = Docx2LibreHTML(self.book_id, self.book_path, self.access, self.logger_object, self.libre_locker) - # TODO presets + # todo presets # 2. Parses and cleans html, gets list of tags, gets footnotes parser = HTMLDocxPreprocessor( @@ -53,7 +53,7 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = '../../docx/music_inquiry.docx' + docx_file_path = '../../books/docx/music_inquiry.docx' logger_object = BookLogger( name='docx', book_id=docx_file_path.split('/')[-1]) locker = Event() diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index beb6d15..c269b73 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -1,7 +1,7 @@ import re from bs4 import BeautifulSoup, NavigableString -@staticmethod + def _clean_footnote_content(content): content = content.strip() return content.strip() diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index 046166f..a44df01 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -11,7 +11,7 @@ from 
src.docx_converter.image_processing import process_images class HTMLDocxPreprocessor: - + def __init__(self, html_soup, logger_object, status_wrapper=None): self.body_tag = html_soup.body self.html_soup = html_soup @@ -20,6 +20,38 @@ class HTMLDocxPreprocessor: self.top_level_headers = None self.content = list() + def _process_toc_links(self): + def _check_parent_link_exist_in_toc(tag_with_link): + toc_links = [] + for a_tag in tag_with_link.find_all("a", {"name": re.compile(r"^_Toc\d+")}): + link_name = a_tag.attrs["name"] + toc_item = self.body_tag.find("a", {"href": "#" + link_name}) + if toc_item: + toc_links.append(toc_item) + return len(toc_links) > 0 + """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" + toc_links = self.body_tag.find_all( + "a", {"name": re.compile(r"^_Toc\d+")}) + headers = [link.parent for link in toc_links] + outline_level = "1" # All the unknown outlines will be predicted as

    + for h_tag in headers: + if re.search(r"^h\d$", h_tag.name): + h_tag.a.unwrap() + # outline_level = tag.name[-1] # TODO: add prediction of the outline level + elif h_tag.name == "p": + exist_in_toc = _check_parent_link_exist_in_toc(h_tag) + if h_tag in self.body_tag.find_all("p") and exist_in_toc: + new_tag = BeautifulSoup( + features="lxml").new_tag("h" + outline_level) + text = h_tag.text + h_tag.replaceWith(new_tag) + new_tag.string = text + else: + # rethink document structure when you have toc_links, other cases? + self.logger_object.log(f"Something went wrong in processing toc_links." + f" Check the structure of the file. " + f"Tag name: {h_tag.name}") + def _clean_tag(self, tag: str, attr_name: str, attr_value: re): # todo regex """ @@ -48,12 +80,12 @@ class HTMLDocxPreprocessor: """Function cleans meaningless tags before links.""" underlines = self.body_tag.find_all("u") for u in underlines: - if u.find_all('a'): + if u.find_all("a"): u.unwrap() - links = self.body_tag.find_all('a') + links = self.body_tag.find_all("a") for link in links: - u = link.find_all('u') + u = link.find_all("u") if u and len(u) == 1: u[0].unwrap() @@ -81,16 +113,12 @@ class HTMLDocxPreprocessor: """ size = re.search(r"font-size: (\d{1,3})pt", style) - if size is None: return style - size = size.group(1) new_size = cls.convert_pt_to_px(size) - if new_size == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: return "" - return re.sub(size + "pt", str(new_size) + "px", style) def _font_to_span(self): @@ -108,10 +136,10 @@ class HTMLDocxPreprocessor: style = self.convert_font_pt_to_px(style) if style != "": if color and color in LiveCartaConfig.COLORS_MAP: - style += f'; color: {color};' + style += f"; color: {color};" font.attrs["style"] = style elif color and color in LiveCartaConfig.COLORS_MAP: - font.attrs["style"] = f'color: {color};' + font.attrs["style"] = f"color: {color};" if len(font.attrs) == 0: font.unwrap() @@ -121,16 +149,16 @@ class HTMLDocxPreprocessor: def 
clean_trash(self): # todo make it regex dict - """Function to remove all styles and tags we don't need.""" - self._clean_tag('span', 'style', re.compile( - r'^background: #[\da-fA-F]{6}$')) + """Function to remove all styles and tags we don"t need.""" + self._clean_tag("span", "style", re.compile( + r"^background: #[\da-fA-F]{6}$")) # todo: check for another languages - self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) - self._clean_tag('span', 'style', re.compile( - '^letter-spacing: -?[\d.]+pt$')) + self._clean_tag("span", "lang", re.compile(r"^ru-RU$")) + self._clean_tag("span", "style", re.compile( + "^letter-spacing: -?[\d.]+pt$")) - self._clean_tag('font', 'face', re.compile( - r'^Times New Roman[\w, ]+$')) + self._clean_tag("font", "face", re.compile( + r"^Times New Roman[\w, ]+$")) self._clean_tag("a", "name", "_GoBack") self._clean_underline_links() @@ -139,60 +167,68 @@ class HTMLDocxPreprocessor: # replace toc with empty tag tables = self.body_tag.find_all( - "div", id=re.compile(r'^Table of Contents\d+')) + "div", id=re.compile(r"^Table of Contents\d+")) for table in tables: table.wrap(self.html_soup.new_tag("TOC")) table.decompose() + def _preprocessing_headings(self): + # todo regex + """Function to convert all lower level headings to p tags""" + pattern = f"^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$" + header_tags = self.body_tag.find_all(re.compile(pattern)) + for tag in header_tags: + tag.name = "p" + def _process_paragraph(self): """Function to process

    tags (text-align and text-indent value).""" - paragraphs = self.body_tag.find_all('p') + paragraphs = self.body_tag.find_all("p") for p in paragraphs: # libre converts some \n into

    with 2
    # there we remove 1 unnecessary
    - brs = p.find_all('br') + brs = p.find_all("br") text = p.text - if brs and text == '\n\n' and len(brs) == 2: + if brs and text == "\n\n" and len(brs) == 2: brs[0].decompose() indent_should_be_added = False - if text and ((text[0:1] == '\t') or (text[:2] == '\n\t')): + if text and ((text[0:1] == "\t") or (text[:2] == "\n\t")): indent_should_be_added = True - align = p.get('align') - style = p.get('style') + align = p.get("align") + style = p.get("style") if style: - indent = re.search(r'text-indent: ([\d.]{1,4})in', style) - margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style) + indent = re.search(r"text-indent: ([\d.]{1,4})in", style) + margin_left = re.search(r"margin-left: ([\d.]{1,4})in", style) margin_right = re.search( - r'margin-right: ([\d.]{1,4})in', style) - margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style) + r"margin-right: ([\d.]{1,4})in", style) + margin_top = re.search(r"margin-top: ([\d.]{1,4})in", style) margin_bottom = re.search( - r'margin-bottom: ([\d.]{1,4})in', style) + r"margin-bottom: ([\d.]{1,4})in", style) else: indent = margin_left = margin_right = \ margin_top = margin_bottom = None if margin_left and margin_right and margin_top and margin_bottom and \ - margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ - margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11': - p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) + margin_left.group(1) == "0.6" and margin_right.group(1) == "0.6" and \ + margin_top.group(1) == "0.14" and margin_bottom.group(1) == "0.11": + p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote")) p.attrs = {} - style = '' + style = "" if align is not None and align != LiveCartaConfig.DEFAULT_ALIGN_STYLE: - style += f'text-align: {align};' + style += f"text-align: {align};" if indent is not None or indent_should_be_added: # indent = indent.group(1) - style += f'text-indent: {LiveCartaConfig.INDENT};' + style += f"text-indent: {LiveCartaConfig.INDENT};" 
if style: - p.attrs['style'] = style + p.attrs["style"] = style def _process_two_columns(self): """Function to process paragraphs which has two columns layout.""" @@ -203,40 +239,6 @@ class HTMLDocxPreprocessor: child["class"] = "columns2" div.unwrap() - def _process_tables(self): - """Function to process tables. Set "border" attribute.""" - tables = self.body_tag.find_all("table") - for table in tables: - tds = table.find_all("td") - - sizes = [] - for td in tds: - style = td.get('style') - - if style: - match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) - - if match: - size = match.group(1) - units = match.group(2) - - if units == "pt": - size = self.convert_pt_to_px(size) - - sizes.append(float(size)) - - width = td.get('width') - - td.attrs = {} - if width: - td.attrs['width'] = width - - if sizes: - border_size = sum(sizes) / len(sizes) - table.attrs['border'] = f'{border_size:.2}' - - self.tables_amount = len(tables) - def _process_quotes(self): """ Function to process block quotes. @@ -259,9 +261,9 @@ class HTMLDocxPreprocessor: for table in tables: trs = table.find_all("tr") tds = table.find_all("td") - if len(trs) == 1 and len(tds) == 1 and tds[0].get('width') == '600': + if len(trs) == 1 and len(tds) == 1 and tds[0].get("width") == "600": td = tds[0] - is_zero_border = 'border: none;' in td.get('style') + is_zero_border = "border: none;" in td.get("style") paragraphs = td.find_all("p") has_i_tag_or_br = [(p.i, p.br) for p in paragraphs] has_i_tag_or_br = [x[0] is not None or x[1] is not None @@ -269,27 +271,61 @@ class HTMLDocxPreprocessor: if all(has_i_tag_or_br) and is_zero_border: new_div = BeautifulSoup( - features='lxml').new_tag('blockquote') + features="lxml").new_tag("blockquote") for p in paragraphs: new_div.append(p) table.replaceWith(new_div) + def _process_tables(self): + """Function to process tables. 
Set "border" attribute.""" + tables = self.body_tag.find_all("table") + for table in tables: + tds = table.find_all("td") + + sizes = [] + for td in tds: + style = td.get("style") + + if style: + match = re.search(r"border: ?(\d+\.?\d*)(p[tx])", style) + + if match: + size = match.group(1) + units = match.group(2) + + if units == "pt": + size = self.convert_pt_to_px(size) + + sizes.append(float(size)) + + width = td.get("width") + + td.attrs = {} + if width: + td.attrs["width"] = width + + if sizes: + border_size = sum(sizes) / len(sizes) + table.attrs["border"] = f"{border_size:.2}" + + self.tables_amount = len(tables) + def _process_hrefs(self): a_tags_with_href = self.body_tag.find_all( - 'a', {'href': re.compile('^.*http.+')}) + "a", {"href": re.compile("^.*http.+")}) # remove char=end of file for some editors for tag in a_tags_with_href: - tag.string = tag.text.replace('\u200c', '') - tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + tag.string = tag.text.replace("\u200c", "") + tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") a_tags_with_href = self.body_tag.find_all( - 'a', {'href': re.compile('^(?!#sdfootnote)')}) + "a", {"href": re.compile("^(?!#sdfootnote)")}) for tag in a_tags_with_href: - tag.string = tag.text.replace('\u200c', '') - tag.string = tag.text.replace('\u200b', '') # zero-width-space - tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') + tag.string = tag.text.replace("\u200c", "") + tag.string = tag.text.replace("\u200b", "") # zero-width-space + tag["href"] = tag.attrs.get("href").replace("%E2%80%8C", "") def _process_footer(self): # todo regex @@ -297,7 +333,7 @@ class HTMLDocxPreprocessor: Function to process

    tags. All the tags will be deleted from file. """ - divs = self.body_tag.find_all('div', {'title': 'footer'}) + divs = self.body_tag.find_all("div", {"title": "footer"}) for div in divs: div.decompose() @@ -305,90 +341,9 @@ class HTMLDocxPreprocessor: # todo regex """Function to process
    tags. All the tags will be deleted from file, all content of the tags will stay.""" divs = self.body_tag.find_all("div") - for div in divs: div.unwrap() - def _check_parent_link_exist_in_toc(self, tag_with_link): - toc_links = [] - for a_tag in tag_with_link.find_all("a", {'name': re.compile(r'^_Toc\d+')}): - link_name = a_tag.attrs['name'] - toc_item = self.body_tag.find("a", {'href': '#' + link_name}) - if toc_item: - toc_links.append(toc_item) - - return len(toc_links) > 0 - - def _process_toc_links(self): - """Function to extract nodes which contains TOC links, remove links from file and detect headers.""" - toc_links = self.body_tag.find_all( - "a", {'name': re.compile(r'^_Toc\d+')}) - headers = [link.parent for link in toc_links] - outline_level = "1" # All the unknown outlines will be predicted as

    - for h_tag in headers: - if re.search(r"^h\d$", h_tag.name): - h_tag.a.unwrap() - # outline_level = tag.name[-1] # TODO: add prediction of the outline level - elif h_tag.name == "p": - exist_in_toc = self._check_parent_link_exist_in_toc(h_tag) - if h_tag in self.body_tag.find_all("p") and exist_in_toc: - new_tag = BeautifulSoup( - features="lxml").new_tag("h" + outline_level) - text = h_tag.text - h_tag.replaceWith(new_tag) - new_tag.string = text - else: - # rethink document structure when you have toc_links, other cases? - self.logger_object.log(f'Something went wrong in processing toc_links.' - f' Check the structure of the file. ' - f'Tag name: {h_tag.name}') - - @staticmethod - def clean_title_from_numbering(title: str): - """Function to remove digits from headers.""" - title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. 
?)+', '', title) # delete chapter I, (ABC) from the title - return title - - @staticmethod - def clean_tag_from_tabs(tag: NavigableString): - cleaned = re.sub(r'(\s+)+', ' ', tag) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - tag.replace_with(this) - # print('input: ', repr(tag)) - # print('test: ', repr(cleaned)) - - def clean_tag_from_numbering(self, tag): - cleaned = self.clean_title_from_numbering(tag) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - tag.replace_with(this) - # print('input: ', repr(tag)) - # print('test: ', repr(cleaned)) - - def apply_func_to_last_child(self, tag, func=None): - """ - works only with constructions like (((child to work with))) - where child is object of NavigableString - """ - if type(tag) is NavigableString: - func(tag) - else: - children = list(tag.children) - if children: - self.apply_func_to_last_child(children[0], func) - - def _preprocessing_headings(self): - # todo regex - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = self.body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - def _get_top_level_headers(self): """ Function for gathering info about top-level chapters. @@ -416,27 +371,26 @@ class HTMLDocxPreprocessor: tag.parent.unwrap() title = tag.text - title = re.sub(r'\s+', ' ', title).strip() - number = re.match(r'^(?:\.?\d+\.? ?)+', title) + title = re.sub(r"\s+", " ", title).strip() + number = re.match(r"^(?:\.?\d+\.? 
?)+", title) is_numbered = number is not None - cleaned_title = self.clean_title_from_numbering(tag.text) - is_introduction = cleaned_title.lower() == 'introduction' + cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text) + is_introduction = cleaned_title.lower() == "introduction" headers_info.append({ - 'title': cleaned_title, - 'is_numbered': is_numbered, - 'is_introduction': is_introduction}) - + "title": cleaned_title, + "is_numbered": is_numbered, + "is_introduction": is_introduction}) return headers_info def _mark_introduction_headers(self): """ Function to find out: - what header shouldn't be numbered and can be treated as introduction chapter + what header shouldn"t be numbered and can be treated as introduction chapter Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header - 2. it is first header from the top level list, and it equals to 'introduction' + 2. it is first header from the top level list, and it equals to "introduction" Returns ------- @@ -444,9 +398,9 @@ class HTMLDocxPreprocessor: mark each top-level header with flag should_be_numbered = true/false """ - is_numbered_header = [header['is_numbered'] + is_numbered_header = [header["is_numbered"] for header in self.top_level_headers] - is_title = [header['is_introduction'] + is_title = [header["is_introduction"] for header in self.top_level_headers] first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 @@ -454,12 +408,31 @@ class HTMLDocxPreprocessor: first_header_is_introduction = is_title and is_title[0] if (first_not_numbered and second_is_numbered_or_not_exist) or first_header_is_introduction: - self.top_level_headers[0]['should_be_numbered'] = False + self.top_level_headers[0]["should_be_numbered"] = False for i in range(1, len(self.top_level_headers)): - self.top_level_headers[i]['should_be_numbered'] = True + self.top_level_headers[i]["should_be_numbered"] = True else: for i in range(0, len(self.top_level_headers)): - 
self.top_level_headers[i]['should_be_numbered'] = True + self.top_level_headers[i]["should_be_numbered"] = True + + @staticmethod + def clean_title_from_tabs(tag: NavigableString): + cleaned = re.sub(r"[\s\xa0]", " ", tag) + this = BeautifulSoup.new_string(BeautifulSoup( + features="lxml"), cleaned, NavigableString) + tag.replace_with(this) + + def apply_func_to_last_child(self, tag, func=None): + """ + works only with constructions like (((child to work with))) + where child is object of NavigableString + """ + if type(tag) is NavigableString: + func(tag) + else: + children = list(tag.children) + if children: + self.apply_func_to_last_child(children[0], func) def _process_headings(self): # todo regex @@ -499,44 +472,33 @@ class HTMLDocxPreprocessor: while tag.parent.name == "ol": tag.parent.unwrap() - title = tag.text - title = self.clean_title_from_numbering(title) - if title == "": + cleaned_title = re.sub(r"[\s\xa0]", " ", tag.text) + if cleaned_title == "": tag.unwrap() else: assert tag.name in LiveCartaConfig.SUPPORTED_HEADERS, \ - f'Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' + f"Preprocessing went wrong, there is still h{LiveCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings." 
content = list(tag.children) - # do not take into account rubbish empty tags like , but don't remove them + # do not take into account rubbish empty tags like , but don"t remove them content = [item for item in content if - (type(item) is not NavigableString and item.text != '') + (type(item) is not NavigableString and item.text != "") or (type(item) is NavigableString)] + content[0] = "" if content[0] == " " else content[0] + content = [item for item in content if item != ""] + for i, item in enumerate(content): if type(content[i]) is NavigableString: - cleaned = re.sub(r'(\s+)+', ' ', content[i]) + cleaned = re.sub(r"(\s+)+", " ", content[i]) this = BeautifulSoup.new_string(BeautifulSoup( features="lxml"), cleaned, NavigableString) content[i].replace_with(this) content[i] = this else: self.apply_func_to_last_child( - content[i], self.clean_tag_from_tabs) - - content[0] = '' if content[0] == ' ' else content[0] - content = [item for item in content if item != ''] - - if type(content[0]) is NavigableString: - cleaned = self.clean_title_from_numbering(content[0]) - this = BeautifulSoup.new_string(BeautifulSoup( - features="lxml"), cleaned, NavigableString) - content[0].replace_with(this) - content[0] = this - else: - self.apply_func_to_last_child( - content[0], self.clean_tag_from_numbering) + content[i], self.clean_title_from_tabs) def _process_lists(self): # todo regex @@ -551,81 +513,76 @@ class HTMLDocxPreprocessor: uwrap

    tag with li """ - li_tags = self.body_tag.find_all("li") - for li_tag in li_tags: li_tag.attrs.update(li_tag.p.attrs) li_tag.p.unwrap() def delete_content_before_toc(self): # remove all tag upper the only in content !!! body tag is not updated - toc_tag = self.html_soup.new_tag('TOC') + toc_tag = self.html_soup.new_tag("TOC") + self.content: List[Tag] = self.body_tag.find_all(recursive=False) if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 self.content = self.content[ind:] - def process_html(self, access=None, html_path='', book_id=0): + def process_html(self, access=None, html_path="", book_id=0): """Process html code to satisfy LiveCarta formatting.""" - self.logger_object.log('Beginning of processing .html file.') + self.logger_object.log("Beginning of processing .html file.") try: - self.logger_object.log(f'Processing TOC and headers.') + self.logger_object.log(f"Processing TOC and headers.") self._process_toc_links() self.clean_trash() # process main elements of the .html doc - self.logger_object.log(f'Processing main elements of html.') + self.logger_object.log(f"Processing main elements of html.") self._preprocessing_headings() self._process_paragraph() self._process_two_columns() - self.logger_object.log('Block quotes processing.') + self.logger_object.log("Block quotes processing.") self._process_quotes() - self.logger_object.log('Tables processing.') + self.logger_object.log("Tables processing.") self._process_tables() self.logger_object.log( - f'{self.tables_amount} tables have been processed.') + f"{self.tables_amount} tables have been processed.") - self.logger_object.log('Hrefs processing.') + self.logger_object.log("Hrefs processing.") self._process_hrefs() - self.logger_object.log('Footnotes processing.') + self.logger_object.log("Footnotes processing.") self.footnotes = process_footnotes(self.body_tag) self.logger_object.log( - f'{len(self.footnotes)} footnotes have been processed.') + f"{len(self.footnotes)} footnotes have 
been processed.") - self.logger_object.log('Image processing.') + self.logger_object.log("Image processing.") self.images = process_images(access=access, html_path=html_path, book_id=book_id, body_tag=self.body_tag) self.logger_object.log( - f'{len(self.images)} images have been processed.') + f"{len(self.images)} images have been processed.") self._process_footer() self._process_div() - self.content = self.body_tag.find_all(recursive=False) - self.top_level_headers = self._get_top_level_headers() self._mark_introduction_headers() self._process_headings() - self.content: List[Tag] = self.body_tag.find_all(recursive=False) - self._process_lists() # delete text before table of content if exists self.delete_content_before_toc() except Exception as exc: self.logger_object.log( - 'Error has occurred while processing html.', logging.ERROR) + "Error has occurred while processing html.", logging.ERROR) self.logger_object.log_error_to_main_log() if self.status_wrapper: self.status_wrapper.set_error() raise exc - self.logger_object.log('End of processing .html file.') + self.logger_object.log("End of processing .html file.") return self.content, self.footnotes, self.top_level_headers From 253c4ebe26664aa5abf8b7d893602818d4a62acf Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:21:20 +0300 Subject: [PATCH 51/55] Add new gitignores --- books/docx/.gitignore | 2 ++ books/epub/.gitignore | 2 ++ books/html/.gitignore | 2 ++ books/json/.gitignore | 2 ++ 4 files changed, 8 insertions(+) create mode 100644 books/docx/.gitignore create mode 100644 books/epub/.gitignore create mode 100644 books/html/.gitignore create mode 100644 books/json/.gitignore diff --git a/books/docx/.gitignore b/books/docx/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/books/docx/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/books/epub/.gitignore b/books/epub/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/books/epub/.gitignore @@ 
-0,0 +1,2 @@ +* +!.gitignore diff --git a/books/html/.gitignore b/books/html/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/books/html/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/books/json/.gitignore b/books/json/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/books/json/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore From af466cbc27d2f43af6b317bf6a5e13b8881d2055 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:44:19 +0300 Subject: [PATCH 52/55] Change paths to books --- consumer.py | 2 +- presets/.gitignore | 2 + src/docx_converter/docx_solver.py | 8 ++-- src/docx_converter/footnotes_processing.py | 44 +++++++++--------- src/docx_converter/image_processing.py | 12 ++--- .../libre_html2json_converter.py | 46 +++++++++---------- src/epub_converter/epub_converter.py | 2 +- src/epub_converter/footnotes_processing.py | 4 +- src/epub_converter/image_processing.py | 2 +- 9 files changed, 62 insertions(+), 60 deletions(-) create mode 100644 presets/.gitignore diff --git a/consumer.py b/consumer.py index 095facf..dfa0b16 100644 --- a/consumer.py +++ b/consumer.py @@ -33,7 +33,7 @@ def configure_file_logger(name, filename="logs/converter.log", filemode="w+", def local_convert_book(book_type: [DocxBook, EpubBook], book_id, logger, params: dict): logger.info(f"Start processing book-{book_id}.") try: - json_file_path = "json/9781614382264.json" + json_file_path = "books/json/9781614382264.json" book = book_type(book_id=book_id, main_logger=logger, **params) book.conversion_local(json_file_path) except Exception as exc: diff --git a/presets/.gitignore b/presets/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/presets/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/src/docx_converter/docx_solver.py b/src/docx_converter/docx_solver.py index 6260edb..5edeb46 100644 --- a/src/docx_converter/docx_solver.py +++ b/src/docx_converter/docx_solver.py @@ -14,7 +14,7 @@ class 
DocxBook(BookSolver): def __init__(self, book_id=0, access=None, main_logger=None, libre_locker=None): super().__init__(book_id, access, main_logger) - self.book_type = 'docx' + self.book_type = "docx" # critical section for occupying libreoffice by one thread self.libre_locker: Event() = libre_locker @@ -53,9 +53,9 @@ class DocxBook(BookSolver): if __name__ == "__main__": - docx_file_path = '../../books/docx/music_inquiry.docx' + docx_file_path = "../../books/docx/music_inquiry.docx" logger_object = BookLogger( - name='docx', book_id=docx_file_path.split('/')[-1]) + name="docx", book_id=docx_file_path.split("/")[-1]) locker = Event() locker.set() @@ -70,5 +70,5 @@ if __name__ == "__main__": content, footnotes, top_level_headers, logger_object) content_dict = json_converter.convert_to_dict() - with codecs.open(docx_file_path.replace('docx', 'json'), 'w', encoding='utf-8') as f: + with codecs.open(docx_file_path.replace("docx", "json"), "w", encoding="utf-8") as f: json.dump(content_dict, f, ensure_ascii=False) diff --git a/src/docx_converter/footnotes_processing.py b/src/docx_converter/footnotes_processing.py index c269b73..bda6733 100644 --- a/src/docx_converter/footnotes_processing.py +++ b/src/docx_converter/footnotes_processing.py @@ -9,58 +9,58 @@ def _clean_footnote_content(content): def process_footnotes(body_tag): """Function returns list of footnotes and delete them from html_soup.""" - footnote_anchors = body_tag.find_all('a', class_='sdfootnoteanc') + footnote_anchors = body_tag.find_all("a", class_="sdfootnoteanc") footnote_content = body_tag.find_all( - 'div', id=re.compile(r'^sdfootnote\d+$')) + "div", id=re.compile(r"^sdfootnote\d+$")) footnote_amt = len(footnote_anchors) assert footnote_amt == len(footnote_content), \ - 'Something went wrong with footnotes after libre conversion' + "Something went wrong with footnotes after libre conversion" footnotes = [] for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): true_a_tag = 
cont_tag.find_all( - 'a', class_=re.compile(r'^sdfootnote.+$'))[0] + "a", class_=re.compile(r"^sdfootnote.+$"))[0] - if true_a_tag.attrs.get('href') is None: + if true_a_tag.attrs.get("href") is None: cont_tag.a.decompose() continue - assert anc_tag['name'] == true_a_tag['href'][1:], \ - 'Something went wrong with footnotes after libre conversion' + assert anc_tag["name"] == true_a_tag["href"][1:], \ + "Something went wrong with footnotes after libre conversion" - new_tag = BeautifulSoup(features='lxml').new_tag('sup') - new_tag['class'] = 'footnote-element' - new_tag['data-id'] = i + 1 - new_tag['id'] = f'footnote-{i + 1}' - new_tag.string = '*' + new_tag = BeautifulSoup(features="lxml").new_tag("sup") + new_tag["class"] = "footnote-element" + new_tag["data-id"] = i + 1 + new_tag["id"] = f"footnote-{i + 1}" + new_tag.string = "*" anc_tag.replace_with(new_tag) # extra digits in footnotes from documents downloaded from livecarta a_text = true_a_tag.text - if len(cont_tag.find_all('p')): - sup = cont_tag.find_all('p')[0].find('sup') + if len(cont_tag.find_all("p")): + sup = cont_tag.find_all("p")[0].find("sup") if sup and sup.text == a_text: sup.decompose() - for tag_a in cont_tag.find_all('a', {'class': 'sdfootnotesym'}): + for tag_a in cont_tag.find_all("a", {"class": "sdfootnotesym"}): tag_a.decompose() # remove font-size - for span in cont_tag.find_all('span', {'style': re.compile('font-size')}): - style = span.get('style') + for span in cont_tag.find_all("span", {"style": re.compile("font-size")}): + style = span.get("style") style = re.sub(r"font-size: \d+px", "", style) - if style == '': - del span.attrs['style'] + if style == "": + del span.attrs["style"] else: - span.attrs['style'] = style + span.attrs["style"] = style - unicode_string = '' + unicode_string = "" for child in cont_tag.children: if type(child) is NavigableString: continue - if child.name == 'blockquote': + if child.name == "blockquote": unicode_string += str(child) else: unicode_string += 
child.decode_contents() diff --git a/src/docx_converter/image_processing.py b/src/docx_converter/image_processing.py index 0eab671..9c5fdab 100644 --- a/src/docx_converter/image_processing.py +++ b/src/docx_converter/image_processing.py @@ -10,23 +10,23 @@ def process_images(access, html_path, book_id, body_tag): For now images are moved to one folder. """ - img_tags = body_tag.find_all('img') + img_tags = body_tag.find_all("img") for img in img_tags: - img_name = img.attrs.get('src') + img_name = img.attrs.get("src") # quick fix for bad links - if (len(img_name) >= 3) and img_name[:3] == '../': + if (len(img_name) >= 3) and img_name[:3] == "../": img_name = img_name[3:] - img_path = pathlib.Path(f'{html_path.parent}', f'{img_name}') + img_path = pathlib.Path(f"{html_path.parent}", f"{img_name}") if access is not None: link = access.send_image(img_path, doc_id=book_id) - img.attrs['src'] = link + img.attrs["src"] = link else: if img_tags.index(img) == 0: folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( - folder_path, f'../books/json/img_{book_id}/')) + folder_path, f"../books/json/img_{book_id}/")) new_path.mkdir(exist_ok=True) new_img_path = new_path / img_name copyfile(img_path, new_img_path) diff --git a/src/docx_converter/libre_html2json_converter.py b/src/docx_converter/libre_html2json_converter.py index 0cd92fa..eb5f0a2 100644 --- a/src/docx_converter/libre_html2json_converter.py +++ b/src/docx_converter/libre_html2json_converter.py @@ -29,7 +29,7 @@ class LibreHTML2JSONConverter: cleaned text """ - new_text = re.sub(r'([\n\t])', ' ', html_text) + new_text = re.sub(r"([\n\t])", " ", html_text) return new_text # TODO: rethink the function structure without indexes. 
@@ -48,16 +48,16 @@ class LibreHTML2JSONConverter: """ if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: title = str(self.content[ind]) - title = title.replace(f'<{self.content[ind].name}>', '') - title = title.replace(f'', '') - title = re.sub(r'^\n', '', title) + title = title.replace(f"<{self.content[ind].name}>", "") + title = title.replace(f"", "") + title = re.sub(r"^\n", "", title) # extract outline from tag curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) result = { - 'title': f'{title}', - 'contents': [], - 'sub_items': [] + "title": f"{title}", + "contents": [], + "sub_items": [] } ch_content = [] ind += 1 @@ -71,9 +71,9 @@ class LibreHTML2JSONConverter: header_dict, ind = self.header_to_livecarta_chapter_item( ind) if ch_content: - result['contents'].append("".join(ch_content)) + result["contents"].append("".join(ch_content)) ch_content = [] - result['sub_items'].append(header_dict) + result["sub_items"].append(header_dict) # - current h_i <= h_initial, end of recursion else: # return result, ind @@ -85,21 +85,21 @@ class LibreHTML2JSONConverter: ind += 1 if ch_content: - result['contents'].append("".join(ch_content)) + result["contents"].append("".join(ch_content)) return result, ind - return '' + return "" @staticmethod def _is_empty_p_tag(tag): - if tag.name != 'p': + if tag.name != "p": return False temp_tag = copy(tag) - brs = temp_tag.find_all('br') + brs = temp_tag.find_all("br") for br in brs: br.decompose() - text = re.sub(r'\s+', '', temp_tag.text) + text = re.sub(r"\s+", "", temp_tag.text) if text: return False @@ -117,7 +117,7 @@ class LibreHTML2JSONConverter: res, ind = self.header_to_livecarta_chapter_item(ind) else: - chapter_title = f'Untitled chapter {ch_num}' + chapter_title = f"Untitled chapter {ch_num}" chapter = [] while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: if not self._is_empty_p_tag(self.content[ind]): @@ -126,9 +126,9 @@ class 
LibreHTML2JSONConverter: ind += 1 if chapter: res = { - 'title': chapter_title, - 'contents': ["".join(chapter)], - 'sub_items': [] + "title": chapter_title, + "contents": ["".join(chapter)], + "sub_items": [] } ch_num += 1 @@ -136,10 +136,10 @@ class LibreHTML2JSONConverter: json_strc.append(res) ch_amt += 1 self.logger_object.log( - f'Chapter {ch_amt} has been added to structure.') + f"Chapter {ch_amt} has been added to structure.") except Exception as exc: self.logger_object.log( - 'Error has occurred while making json structure.', logging.ERROR) + "Error has occurred while making json structure.", logging.ERROR) self.logger_object.log_error_to_main_log() if self.book_api_status: self.book_api_status.set_error() @@ -148,10 +148,10 @@ class LibreHTML2JSONConverter: # Add is_introduction field to json structure # after deleting content before toc, some chapters can be deleted if self.top_level_headers: - same_first_titles = self.top_level_headers[0]['title'] == json_strc[0]['title'] - is_first_header_introduction = not self.top_level_headers[0]['should_be_numbered'] + same_first_titles = self.top_level_headers[0]["title"] == json_strc[0]["title"] + is_first_header_introduction = not self.top_level_headers[0]["should_be_numbered"] - json_strc[0]['is_introduction'] = is_first_header_introduction + json_strc[0]["is_introduction"] = is_first_header_introduction self.content_dict = { "content": json_strc, diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 4a09481..b8bccf2 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -633,7 +633,7 @@ class EpubConverter: if __name__ == "__main__": - epub_file_path = "../../epub/9780763774134.epub" + epub_file_path = "../../books/epub/9780763774134.epub" logger_object = BookLogger( name="epub", book_id=epub_file_path.split("/")[-1]) diff --git a/src/epub_converter/footnotes_processing.py b/src/epub_converter/footnotes_processing.py index 
f82f073..34cd1fb 100644 --- a/src/epub_converter/footnotes_processing.py +++ b/src/epub_converter/footnotes_processing.py @@ -72,7 +72,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note expected_footnote_tags = verify_footnote_tag(expected_footnote_tags) footnote_tag = expected_footnote_tags[0] - if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote": + if footnote_tag.parent.attrs.get("role") and footnote_tag.parent.attrs.get("role") == "doc-endnote": footnote_tag = footnote_tag.parent new_noterefs_tags.append( _replace_with_livecarta_anchor_tag(noteref_tag, i)) @@ -80,7 +80,7 @@ # footnote_tag.decompose() footnotes.append(content) footnote_tag = footnote_tag.find( - attrs={"role": "doc-backlink"}) or footnote_tag + attrs={"role": "doc-backlink"}) or footnote_tag new_footnotes_tags.append(footnote_tag) for i, (noteref, footnote) in enumerate(zip(new_noterefs_tags, new_footnotes_tags)): diff --git a/src/epub_converter/image_processing.py b/src/epub_converter/image_processing.py index e568aaa..6f35c3a 100644 --- a/src/epub_converter/image_processing.py +++ b/src/epub_converter/image_processing.py @@ -16,7 +16,7 @@ def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): """Function saves all images locally""" folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) new_path = pathlib.Path(os.path.join( - folder_path, f"../json/img_{book_id}/")) + folder_path, f"../books/json/img_{book_id}/")) new_path.mkdir(exist_ok=True) new_img_path = new_path / os.path.basename(img_file_path) From c19f76bc90cec635790dfed819d6c965dc93e137 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 27 Jul 2022 20:45:00 +0300 Subject: [PATCH 53/55] Change folders structure --- {doc => docs}/style_config | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {doc => docs}/style_config (100%)
diff --git a/doc/style_config b/docs/style_config similarity index 100% rename from doc/style_config rename to docs/style_config From 009b755a31a97e662c6e883c47952fceb2de65c0 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 28 Jul 2022 11:52:47 +0300 Subject: [PATCH 54/55] Add path to backend preset --- src/access.py | 33 +++++++++++------------ src/epub_converter/epub_converter.py | 2 +- src/epub_converter/html_epub_processor.py | 2 +- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/access.py b/src/access.py index ba8ddb6..6d22202 100644 --- a/src/access.py +++ b/src/access.py @@ -8,28 +8,25 @@ from io import BytesIO class Access: """Class accessing our platform""" - - PENDING = 1 - PROCESS = 2 - GENERATE = 3 - FINISH = 4 - ERROR = 5 - - url = None - username = None - password = None - - token = None - refresh = None - refresh_time = None - headers = None - refreshing = Event() - - def __init__(self, url): + def __init__(self, url=None): """ :param url: str, url received from queue message, if field apiURL exists else None """ + self.PENDING = 1 + self.PROCESS = 2 + self.GENERATE = 3 + self.FINISH = 4 + self.ERROR = 5 + + self.username = None + self.password = None + + self.token = None + self.refresh = None + self.refresh_time = None + self.headers = None + self.refreshing = Event() self.set_credentials(url) self.get_token() diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index b8bccf2..fb3b786 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -638,7 +638,7 @@ if __name__ == "__main__": name="epub", book_id=epub_file_path.split("/")[-1]) css_processor = CSSPreprocessor() - html_processor = HtmlEpubPreprocessor("../../presets/presets.json", logger=logger_object) + html_processor = HtmlEpubPreprocessor(logger=logger_object) json_converter = EpubConverter(epub_file_path, logger=logger_object, css_processor=css_processor, html_processor=html_processor) diff --git 
a/src/epub_converter/html_epub_processor.py b/src/epub_converter/html_epub_processor.py index d8403d1..da2a6c0 100644 --- a/src/epub_converter/html_epub_processor.py +++ b/src/epub_converter/html_epub_processor.py @@ -6,7 +6,7 @@ from src.util.helpers import BookLogger class HtmlEpubPreprocessor: - def __init__(self, preset_path, logger=None): + def __init__(self, preset_path="../../presets/presets.json", logger=None): self.preset = json.load(open(preset_path)) self.logger: BookLogger = logger self.name2function = { From ea13d38f276e090cf051cfd92c4359e008f3c358 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Thu, 28 Jul 2022 13:03:32 +0300 Subject: [PATCH 55/55] Delete temporary files --- src/book_solver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/book_solver.py b/src/book_solver.py index 10af671..a7625d5 100644 --- a/src/book_solver.py +++ b/src/book_solver.py @@ -57,7 +57,7 @@ class BookSolver: with open(file_path, "wb+") as file: file.write(content) self.logger_object.log( - f"Preset file was saved to folder: {folder_path}.") + f"File was saved to folder: {folder_path}.") except Exception as exc: self.logger_object.log( f"Error in writing {self.book_type} file.", logging.ERROR) @@ -153,12 +153,14 @@ class BookSolver: """ try: - self.logger_object.log( - f"Beginning of conversion from .{self.book_type} to .json.") self.get_preset_file() self.get_book_file() + self.logger_object.log( + f"Beginning of conversion from .{self.book_type} to .json.") self.status_wrapper.set_processing() content_dict = self.get_converted_book() + [os.remove(path) for path in [self.preset_path, self.book_path]] + self.logger_object.log("Beginning of processing .json output.") self.status_wrapper.set_generating() self.write_to_json(content_dict) self.send_json_content_to_server(content_dict) @@ -184,6 +186,7 @@ class BookSolver: self.status_wrapper.set_processing() with codecs.open(file_path, "r", encoding="utf-8") as f_json: content_dict = 
json.load(f_json) + self.logger_object.log("Beginning of processing .json output.") self.status_wrapper.set_generating() self.send_json_content_to_server(content_dict) self.logger_object.log(f"Sent a file to server. Check LiveCarta.")