diff --git a/src/docx_converter/html_docx_preprocessor.py b/src/docx_converter/html_docx_preprocessor.py index db847b0..e9683f4 100644 --- a/src/docx_converter/html_docx_preprocessor.py +++ b/src/docx_converter/html_docx_preprocessor.py @@ -557,10 +557,10 @@ class HTMLDocxPreprocessor: """ Function to find out: what header shouldn't be numbered and can be treated as introduction chapter - Assume header(s) to be introduction if: 1. one header not numbered, before 1 numbered header 2. it is first header from the top level list, and it equals to 'introduction' + Returns ------- None @@ -665,6 +665,7 @@ class HTMLDocxPreprocessor: Function - process tags
tags. + Returns ------- None diff --git a/src/epub_converter/css_preprocessing.py b/src/epub_converter/css_preprocessing.py index f2dc536..ff4a347 100644 --- a/src/epub_converter/css_preprocessing.py +++ b/src/epub_converter/css_preprocessing.py @@ -37,7 +37,7 @@ def convert_tag_style_values(size_value: str) -> str: """ def find_closest_size(style_value): possible_sizes = list( - takewhile(lambda x: style_value > x, LiveCartaConfig.sizes_pr)) + takewhile(lambda x: style_value >= x, LiveCartaConfig.sizes_pr)) last_possible_size_index = LiveCartaConfig.sizes_pr.index( possible_sizes[-1]) return LiveCartaConfig.sizes_px[last_possible_size_index] diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 2e40dcd..2d286c6 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -380,6 +380,7 @@ class EpubConverter: 1. rebuild ids to be unique in all documents 2a. process anchor which is a whole xhtml file 2b. process anchor which is an element in xhtml file + Returns ------- None diff --git a/src/epub_converter/html_epub_preprocessor.py b/src/epub_converter/html_epub_preprocessor.py index 73e357c..d94c43a 100644 --- a/src/epub_converter/html_epub_preprocessor.py +++ b/src/epub_converter/html_epub_preprocessor.py @@ -9,178 +9,6 @@ from src.access import Access from src.livecarta_config import LiveCartaConfig -def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images locally""" - folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - new_path = pathlib.Path(os.path.join( - folder_path, f'../json/img_{book_id}/')) - new_path.mkdir(exist_ok=True) - - new_img_path = new_path / os.path.basename(img_file_path) - f = open(new_img_path, 'wb+') - f.write(img_content) - f.close() - - return new_img_path - - -def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): - """Function saves all images to 
Amazon web service""" - link_path = access.send_image( - img_file_path, doc_id=book_id, img_content=img_content) - return link_path - - -def update_images_src_links(body_tag: BeautifulSoup, - href2img_content: dict, - path_to_html: str, - access=None, - path2aws_path: dict = None, - book_id: str = None) -> dict: - """Function makes dictionary image_src_path -> Amazon web service_path""" - img_tags = body_tag.find_all('img') - - for img in img_tags: - path_to_img_from_html = img.attrs.get('src') - html_folder = os.path.dirname(path_to_html) - path_to_img_from_root = os.path.normpath(os.path.join( - html_folder, path_to_img_from_html)).replace('\\', '/') - - assert path_to_img_from_root in href2img_content, \ - f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' - - img_content = href2img_content[path_to_img_from_root] - if access is not None: - if path_to_img_from_root in path2aws_path: - new_folder = path2aws_path[path_to_img_from_root] - else: - new_folder = save_image_to_aws( - access, path_to_img_from_root, img_content, book_id) - path2aws_path[path_to_img_from_root] = new_folder - else: - new_folder = save_image_locally( - path_to_img_from_root, img_content, 'book_id') - - img.attrs['src'] = str(new_folder) - if img.attrs.get('width'): - del img.attrs['width'] - if img.attrs.get('height'): - del img.attrs['height'] - if img.attrs.get('style'): - del img.attrs['style'] - return path2aws_path - - -def _preprocess_table(body_tag: BeautifulSoup): - """Function to preprocess tables and tags(td|th|tr): style""" - tables = body_tag.find_all("table") - for table in tables: - t_tags = table.find_all(re.compile("td|th|tr")) - for t_tag in t_tags: - style = t_tag.get('style') - width = '' - if style: - width_match = re.search( - r"[^-]width: ?(\d+\.?\d*)(p[tx])", style) - if width_match: - size = width_match.group(1) - width = size+'px' - - t_tag.attrs['width'] = t_tag.get('width') or width - - if t_tag.attrs.get('style'): - 
t_tag.attrs['style'] = t_tag.attrs['style'].replace( - 'border:0;', '') - - elif t_tag.attrs.get('style') == '': - del t_tag.attrs['style'] - - if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']: - table.attrs['border'] = '1' - - -def _process_lists(body_tag: BeautifulSoup): - """ - Function - - process tags
tags. - Parameters - ---------- - body_tag: Tag, soup object - - Returns - ------- - None - - """ - li_tags = body_tag.find_all("li") - for li_tag in li_tags: - if li_tag.p: - li_tag.attrs.update(li_tag.p.attrs) - li_tag.p.unwrap() - - -def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): - """Function inserts span before tag aren't supported by livecarta""" - new_tag = main_tag.new_tag("span") - new_tag.attrs['id'] = id_ or '' - new_tag.attrs['class'] = class_ or '' - new_tag.string = "\xa0" - tag.insert_before(new_tag) - - -def _clean_headings_content(content: BeautifulSoup, title: str): - def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): - if tag_to_be_removed.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=tag_to_be_removed.attrs.get( - 'id'), - class_=tag_to_be_removed.attrs.get('class')) - - for sub_tag in tag_to_be_removed.find_all(): - if sub_tag.attrs.get('id'): - _insert_span_with_attrs_before_tag(body_tag, - tag_to_be_removed, - id_=sub_tag.attrs['id'], - class_=sub_tag.attrs.get('class')) - - title = title.lower() - for child in content.contents: - if isinstance(child, NavigableString): - text = child - else: - text = child.text - if text and re.sub(r'([\n\t\xa0])', '', text): - text = re.sub(r'([\n\t\xa0])', ' ', text) - text = re.sub(r' +', ' ', text).strip() - text = text.lower() - if title == text: - add_span_to_save_ids_for_links(child, content) - child.extract() - elif (title in text) and (child.name in ['h1', 'h2', 'h3']): - add_span_to_save_ids_for_links(child, content) - child.extract() - break - - -def _heading_tag_to_p_tag(body_tag): - """Function to convert all lower level headings to p tags""" - pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' - header_tags = body_tag.find_all(re.compile(pattern)) - for tag in header_tags: - tag.name = 'p' - - -def _clean_title_from_numbering(title: str): - """Function removes numbering from titles""" - 
title = re.sub(r'^(\s+)+', '', title) - # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title - # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title - # title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title - return title - - def _replace_with_livecarta_anchor_tag(anchor, i): """Function replace noteref_tag(anchor) with new livecarta tag""" new_tag = BeautifulSoup(features='lxml').new_tag('sup') @@ -381,6 +209,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup: _add_span_to_save_ids_for_links(div) div.unwrap() + def _heading_tag_to_p_tag(body_tag): + """Function to convert all lower level headings to p tags""" + pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' + header_tags = body_tag.find_all(re.compile(pattern)) + for tag in header_tags: + tag.name = 'p' + # comments removal for tag in body_tag.find_all(): for element in tag(text=lambda text: isinstance(text, Comment)): @@ -497,6 +332,248 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu return tags +def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images to Amazon web service""" + link_path = access.send_image( + img_file_path, doc_id=book_id, img_content=img_content) + return link_path + + +def save_image_locally(img_file_path: str, img_content: bytes, book_id: str): + """Function saves all images locally""" + folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + new_path = pathlib.Path(os.path.join( + folder_path, f'../json/img_{book_id}/')) + new_path.mkdir(exist_ok=True) + + new_img_path = new_path / os.path.basename(img_file_path) + f = open(new_img_path, 'wb+') + f.write(img_content) + f.close() + + return new_img_path + + +def update_images_src_links(body_tag: BeautifulSoup, + href2img_content: dict, + path_to_html: 
str, + access=None, + path2aws_path: dict = None, + book_id: str = None) -> dict: + """Function makes dictionary image_src_path -> Amazon web service_path""" + img_tags = body_tag.find_all('img') + + for img in img_tags: + path_to_img_from_html = img.attrs.get('src') + html_folder = os.path.dirname(path_to_html) + path_to_img_from_root = os.path.normpath(os.path.join( + html_folder, path_to_img_from_html)).replace('\\', '/') + + assert path_to_img_from_root in href2img_content, \ + f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.' + + img_content = href2img_content[path_to_img_from_root] + if access is not None: + if path_to_img_from_root in path2aws_path: + new_folder = path2aws_path[path_to_img_from_root] + else: + new_folder = save_image_to_aws( + access, path_to_img_from_root, img_content, book_id) + path2aws_path[path_to_img_from_root] = new_folder + else: + new_folder = save_image_locally( + path_to_img_from_root, img_content, 'book_id') + + img.attrs['src'] = str(new_folder) + if img.attrs.get('width'): + del img.attrs['width'] + if img.attrs.get('height'): + del img.attrs['height'] + if img.attrs.get('style'): + del img.attrs['style'] + return path2aws_path + + +def _clean_title_from_numbering(title: str): + """Function removes numbering from titles""" + title = re.sub(r'^(\s+)+', '', title) + # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title + # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering(letters) from the title + # title = re.sub(r'^(?:[A-Za-z]\. 
?)+', '', title) # delete chapter I, (ABC) from the title + return title + + +def prepare_title(title_of_chapter: str) -> str: + """Function finalise processing/cleaning title""" + title_str = BeautifulSoup(title_of_chapter, features='lxml').string + title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) + title_str = re.sub(r' +', ' ', title_str).rstrip() + title_str = _clean_title_from_numbering(title_str) + return title_str + + +def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): + """Function inserts span before tag aren't supported by livecarta""" + new_tag = main_tag.new_tag("span") + new_tag.attrs['id'] = id_ or '' + new_tag.attrs['class'] = class_ or '' + new_tag.string = "\xa0" + tag.insert_before(new_tag) + + +def _clean_headings_content(content: BeautifulSoup, title: str): + def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): + if tag_to_be_removed.attrs.get('id'): + _insert_span_with_attrs_before_tag(body_tag, + tag_to_be_removed, + id_=tag_to_be_removed.attrs.get( + 'id'), + class_=tag_to_be_removed.attrs.get('class')) + + for sub_tag in tag_to_be_removed.find_all(): + if sub_tag.attrs.get('id'): + _insert_span_with_attrs_before_tag(body_tag, + tag_to_be_removed, + id_=sub_tag.attrs['id'], + class_=sub_tag.attrs.get('class')) + + title = title.lower() + for child in content.contents: + if isinstance(child, NavigableString): + text = child + else: + text = child.text + if text and re.sub(r'([\n\t\xa0])', '', text): + text = re.sub(r'([\n\t\xa0])', ' ', text) + text = re.sub(r' +', ' ', text).strip() + text = text.lower() + if title == text: + add_span_to_save_ids_for_links(child, content) + child.extract() + elif (title in text) and (child.name in ['h1', 'h2', 'h3']): + add_span_to_save_ids_for_links(child, content) + child.extract() + break + + +def _process_lists(body_tag: BeautifulSoup): + """ + Function + - process tags
tags.
+ Parameters
+ ----------
+ body_tag: Tag, soup object
+
+ Returns
+ -------
+ None
+
+ """
+ li_tags = body_tag.find_all("li")
+ for li_tag in li_tags:
+ if li_tag.p:
+ li_tag.attrs.update(li_tag.p.attrs)
+ li_tag.p.unwrap()
+
+
+def _preprocess_table(body_tag: BeautifulSoup):
+ """
+ Function to preprocess tables and tags(td|th|tr): style
+ - writes a width taken from the inline style into the 'width' attribute
+ - removes 'border:0;' from inline styles and deletes empty style attributes
+ - sets border='1' on tables whose border is missing, '0' or '0px'
+
+ Parameters
+ ----------
+ body_tag: BeautifulSoup
+ soup object that is searched for table tags; found tags are changed in place
+
+ Returns
+ -------
+ None
+
+ """
+ tables = body_tag.find_all("table")
+ for table in tables:
+ # NOTE(review): the pattern is not anchored and bs4 applies regex filters with
+ # search(), so a tag whose name only contains "td", "th" or "tr" (e.g. "strong")
+ # presumably matches as well - confirm
+ t_tags = table.find_all(re.compile("td|th|tr"))
+ for t_tag in t_tags:
+ style = t_tag.get('style')
+ width = ''
+ if style:
+ # "[^-]" keeps "max-width"/"min-width" from matching, but it also needs one
+ # character in front of "width", so a style that starts with "width:" is not matched
+ width_match = re.search(
+ r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
+ if width_match:
+ # the captured unit (pt or px) is not used, the number always gets 'px'
+ size = width_match.group(1)
+ width = size + 'px'
+
+ # an existing width attribute wins, otherwise the value from style (may be '')
+ t_tag.attrs['width'] = t_tag.get('width') or width
+
+ # only the literal text 'border:0;' is removed from a non-empty style
+ if t_tag.attrs.get('style'):
+ t_tag.attrs['style'] = t_tag.attrs['style'].replace(
+ 'border:0;', '')
+
+ # a style attribute that is an empty string is deleted
+ elif t_tag.attrs.get('style') == '':
+ del t_tag.attrs['style']
+
+ # tables without border, or with border '0'/'0px', get border='1'
+ if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
+ table.attrs['border'] = '1'
+
+
+def _preprocess_code_tags(chapter_tag: BeautifulSoup):
+ """
+ Function
+ - transform to save brs
- p_for_br = chapter_tag.new_tag("p")
- p_for_br.string = "\xa0"
- table.insert_after(p_for_br)
-
-
-def _preprocess_code_tags(chapter_tag: BeautifulSoup):
- """
- Function
- - transform , tags
+ Wrap string of the tag with <code> if it's necessary
+ Parameters
+ ----------
+ chapter_tag: Tag, soup object
+
+ Returns
+ ----------
+ None
+ Modified chapter tag
+
+ """
+ for pre in chapter_tag.find_all("pre"):
+ # a plain string filter is an exact tag-name match, so "code|kbd|var" never
+ # matched and every pre was wrapped again; a list matches any of the names
+ if pre.find_all(["code", "kbd", "var"]):
+ continue
+ else:
+ code = chapter_tag.new_tag("code")
+ # insert all items that was in pre to code and remove from pre
+ for content in reversed(pre.contents):
+ code.insert(0, content.extract())
+ # wrap code with items
+ pre.append(code)
+
+
+def _clean_wiley_block(block):
+ """Function removes 'hr' paragraphs from block and turns its first h1-h9 tag into p with br before it"""
+ # p tags whose class attribute matches the regex ".+ hr"
+ # NOTE(review): the pattern contains a space, so it presumably has to match the
+ # whole class string (e.g. "xxx hr") and not a single class name - confirm
+ hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
+ for hr in hrs:
+ hr.extract()
+ # find() returns only the first tag whose name matches "h[1-9]", or None
+ h = block.find(re.compile("h[1-9]"))
+ if h:
+ h.name = "p"
+ # the br tag is created from a new empty soup and placed right before the renamed tag
+ h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
+
+
def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
"""Function wraps """
table = main_tag.new_tag("table")
@@ -517,16 +594,6 @@ def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_c
return table
-def _clean_wiley_block(block):
- hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
- for hr in hrs:
- hr.extract()
- h = block.find(re.compile("h[1-9]"))
- if h:
- h.name = "p"
- h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
-
-
def _preprocess_block_tags(chapter_tag: Tag):
"""Function preprocessing
"""
- table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
- "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
- table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;'
- td.attrs['bgcolor'] = '#f5f5f5'
- # td.attrs['border-radius'] = '4px'
- span_tag.wrap(td)
- td.wrap(tr)
- tr.wrap(tbody)
- tbody.wrap(table)
- return table
-
-
-def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
- """
- Function preprocessing
tags
- Parameters
- ----------
- chapter_tag: Tag, soup object
-
- Steps
- ----------
- 1. Process NavigableString
- 2. Process Tags and their children
-
- """
- for pre in chapter_tag.find_all("pre"):
- new_tag = BeautifulSoup(features='lxml').new_tag("span")
- new_tag.attrs = pre.attrs.copy()
- new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
- "font-size: 14px; white-space: nowrap;"
- # if in there are multiple , we need to add
after each content
- to_add_br = len(pre.find_all("span")) > 1
- copy_contents = pre.contents[:]
- for child in copy_contents:
- # Navigable String
- if isinstance(child, NavigableString):
- cleaned_text = _prepare_formatted(str(child))
- sub_strings = re.split('\r\n|\n|\r', cleaned_text)
- for string in sub_strings[:-1]:
- new_tag.append(NavigableString(string))
- new_tag.append(BeautifulSoup(
- features='lxml').new_tag('br'))
- new_tag.append(NavigableString(sub_strings[-1]))
- # Tag
- else:
- for sub_child in child.children:
- if isinstance(sub_child, NavigableString):
- cleaned_text = _prepare_formatted(str(sub_child))
- sub_child.replace_with(NavigableString(cleaned_text))
- else:
- sub_child.string = _prepare_formatted(sub_child.text)
- cleaned_tag = child.extract()
- new_tag.append(cleaned_tag)
- if to_add_br:
- new_tag.append(BeautifulSoup(
- features='lxml').new_tag('br'))
- pre.replace_with(new_tag)
- table = _wrap_preformatted_span_with_table(chapter_tag, new_tag)
- # add ,