forked from LiveCarta/BookConverter
Function annotations
This commit is contained in:
@@ -38,7 +38,7 @@ class NavPoint:
|
|||||||
|
|
||||||
|
|
||||||
def flatten(x):
|
def flatten(x):
|
||||||
"""magic function from stackoverflow for list flattening"""
|
"""Magic function from stackoverflow for list flattening"""
|
||||||
atom = lambda i: not isinstance(i, list)
|
atom = lambda i: not isinstance(i, list)
|
||||||
nil = lambda i: not i
|
nil = lambda i: not i
|
||||||
car = lambda i: i[0]
|
car = lambda i: i[0]
|
||||||
|
|||||||
@@ -28,24 +28,27 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
|
|||||||
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||||
|
|
||||||
|
|
||||||
def convert_tag_values(value):
|
def convert_tag_values(value: str) -> str:
|
||||||
"""Function 1. converts values of tags from em/%/pt to px
|
"""
|
||||||
2. find closest font-size px
|
Function
|
||||||
|
- converts values of tags from em/%/pt to px
|
||||||
|
- find closest font-size px
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
value: str
|
value: str
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
converted value: str
|
value: str
|
||||||
"""
|
|
||||||
|
|
||||||
|
"""
|
||||||
def find_closest_size(value):
|
def find_closest_size(value):
|
||||||
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
||||||
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
||||||
return sizes_px[last_possible_size_index]
|
return sizes_px[last_possible_size_index]
|
||||||
|
|
||||||
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
|
font_size_regexp = re.compile(
|
||||||
|
r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
|
||||||
has_style_attrs = re.search(font_size_regexp, value)
|
has_style_attrs = re.search(font_size_regexp, value)
|
||||||
if has_style_attrs:
|
if has_style_attrs:
|
||||||
if has_style_attrs.group(1):
|
if has_style_attrs.group(1):
|
||||||
@@ -61,7 +64,6 @@ def convert_tag_values(value):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
|
||||||
Style properties that can be used to fit livecarta css style convention.
|
Style properties that can be used to fit livecarta css style convention.
|
||||||
@@ -164,17 +166,20 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
|||||||
|
|
||||||
|
|
||||||
def check_style_to_be_tag(style) -> List[tuple]:
|
def check_style_to_be_tag(style) -> List[tuple]:
|
||||||
"""Function search style properties that can be converted to tags.
|
"""
|
||||||
|
Function searches style properties that can be converted to tags.
|
||||||
It searches for them and prepares a list of properties to be removed from the style string
|
It searches for them and prepares a list of properties to be removed from the style string
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
style: str
|
style: str
|
||||||
<tag style="...">
|
<tag style="...">
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
properties to remove: list
|
to_remove: list
|
||||||
"""
|
properties to remove
|
||||||
|
|
||||||
|
"""
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
|
||||||
if f'{k[0]}:{k[1]}' in style:
|
if f'{k[0]}:{k[1]}' in style:
|
||||||
@@ -227,7 +232,7 @@ class TagStyleConverter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def remove_white_if_no_bgcolor(style_, tag):
|
def remove_white_if_no_bgcolor(style_, tag):
|
||||||
""" Function remove white color if there is no text bg color """
|
"""Function remove text white color if there is no bg color"""
|
||||||
if 'background' in style_:
|
if 'background' in style_:
|
||||||
return style_
|
return style_
|
||||||
|
|
||||||
@@ -264,9 +269,11 @@ class TagStyleConverter:
|
|||||||
item = item.split(':')
|
item = item.split(':')
|
||||||
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
||||||
if len(item[1].split(' ')) == 3:
|
if len(item[1].split(' ')) == 3:
|
||||||
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
|
item[1] = convert_tag_values(item[1].split(
|
||||||
|
' ')[-2]) # split returns middle value
|
||||||
else:
|
else:
|
||||||
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
|
item[1] = convert_tag_values(item[1].split(
|
||||||
|
' ')[-1]) # split returns last value
|
||||||
clean_style += item[0] + ': ' + item[1] + '; '
|
clean_style += item[0] + ': ' + item[1] + '; '
|
||||||
|
|
||||||
margin_left_regexp = re.compile(
|
margin_left_regexp = re.compile(
|
||||||
@@ -360,7 +367,7 @@ class TagStyleConverter:
|
|||||||
s = f'{attr}:{value};'
|
s = f'{attr}:{value};'
|
||||||
self.style = self.style.replace(s, '')
|
self.style = self.style.replace(s, '')
|
||||||
self.style = self.style.strip()
|
self.style = self.style.strip()
|
||||||
if i == 0:
|
if not i:
|
||||||
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
|
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
|
||||||
attr, value)]
|
attr, value)]
|
||||||
new_tags.append(self.tag_with_inline_style)
|
new_tags.append(self.tag_with_inline_style)
|
||||||
@@ -402,7 +409,6 @@ class TagStyleConverter:
|
|||||||
if has_p_style_attrs:
|
if has_p_style_attrs:
|
||||||
p_style += item + ';'
|
p_style += item + ';'
|
||||||
initial_style = initial_style.replace(item + ';', '')
|
initial_style = initial_style.replace(item + ';', '')
|
||||||
|
|
||||||
# here check that this style is exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
|
# here check that this style is exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
|
||||||
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
|
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
|
||||||
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
|
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
|
||||||
@@ -410,14 +416,15 @@ class TagStyleConverter:
|
|||||||
# if find styles that cannot be in <p> -> wrap them in span
|
# if find styles that cannot be in <p> -> wrap them in span
|
||||||
tag.name = 'span'
|
tag.name = 'span'
|
||||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
||||||
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||||
has_li_style_attr = re.search(li_attrs_regexp, initial_style)
|
has_p_style_attr = re.search(p_attrs_regexp, initial_style)
|
||||||
span_style = initial_style if not has_li_style_attr else initial_style.replace(
|
span_style = initial_style if not has_p_style_attr else initial_style.replace(
|
||||||
has_li_style_attr.group(1), '')
|
has_p_style_attr.group(1), '')
|
||||||
p_tag.attrs['style'] = p_style
|
p_tag.attrs['style'] = p_style
|
||||||
tag.attrs['style'] = span_style
|
tag.attrs['style'] = span_style
|
||||||
tag.wrap(p_tag)
|
tag.wrap(p_tag)
|
||||||
else: tag.attrs['style'] = p_style
|
else:
|
||||||
|
tag.attrs['style'] = p_style
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def wrap_span_in_li_to_save_style_attrs(tag):
|
def wrap_span_in_li_to_save_style_attrs(tag):
|
||||||
@@ -426,14 +433,13 @@ class TagStyleConverter:
|
|||||||
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
|
||||||
attr not in ['text-align', 'list-style-type']]
|
attr not in ['text-align', 'list-style-type']]
|
||||||
|
|
||||||
styles_to_be_saved = [attr in tag.attrs.get(
|
styles_to_be_saved_in_span = [attr in tag.attrs.get(
|
||||||
'style') for attr in styles_cant_be_in_li]
|
'style') for attr in styles_cant_be_in_li]
|
||||||
if any(styles_to_be_saved):
|
if any(styles_to_be_saved_in_span):
|
||||||
tag.name = 'span'
|
tag.name = 'span'
|
||||||
li_tag = BeautifulSoup(features='lxml').new_tag('li')
|
li_tag = BeautifulSoup(features='lxml').new_tag('li')
|
||||||
span_style = tag.attrs['style']
|
span_style = tag.attrs['style']
|
||||||
li_style = ''
|
li_style = ''
|
||||||
|
|
||||||
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
|
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
|
||||||
re.compile(r'(list-style-type:(\w+);)')]:
|
re.compile(r'(list-style-type:(\w+);)')]:
|
||||||
has_li_style_attrs = re.search(
|
has_li_style_attrs = re.search(
|
||||||
@@ -442,7 +448,6 @@ class TagStyleConverter:
|
|||||||
li_style += has_li_style_attrs.group(1)
|
li_style += has_li_style_attrs.group(1)
|
||||||
span_style = span_style.replace(
|
span_style = span_style.replace(
|
||||||
has_li_style_attrs.group(1), '')
|
has_li_style_attrs.group(1), '')
|
||||||
|
|
||||||
li_tag.attrs['style'] = li_style
|
li_tag.attrs['style'] = li_style
|
||||||
tag.attrs['style'] = span_style
|
tag.attrs['style'] = span_style
|
||||||
tag.wrap(li_tag)
|
tag.wrap(li_tag)
|
||||||
@@ -454,23 +459,23 @@ class TagStyleConverter:
|
|||||||
styles_cant_be_in_ul_ol = [
|
styles_cant_be_in_ul_ol = [
|
||||||
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
|
||||||
|
|
||||||
check = [attr in tag.attrs.get('style')
|
styles_to_be_saved_in_span = [attr in tag.attrs.get('style')
|
||||||
for attr in styles_cant_be_in_ul_ol]
|
for attr in styles_cant_be_in_ul_ol]
|
||||||
if any(check):
|
if any(styles_to_be_saved_in_span):
|
||||||
tag.name = 'span'
|
tag.name = 'span'
|
||||||
li_tag = BeautifulSoup(features='lxml').new_tag('ul')
|
oul_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
|
||||||
span_style = tag.attrs['style']
|
span_style = tag.attrs['style']
|
||||||
|
|
||||||
possible_li_attrs_regexp = re.compile(
|
possible_uol_attrs_regexp = re.compile(
|
||||||
r'(list-style-type:(\w+);)')
|
r'(list-style-type:(\w+);)')
|
||||||
has_li_style_attrs = re.search(
|
has_uol_style_attrs = re.search(
|
||||||
possible_li_attrs_regexp, span_style)
|
possible_uol_attrs_regexp, span_style)
|
||||||
if has_li_style_attrs and has_li_style_attrs.group(1):
|
if has_uol_style_attrs and has_uol_style_attrs.group(1):
|
||||||
oul_style = has_li_style_attrs.group(1)
|
oul_style = has_uol_style_attrs.group(1)
|
||||||
span_style = span_style.replace(oul_style, '')
|
span_style = span_style.replace(oul_style, '')
|
||||||
li_tag.attrs['style'] = oul_style
|
oul_tag.attrs['style'] = oul_style
|
||||||
tag.attrs['style'] = span_style
|
tag.attrs['style'] = span_style
|
||||||
tag.wrap(li_tag)
|
tag.wrap(oul_tag)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def wrap_span_in_h_to_save_style_attrs(tag):
|
def wrap_span_in_h_to_save_style_attrs(tag):
|
||||||
@@ -482,10 +487,10 @@ class TagStyleConverter:
|
|||||||
tag.name = 'span'
|
tag.name = 'span'
|
||||||
tag.wrap(h_tag)
|
tag.wrap(h_tag)
|
||||||
style = tag.attrs['style']
|
style = tag.attrs['style']
|
||||||
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||||
has_li_style_attr = re.search(li_attrs_regexp, style)
|
has_h_style_attr = re.search(h_attrs_regexp, style)
|
||||||
tag.attrs['style'] = style if not has_li_style_attr else style.replace(
|
tag.attrs['style'] = style if not has_h_style_attr else style.replace(
|
||||||
has_li_style_attr.group(1), '')
|
has_h_style_attr.group(1), '')
|
||||||
|
|
||||||
def convert_initial_tag(self):
|
def convert_initial_tag(self):
|
||||||
self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
|
self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
|
||||||
@@ -496,7 +501,7 @@ class TagStyleConverter:
|
|||||||
return self.tag_with_inline_style
|
return self.tag_with_inline_style
|
||||||
|
|
||||||
|
|
||||||
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
|
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
|
||||||
"""Function adds styles from .css to inline style"""
|
"""Function adds styles from .css to inline style"""
|
||||||
css_text = css_text.replace(
|
css_text = css_text.replace(
|
||||||
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
'@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import codecs
|
import codecs
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
from os.path import dirname, normpath, join
|
from os.path import dirname, normpath, join
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
@@ -51,7 +50,8 @@ class EpubConverter:
|
|||||||
# flag to be updated while ebooklib.toc is parsed
|
# flag to be updated while ebooklib.toc is parsed
|
||||||
self.id_anchor_exist_in_nav_points = False
|
self.id_anchor_exist_in_nav_points = False
|
||||||
self.img_href2img_bytes = {} # file path to bytes
|
self.img_href2img_bytes = {} # file path to bytes
|
||||||
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
|
# file path from <a> to generated aws path
|
||||||
|
self.book_image_src_path2aws_path = {}
|
||||||
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||||
self.noterefs: List[Tag] = [] # start of the footnote
|
self.noterefs: List[Tag] = [] # start of the footnote
|
||||||
self.footnotes: List[Tag] = [] # end of the footnote
|
self.footnotes: List[Tag] = [] # end of the footnote
|
||||||
@@ -116,7 +116,6 @@ class EpubConverter:
|
|||||||
return nodes
|
return nodes
|
||||||
|
|
||||||
def get_css_content(self, css_href, html_href):
|
def get_css_content(self, css_href, html_href):
|
||||||
|
|
||||||
path_to_css_from_html = css_href
|
path_to_css_from_html = css_href
|
||||||
html_folder = dirname(html_href)
|
html_folder = dirname(html_href)
|
||||||
path_to_css_from_root = normpath(
|
path_to_css_from_root = normpath(
|
||||||
@@ -132,8 +131,8 @@ class EpubConverter:
|
|||||||
The first is css_href2css_content. It is created to connect href of css to content of css
|
The first is css_href2css_content. It is created to connect href of css to content of css
|
||||||
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
|
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
|
||||||
...2... = key2value
|
...2... = key2value
|
||||||
"""
|
|
||||||
|
|
||||||
|
"""
|
||||||
# dictionary: href of html to related css files
|
# dictionary: href of html to related css files
|
||||||
html_href2css_href: defaultdict = defaultdict(list)
|
html_href2css_href: defaultdict = defaultdict(list)
|
||||||
css_href2css_content: dict = {}
|
css_href2css_content: dict = {}
|
||||||
@@ -165,6 +164,7 @@ class EpubConverter:
|
|||||||
"""
|
"""
|
||||||
This function is designed to update html_href2html_body_soup
|
This function is designed to update html_href2html_body_soup
|
||||||
And add to html_inline_style css_style_content
|
And add to html_inline_style css_style_content
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for html_href in self.html_href2html_body_soup:
|
for html_href in self.html_href2html_body_soup:
|
||||||
if self.html_href2css_href.get(html_href):
|
if self.html_href2css_href.get(html_href):
|
||||||
@@ -191,8 +191,8 @@ class EpubConverter:
|
|||||||
|
|
||||||
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
|
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
|
||||||
:param lvl: level of depth
|
:param lvl: level of depth
|
||||||
"""
|
|
||||||
|
|
||||||
|
"""
|
||||||
if isinstance(element, Link):
|
if isinstance(element, Link):
|
||||||
nav_point = NavPoint(element)
|
nav_point = NavPoint(element)
|
||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
@@ -215,7 +215,8 @@ class EpubConverter:
|
|||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
for elem in second:
|
for elem in second:
|
||||||
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
|
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
|
||||||
self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl))
|
self.offset_sub_nodes.append(
|
||||||
|
self.build_adjacency_list_from_toc(elem, lvl))
|
||||||
else:
|
else:
|
||||||
sub_nodes.append(
|
sub_nodes.append(
|
||||||
self.build_adjacency_list_from_toc(elem, lvl + 1))
|
self.build_adjacency_list_from_toc(elem, lvl + 1))
|
||||||
@@ -239,7 +240,7 @@ class EpubConverter:
|
|||||||
else:
|
else:
|
||||||
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
|
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
|
||||||
|
|
||||||
def is_toc_empty(self):
|
def is_toc_empty(self) -> bool:
|
||||||
"""Function checks is toc empty"""
|
"""Function checks is toc empty"""
|
||||||
# there is no toc in ebook or no top chapters
|
# there is no toc in ebook or no top chapters
|
||||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||||
@@ -295,19 +296,26 @@ class EpubConverter:
|
|||||||
new_anchor_span.string = "\xa0"
|
new_anchor_span.string = "\xa0"
|
||||||
return new_anchor_span
|
return new_anchor_span
|
||||||
|
|
||||||
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
|
||||||
"""
|
"""
|
||||||
|
Function used to find full path to file that is parsed from tag link
|
||||||
TOC: a/b/c.xhtml
|
TOC: a/b/c.xhtml
|
||||||
|
|
||||||
b/c.xhtml -> a/b/c.xhtml
|
b/c.xhtml -> a/b/c.xhtml
|
||||||
c.xhtml -> a/b/c.xhtml
|
c.xhtml -> a/b/c.xhtml
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cur_file_path: str
|
||||||
|
path to current file with tag link
|
||||||
|
href_in_link: str
|
||||||
|
filename got from tag link, like file1.xhtml
|
||||||
|
internal_link_tag: Tag
|
||||||
|
tag object that is parsed now
|
||||||
|
|
||||||
Used to find full path to file that is parsed from tag link
|
Returns
|
||||||
|
-------
|
||||||
|
full_path[0]: str
|
||||||
|
prepared content
|
||||||
|
|
||||||
:param cur_file_path: path to current file with tag link
|
|
||||||
:param href_in_link: filename got from tag link, like file1.xhtml
|
|
||||||
:param internal_link_tag: tag object that is parsed now
|
|
||||||
:return:
|
|
||||||
"""
|
"""
|
||||||
dir_name = os.path.dirname(cur_file_path)
|
dir_name = os.path.dirname(cur_file_path)
|
||||||
normed_path = os.path.normpath(os.path.join(
|
normed_path = os.path.normpath(os.path.join(
|
||||||
@@ -331,6 +339,12 @@ class EpubConverter:
|
|||||||
Function
|
Function
|
||||||
- processing internal links in a book
|
- processing internal links in a book
|
||||||
- make ids unique
|
- make ids unique
|
||||||
|
Steps
|
||||||
|
----------
|
||||||
|
1. rebuild ids to be unique in all documents
|
||||||
|
2a. process anchor which is a whole xhtml file
|
||||||
|
2b. process anchor which is an element in xhtml file
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# 1. rebuild ids to be unique in all documents
|
# 1. rebuild ids to be unique in all documents
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
@@ -344,7 +358,7 @@ class EpubConverter:
|
|||||||
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
|
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
|
||||||
tag.attrs['id'] = new_id
|
tag.attrs['id'] = new_id
|
||||||
|
|
||||||
# 2.a) process anchor which is a whole xhtml file
|
# 2a. process anchor which is a whole xhtml file
|
||||||
internal_link_reg1 = re.compile(
|
internal_link_reg1 = re.compile(
|
||||||
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
|
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
@@ -367,7 +381,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
del internal_link_tag.attrs['href']
|
del internal_link_tag.attrs['href']
|
||||||
|
|
||||||
# 2.b) process anchor which is an element in xhtml file
|
# 2b. process anchor which is an element in xhtml file
|
||||||
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for toc_href in self.hrefs_added_to_toc:
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
soup = self.html_href2html_body_soup[toc_href]
|
||||||
@@ -418,9 +432,9 @@ class EpubConverter:
|
|||||||
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
||||||
f' Old id={a_tag_id}')
|
f' Old id={a_tag_id}')
|
||||||
|
|
||||||
def build_one_chapter(self, nav_point):
|
def build_one_chapter(self, nav_point: NavPoint):
|
||||||
"""
|
"""
|
||||||
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||||
|
|
||||||
3 cases:
|
3 cases:
|
||||||
id wraps all chapter content,
|
id wraps all chapter content,
|
||||||
@@ -429,7 +443,13 @@ class EpubConverter:
|
|||||||
|
|
||||||
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
|
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
|
||||||
and id of the next chapter/subchapter
|
and id of the next chapter/subchapter
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
nav_point: NavPoint
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
"""
|
"""
|
||||||
if nav_point.id:
|
if nav_point.id:
|
||||||
soup = self.html_href2html_body_soup[nav_point.href]
|
soup = self.html_href2html_body_soup[nav_point.href]
|
||||||
@@ -446,7 +466,7 @@ class EpubConverter:
|
|||||||
self.build_one_chapter(sub_node)
|
self.build_one_chapter(sub_node)
|
||||||
|
|
||||||
def define_chapters_content(self):
|
def define_chapters_content(self):
|
||||||
""" Function build chapters content starts from top level chapters """
|
"""Function build chapters content, starts from top level chapters"""
|
||||||
top_level_nav_points = self.adjacency_list[-1]
|
top_level_nav_points = self.adjacency_list[-1]
|
||||||
if self.id_anchor_exist_in_nav_points:
|
if self.id_anchor_exist_in_nav_points:
|
||||||
for point in top_level_nav_points:
|
for point in top_level_nav_points:
|
||||||
@@ -483,8 +503,8 @@ class EpubConverter:
|
|||||||
self.logger.log(f'{indent}Chapter: {title} is prepared.')
|
self.logger.log(f'{indent}Chapter: {title} is prepared.')
|
||||||
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
|
||||||
|
|
||||||
def convert_to_dict(self):
|
def convert_to_dict(self) -> dict:
|
||||||
""" Function which convert list of html nodes to appropriate json structure. """
|
"""Function which convert list of html nodes to appropriate json structure"""
|
||||||
top_level_nav_points = self.adjacency_list[-1]
|
top_level_nav_points = self.adjacency_list[-1]
|
||||||
top_level_chapters = []
|
top_level_chapters = []
|
||||||
|
|
||||||
@@ -502,7 +522,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
filename = '9781641051217'
|
filename = '9781614382264'
|
||||||
logger_object = BookLogger(name='epub', book_id=filename)
|
logger_object = BookLogger(name='epub', book_id=filename)
|
||||||
|
|
||||||
json_converter = EpubConverter(f'../../epub/{filename}.epub',
|
json_converter = EpubConverter(f'../../epub/{filename}.epub',
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from src.access import Access
|
|||||||
from src.livecarta_config import LiveCartaConfig
|
from src.livecarta_config import LiveCartaConfig
|
||||||
|
|
||||||
|
|
||||||
def save_image_locally(img_file_path, img_content, book_id):
|
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
|
||||||
"""Function saves all images locally"""
|
"""Function saves all images locally"""
|
||||||
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
new_path = pathlib.Path(os.path.join(
|
new_path = pathlib.Path(os.path.join(
|
||||||
@@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id):
|
|||||||
return new_img_path
|
return new_img_path
|
||||||
|
|
||||||
|
|
||||||
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
|
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
|
||||||
"""Function saves all images to Amazon web service"""
|
"""Function saves all images to Amazon web service"""
|
||||||
link_path = access.send_image(
|
link_path = access.send_image(
|
||||||
img_file_path, doc_id=book_id, img_content=img_content)
|
img_file_path, doc_id=book_id, img_content=img_content)
|
||||||
return link_path
|
return link_path
|
||||||
|
|
||||||
|
|
||||||
def update_images_src_links(body_tag: Tag,
|
def update_images_src_links(body_tag: BeautifulSoup,
|
||||||
href2img_content: dict,
|
href2img_content: dict,
|
||||||
path_to_html,
|
path_to_html: str,
|
||||||
access=None,
|
access=None,
|
||||||
path2aws_path=None,
|
path2aws_path: dict = None,
|
||||||
book_id=None):
|
book_id: str = None) -> dict:
|
||||||
"""Function makes dictionary image_src_path -> Amazon web service_path"""
|
"""Function makes dictionary image_src_path -> Amazon web service_path"""
|
||||||
img_tags = body_tag.find_all('img')
|
img_tags = body_tag.find_all('img')
|
||||||
|
|
||||||
@@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup):
|
|||||||
table.attrs['border'] = '1'
|
table.attrs['border'] = '1'
|
||||||
|
|
||||||
|
|
||||||
def process_lists(body_tag):
|
def process_lists(body_tag: BeautifulSoup):
|
||||||
"""
|
"""
|
||||||
Function to process tags <li>.
|
Function
|
||||||
Unwrap <p> tags.
|
- process tags <li>.
|
||||||
"""
|
- unwrap <p> tags.
|
||||||
li_tags = body_tag.find_all("li")
|
Parameters
|
||||||
|
----------
|
||||||
|
body_tag: Tag, soup object
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
li_tags = body_tag.find_all("li")
|
||||||
for li_tag in li_tags:
|
for li_tag in li_tags:
|
||||||
if li_tag.p:
|
if li_tag.p:
|
||||||
li_tag.attrs.update(li_tag.p.attrs)
|
li_tag.attrs.update(li_tag.p.attrs)
|
||||||
@@ -113,7 +122,7 @@ def process_lists(body_tag):
|
|||||||
|
|
||||||
|
|
||||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||||
"""Function inserts span before tag to be removed(aren't supported by livecarta)"""
|
"""Function inserts span before tag aren't supported by livecarta"""
|
||||||
new_tag = main_tag.new_tag("span")
|
new_tag = main_tag.new_tag("span")
|
||||||
new_tag.attrs['id'] = id_ or ''
|
new_tag.attrs['id'] = id_ or ''
|
||||||
new_tag.attrs['class'] = class_ or ''
|
new_tag.attrs['class'] = class_ or ''
|
||||||
@@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
|||||||
tag.insert_before(new_tag)
|
tag.insert_before(new_tag)
|
||||||
|
|
||||||
|
|
||||||
def clean_headings_content(content: Tag, title: str):
|
def clean_headings_content(content: BeautifulSoup, title: str):
|
||||||
def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
|
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
|
||||||
if tag_to_be_removed.attrs.get('id'):
|
if tag_to_be_removed.attrs.get('id'):
|
||||||
insert_span_with_attrs_before_tag(body_tag,
|
insert_span_with_attrs_before_tag(body_tag,
|
||||||
tag_to_be_removed,
|
tag_to_be_removed,
|
||||||
@@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
|
|
||||||
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
|
||||||
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
|
||||||
|
|
||||||
"""
|
"""
|
||||||
footnotes = []
|
footnotes = []
|
||||||
noterefs_tags = source_html_tag.find_all(
|
noterefs_tags = source_html_tag.find_all(
|
||||||
@@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
return footnotes, new_noterefs_tags, new_footnotes_tags
|
return footnotes, new_noterefs_tags, new_footnotes_tags
|
||||||
|
|
||||||
|
|
||||||
def unwrap_structural_tags(body_tag):
|
def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||||
"""Main function that works with structure of html. Make changes inplace.
|
"""
|
||||||
|
Main function that works with structure of html. Make changes inplace.
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
body_tag: Tag, soup object
|
||||||
|
|
||||||
|
Steps
|
||||||
|
----------
|
||||||
1. Extracts tags that are not needed
|
1. Extracts tags that are not needed
|
||||||
|
|
||||||
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
|
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
|
||||||
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
|
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
|
||||||
This tag must have a body_tag as a parent.
|
This tag must have a body_tag as a parent.
|
||||||
Otherwise, it is wrapped with some tags. Like:
|
Otherwise, it is wrapped with some tags. Like:
|
||||||
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
|
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
|
||||||
|
|
||||||
3. Headings that are not supported by livecarta are converted to <p>
|
3. Headings that are not supported by livecarta are converted to <p>
|
||||||
4. Wrapping NavigableString
|
4. Wrapping NavigableString
|
||||||
:param body_tag: Tag, soup object
|
|
||||||
:return: None
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def preserve_class_in_aside_tag(tag_):
|
def preserve_class_in_aside_tag(tag_):
|
||||||
@@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag):
|
|||||||
if not tag_.parent.attrs.get('class'):
|
if not tag_.parent.attrs.get('class'):
|
||||||
tag_.parent.attrs['class'] = tag_class
|
tag_.parent.attrs['class'] = tag_class
|
||||||
|
|
||||||
def preserve_class_in_section_tag(tag_) -> bool:
|
def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
|
||||||
"""
|
"""
|
||||||
to save css style inherited from class, copy class to child <p>
|
Function saves css style inherited from class, copies class to child <p>
|
||||||
returns True, if <section> could be unwrapped
|
returns True, if <section> could be unwrapped
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
tag_: Tag, soup object
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# this is for Wiley books with boxes
|
# this is for Wiley books with boxes
|
||||||
tag_class = tag_.attrs['class'] if not isinstance(
|
tag_class = tag_.attrs['class'] if not isinstance(
|
||||||
@@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag):
|
|||||||
class_=tag_to_be_removed.attrs.get('class'))
|
class_=tag_to_be_removed.attrs.get('class'))
|
||||||
|
|
||||||
def replace_div_tag_with_table():
|
def replace_div_tag_with_table():
|
||||||
"""Function replace <div> with <table>:
|
"""
|
||||||
|
Function replace <div> with <table>:
|
||||||
1. Convert div with certain classes to tables
|
1. Convert div with certain classes to tables
|
||||||
2. Add background color to div with background-color
|
2. Add background color to div with background-color
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for div in body_tag.find_all("div"):
|
for div in body_tag.find_all("div"):
|
||||||
if div.attrs.get('class'):
|
if div.attrs.get('class'):
|
||||||
@@ -431,7 +458,7 @@ def unwrap_structural_tags(body_tag):
|
|||||||
return body_tag
|
return body_tag
|
||||||
|
|
||||||
|
|
||||||
def get_tags_between_chapter_marks(first_id, href, html_soup):
|
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||||
"""After processing on a first_id that corresponds to current chapter,
|
"""After processing on a first_id that corresponds to current chapter,
|
||||||
from initial html_soup all tags from current chapter are extracted
|
from initial html_soup all tags from current chapter are extracted
|
||||||
|
|
||||||
@@ -441,7 +468,7 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
|
|||||||
Id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
Id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
||||||
href:
|
href:
|
||||||
Name of current chapter's file
|
Name of current chapter's file
|
||||||
html_soup :
|
html_soup: Tag, soup object
|
||||||
Soup object of current file
|
Soup object of current file
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def wrap_preformatted_span_with_table(main_tag, old_tag):
|
def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
|
||||||
"""Function wraps <span> with <table>"""
|
"""Function wraps <span> with <table>"""
|
||||||
table = main_tag.new_tag("table")
|
table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
|
||||||
table.attrs['border'] = '1px #ccc;'
|
"tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||||
table.attrs['style'] = 'width:100%;'
|
table.attrs['border'], table.attrs['style'] = '1px #ccc;', 'width:100%;'
|
||||||
tbody = main_tag.new_tag("tbody")
|
|
||||||
tr = main_tag.new_tag("tr")
|
|
||||||
td = main_tag.new_tag("td")
|
|
||||||
td.attrs['bgcolor'] = '#f5f5f5'
|
td.attrs['bgcolor'] = '#f5f5f5'
|
||||||
# td.attrs['border-radius'] = '4px'
|
# td.attrs['border-radius'] = '4px'
|
||||||
old_tag.wrap(td)
|
span_tag.wrap(td)
|
||||||
td.wrap(tr)
|
td.wrap(tr)
|
||||||
tr.wrap(tbody)
|
tr.wrap(tbody)
|
||||||
tbody.wrap(table)
|
tbody.wrap(table)
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
def preprocess_pre_tags(chapter_tag):
|
def preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||||
"""Function preprocessing <pre> tags
|
"""
|
||||||
|
Function preprocessing <pre> tags
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
chapter_tag: BeautifulSoup
|
chapter_tag: Tag, soup object
|
||||||
|
|
||||||
Steps
|
Steps
|
||||||
----------
|
----------
|
||||||
1. cleaning \n
|
1. Process NavigableString
|
||||||
2. heading removal
|
2. Process Tags and their children
|
||||||
3. processing tags
|
|
||||||
4. class removal
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
"""
|
||||||
for pre in chapter_tag.find_all("pre"):
|
for pre in chapter_tag.find_all("pre"):
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
new_tag = BeautifulSoup(features='lxml').new_tag("span")
|
||||||
new_tag.attrs = pre.attrs.copy()
|
new_tag.attrs = pre.attrs.copy()
|
||||||
@@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
|
|||||||
"font-size: 14px; white-space: nowrap;"
|
"font-size: 14px; white-space: nowrap;"
|
||||||
pre.replace_with(new_tag)
|
pre.replace_with(new_tag)
|
||||||
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
|
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
|
||||||
|
# add <p> to save brs
|
||||||
p_for_br = chapter_tag.new_tag("p")
|
p_for_br = chapter_tag.new_tag("p")
|
||||||
p_for_br.string = "\xa0"
|
p_for_br.string = "\xa0"
|
||||||
table.insert_after(p_for_br)
|
table.insert_after(p_for_br)
|
||||||
|
|
||||||
|
|
||||||
def preprocess_code_tags(chapter_tag: Tag):
|
def preprocess_code_tags(chapter_tag: BeautifulSoup):
|
||||||
"""Function that
|
"""
|
||||||
|
Function
|
||||||
- transform <code>, <kbd>, <var> tags into span
|
- transform <code>, <kbd>, <var> tags into span
|
||||||
- add code style to this tags
|
- add code style to this tags
|
||||||
"""
|
Parameters
|
||||||
|
----------
|
||||||
|
chapter_tag: Tag, soup object
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
None
|
||||||
|
|
||||||
|
"""
|
||||||
for code in chapter_tag.find_all(re.compile("code|kbd|var")):
|
for code in chapter_tag.find_all(re.compile("code|kbd|var")):
|
||||||
code.name = "span"
|
code.name = "span"
|
||||||
if code.parent.name == "pre":
|
if code.parent.name == "pre":
|
||||||
@@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag):
|
|||||||
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_title(title_of_chapter: str) -> str:
|
def prepare_title(title_of_chapter: str) -> str:
|
||||||
"""Function finalise processing/cleaning title"""
|
"""Function finalise processing/cleaning title"""
|
||||||
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
||||||
@@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
|
||||||
"""Function finalise processing/cleaning content
|
"""
|
||||||
|
Function finalise processing/cleaning content
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
title_str: str
|
title_str: str
|
||||||
|
|
||||||
content_tag: BeautifulSoup
|
content_tag: Tag, soup object
|
||||||
|
|
||||||
remove_title_from_chapter: bool
|
remove_title_from_chapter: bool
|
||||||
|
|
||||||
Steps
|
Steps
|
||||||
----------
|
----------
|
||||||
1. cleaning \n
|
1. find \n
|
||||||
2. heading removal
|
2. heading removal
|
||||||
3. processing tags
|
3. processing tags
|
||||||
4. class removal
|
4. class removal
|
||||||
@@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
-------
|
-------
|
||||||
content_tag: str
|
content_tag: str
|
||||||
prepared content
|
prepared content
|
||||||
"""
|
|
||||||
|
|
||||||
# 0. cleaning \n
|
"""
|
||||||
|
# 1. find \n
|
||||||
to_remove = []
|
to_remove = []
|
||||||
for child in content_tag.contents:
|
for child in content_tag.contents:
|
||||||
if isinstance(child, NavigableString):
|
if isinstance(child, NavigableString):
|
||||||
@@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
|||||||
if s == '':
|
if s == '':
|
||||||
to_remove.append(child)
|
to_remove.append(child)
|
||||||
|
|
||||||
# 1. heading removal
|
# 2. heading removal
|
||||||
if remove_title_from_chapter:
|
if remove_title_from_chapter:
|
||||||
clean_headings_content(content_tag, title_str)
|
clean_headings_content(content_tag, title_str)
|
||||||
|
|
||||||
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||||
process_lists(content_tag)
|
process_lists(content_tag)
|
||||||
preprocess_table(content_tag)
|
preprocess_table(content_tag)
|
||||||
preprocess_code_tags(content_tag)
|
preprocess_code_tags(content_tag)
|
||||||
preprocess_pre_tags(content_tag)
|
preprocess_pre_tags(content_tag)
|
||||||
preprocess_block_tags(content_tag)
|
preprocess_block_tags(content_tag)
|
||||||
|
|
||||||
# 3. class removal
|
# 4. class removal
|
||||||
for tag in content_tag.find_all(recursive=True):
|
for tag in content_tag.find_all(recursive=True):
|
||||||
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
|
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
|
||||||
'footnote-element']):
|
'footnote-element']):
|
||||||
|
|||||||
Reference in New Issue
Block a user