forked from LiveCarta/BookConverter
Formatting
This commit is contained in:
@@ -14,21 +14,23 @@ from src.livecarta_config import LiveCartaConfig
|
||||
cssutils.log.setLevel(CRITICAL)
|
||||
|
||||
|
||||
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
|
||||
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
|
||||
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
|
||||
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
|
||||
1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
|
||||
1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
|
||||
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
|
||||
|
||||
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px',
|
||||
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
|
||||
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
|
||||
'48px', '49px', '50px', '64px', '72px']
|
||||
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px',
|
||||
'17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px',
|
||||
'26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
|
||||
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px',
|
||||
'44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
|
||||
|
||||
list_types = ['circle', 'disc', 'armenian', 'decimal',
|
||||
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
|
||||
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
|
||||
|
||||
|
||||
def convert_tag_values(value: str) -> str:
|
||||
def convert_tag_style_values(value: str) -> str:
|
||||
"""
|
||||
Function
|
||||
- converts values of tags from em/%/pt to px
|
||||
@@ -42,8 +44,8 @@ def convert_tag_values(value: str) -> str:
|
||||
value: str
|
||||
|
||||
"""
|
||||
def find_closest_size(value):
|
||||
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
|
||||
def find_closest_size(size_value):
|
||||
possible_sizes = list(takewhile(lambda x: size_value > x, sizes_pr))
|
||||
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
|
||||
return sizes_px[last_possible_size_index]
|
||||
|
||||
@@ -122,12 +124,13 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou
|
||||
to suit livecarta style convention.
|
||||
"""
|
||||
LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
'text-indent': convert_tag_values,
|
||||
'text-indent': convert_tag_style_values,
|
||||
'font-variant': lambda x: x,
|
||||
'text-align': lambda x: x,
|
||||
'font': lambda x: '',
|
||||
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||
'font-size': convert_tag_values,
|
||||
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or
|
||||
LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
|
||||
'font-size': convert_tag_style_values,
|
||||
'color': get_text_color,
|
||||
'background-color': get_bg_color,
|
||||
'background': get_bg_color,
|
||||
@@ -140,9 +143,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
||||
'border-bottom': lambda x: x if x != '0' else '',
|
||||
'list-style-type': lambda x: x if x in list_types else 'disc',
|
||||
'list-style-image': lambda x: 'disc',
|
||||
'margin-left': convert_tag_values,
|
||||
'margin-top': convert_tag_values,
|
||||
'margin': convert_tag_values,
|
||||
'margin-left': convert_tag_style_values,
|
||||
'margin-top': convert_tag_style_values,
|
||||
'margin': convert_tag_style_values,
|
||||
}
|
||||
|
||||
"""
|
||||
@@ -269,10 +272,10 @@ class TagStyleConverter:
|
||||
item = item.split(':')
|
||||
if item[0] in ['text-indent', 'margin-left', 'margin']:
|
||||
if len(item[1].split(' ')) == 3:
|
||||
item[1] = convert_tag_values(item[1].split(
|
||||
item[1] = convert_tag_style_values(item[1].split(
|
||||
' ')[-2]) # split returns middle value
|
||||
else:
|
||||
item[1] = convert_tag_values(item[1].split(
|
||||
item[1] = convert_tag_style_values(item[1].split(
|
||||
' ')[-1]) # split returns last value
|
||||
clean_style += item[0] + ': ' + item[1] + '; '
|
||||
|
||||
@@ -343,7 +346,8 @@ class TagStyleConverter:
|
||||
|
||||
split_inline_style: dict = remove_extra_spaces(inline_style)
|
||||
|
||||
# repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css
|
||||
# repetition check - if the tag had already had inline style
|
||||
# that isn't in the css styles, add this to style parsed from css
|
||||
repeat_styles = list(set(split_ultimate_style.keys())
|
||||
& set(split_inline_style.keys()))
|
||||
|
||||
@@ -409,7 +413,8 @@ class TagStyleConverter:
|
||||
if has_p_style_attrs:
|
||||
p_style += item + ';'
|
||||
initial_style = initial_style.replace(item + ';', '')
|
||||
# here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
|
||||
# here check that this style is exactly the same.
|
||||
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
|
||||
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
|
||||
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
|
||||
if any(styles_to_be_saved_in_span):
|
||||
@@ -549,4 +554,4 @@ if __name__ == '__main__':
|
||||
'pr01s05.xhtml').get_body_content().decode()
|
||||
html_soup = BeautifulSoup(html_, features='lxml')
|
||||
|
||||
print(convert_html_soup_with_css_style(html_soup, css_cleaned))
|
||||
print(convert_html_soup_with_css_style(html_soup, css_cleaned))
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from src.book_solver import BookSolver
|
||||
from src.epub_converter.epub_converter import EpubConverter
|
||||
|
||||
|
||||
class EpubBook(BookSolver):
|
||||
"""Class of .epub type book - child of BookSolver"""
|
||||
|
||||
@@ -10,10 +11,19 @@ class EpubBook(BookSolver):
|
||||
|
||||
def get_converted_book(self):
|
||||
"""
|
||||
1. Convert epub to html
|
||||
2. Parse from line structure to nested structure
|
||||
Function
|
||||
Steps
|
||||
----------
|
||||
1. Converts .epub to .html
|
||||
2. Parses from line structure to nested structure
|
||||
|
||||
Returns
|
||||
----------
|
||||
content_dict
|
||||
json for LiveCarta platform
|
||||
|
||||
"""
|
||||
json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object)
|
||||
json_converter = EpubConverter(
|
||||
self.file_path, access=self.access, logger=self.logger_object)
|
||||
content_dict = json_converter.convert_to_dict()
|
||||
self.status_wrapper.set_generating()
|
||||
return content_dict
|
||||
return content_dict
|
||||
|
||||
@@ -71,7 +71,7 @@ def update_images_src_links(body_tag: BeautifulSoup,
|
||||
return path2aws_path
|
||||
|
||||
|
||||
def preprocess_table(body_tag: BeautifulSoup):
|
||||
def _preprocess_table(body_tag: BeautifulSoup):
|
||||
"""Function to preprocess tables and tags(td|th|tr): style"""
|
||||
tables = body_tag.find_all("table")
|
||||
for table in tables:
|
||||
@@ -99,7 +99,7 @@ def preprocess_table(body_tag: BeautifulSoup):
|
||||
table.attrs['border'] = '1'
|
||||
|
||||
|
||||
def process_lists(body_tag: BeautifulSoup):
|
||||
def _process_lists(body_tag: BeautifulSoup):
|
||||
"""
|
||||
Function
|
||||
- process tags <li>.
|
||||
@@ -121,7 +121,7 @@ def process_lists(body_tag: BeautifulSoup):
|
||||
li_tag.p.unwrap()
|
||||
|
||||
|
||||
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
"""Function inserts span before tag aren't supported by livecarta"""
|
||||
new_tag = main_tag.new_tag("span")
|
||||
new_tag.attrs['id'] = id_ or ''
|
||||
@@ -130,21 +130,21 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
|
||||
tag.insert_before(new_tag)
|
||||
|
||||
|
||||
def clean_headings_content(content: BeautifulSoup, title: str):
|
||||
def _clean_headings_content(content: BeautifulSoup, title: str):
|
||||
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
|
||||
if tag_to_be_removed.attrs.get('id'):
|
||||
insert_span_with_attrs_before_tag(body_tag,
|
||||
tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs.get(
|
||||
'id'),
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
_insert_span_with_attrs_before_tag(body_tag,
|
||||
tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs.get(
|
||||
'id'),
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
|
||||
for sub_tag in tag_to_be_removed.find_all():
|
||||
if sub_tag.attrs.get('id'):
|
||||
insert_span_with_attrs_before_tag(body_tag,
|
||||
tag_to_be_removed,
|
||||
id_=sub_tag.attrs['id'],
|
||||
class_=sub_tag.attrs.get('class'))
|
||||
_insert_span_with_attrs_before_tag(body_tag,
|
||||
tag_to_be_removed,
|
||||
id_=sub_tag.attrs['id'],
|
||||
class_=sub_tag.attrs.get('class'))
|
||||
|
||||
title = title.lower()
|
||||
for child in content.contents:
|
||||
@@ -165,7 +165,7 @@ def clean_headings_content(content: BeautifulSoup, title: str):
|
||||
break
|
||||
|
||||
|
||||
def heading_tag_to_p_tag(body_tag):
|
||||
def _heading_tag_to_p_tag(body_tag):
|
||||
"""Function to convert all lower level headings to p tags"""
|
||||
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||
header_tags = body_tag.find_all(re.compile(pattern))
|
||||
@@ -173,7 +173,7 @@ def heading_tag_to_p_tag(body_tag):
|
||||
tag.name = 'p'
|
||||
|
||||
|
||||
def clean_title_from_numbering(title: str):
|
||||
def _clean_title_from_numbering(title: str):
|
||||
"""Function removes numbering from titles"""
|
||||
title = re.sub(r'^(\s+)+', '', title)
|
||||
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
|
||||
@@ -182,7 +182,7 @@ def clean_title_from_numbering(title: str):
|
||||
return title
|
||||
|
||||
|
||||
def replace_with_livecarta_anchor_tag(anchor, i):
|
||||
def _replace_with_livecarta_anchor_tag(anchor, i):
|
||||
"""Function replace noteref_tag(anchor) with new livecarta tag"""
|
||||
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
|
||||
new_tag['class'] = 'footnote-element'
|
||||
@@ -257,7 +257,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
||||
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
|
||||
footnote_tag = footnote_tag.parent
|
||||
new_noterefs_tags.append(
|
||||
replace_with_livecarta_anchor_tag(noteref_tag, i))
|
||||
_replace_with_livecarta_anchor_tag(noteref_tag, i))
|
||||
content = footnote_tag.text
|
||||
# footnote_tag.decompose()
|
||||
footnotes.append(content)
|
||||
@@ -292,7 +292,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
|
||||
"""
|
||||
|
||||
def preserve_class_in_aside_tag(tag_):
|
||||
def _preserve_class_in_aside_tag(tag_):
|
||||
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
||||
# this is for Wiley books with boxes
|
||||
tag_class = tag_.attrs['class'] if not isinstance(
|
||||
@@ -301,7 +301,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
if not tag_.parent.attrs.get('class'):
|
||||
tag_.parent.attrs['class'] = tag_class
|
||||
|
||||
def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
|
||||
def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
|
||||
"""
|
||||
Function saves css style inherited from class, copies class to child <p>
|
||||
returns True, if <section> could be unwrapped
|
||||
@@ -332,13 +332,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
else:
|
||||
return True
|
||||
|
||||
def add_span_to_save_ids_for_links(tag_to_be_removed):
|
||||
def _add_span_to_save_ids_for_links(tag_to_be_removed):
|
||||
if tag_to_be_removed.attrs.get('id'):
|
||||
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs['id'],
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
_insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
|
||||
id_=tag_to_be_removed.attrs['id'],
|
||||
class_=tag_to_be_removed.attrs.get('class'))
|
||||
|
||||
def replace_div_tag_with_table():
|
||||
def _replace_div_tag_with_table():
|
||||
"""
|
||||
Function replaces <div> with <table>:
|
||||
1. Convert div with certain classes to tables
|
||||
@@ -350,11 +350,11 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
div_class = div.attrs['class'] if not isinstance(
|
||||
div.attrs['class'], list) else div.attrs['class'][0]
|
||||
if div_class in ['C409', 'C409a']:
|
||||
wrap_block_tag_with_table(
|
||||
_wrap_block_tag_with_table(
|
||||
body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
|
||||
|
||||
elif div_class in ['C441', 'C816']:
|
||||
wrap_block_tag_with_table(
|
||||
_wrap_block_tag_with_table(
|
||||
body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
|
||||
|
||||
if div.attrs.get('style'):
|
||||
@@ -363,7 +363,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
'background-color') + len('background-color')
|
||||
start_index_of_color = end_index + 2
|
||||
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
|
||||
wrap_block_tag_with_table(
|
||||
_wrap_block_tag_with_table(
|
||||
body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
|
||||
elif div.attrs.get('style') == '':
|
||||
del div.attrs['style']
|
||||
@@ -379,7 +379,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
if all(is_not_struct_tag):
|
||||
div.name = 'p'
|
||||
continue
|
||||
add_span_to_save_ids_for_links(div)
|
||||
_add_span_to_save_ids_for_links(div)
|
||||
div.unwrap()
|
||||
|
||||
# comments removal
|
||||
@@ -387,18 +387,18 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
for element in tag(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
|
||||
replace_div_tag_with_table()
|
||||
_replace_div_tag_with_table()
|
||||
|
||||
for s in body_tag.find_all("section"):
|
||||
could_be_unwrapped = True
|
||||
if s.attrs.get('class'):
|
||||
could_be_unwrapped = preserve_class_in_section_tag(s)
|
||||
add_span_to_save_ids_for_links(s)
|
||||
could_be_unwrapped = _preserve_class_in_section_tag(s)
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
if could_be_unwrapped:
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("article"):
|
||||
add_span_to_save_ids_for_links(s)
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("figure"):
|
||||
@@ -407,22 +407,22 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
s.attrs['style'] = "text-align: center;"
|
||||
|
||||
for s in body_tag.find_all("figcaption"):
|
||||
add_span_to_save_ids_for_links(s)
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("aside"):
|
||||
s.name = 'blockquote'
|
||||
|
||||
for s in body_tag.find_all("main"):
|
||||
add_span_to_save_ids_for_links(s)
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("body"):
|
||||
add_span_to_save_ids_for_links(s)
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("html"):
|
||||
add_span_to_save_ids_for_links(s)
|
||||
_add_span_to_save_ids_for_links(s)
|
||||
s.unwrap()
|
||||
|
||||
for s in body_tag.find_all("header"):
|
||||
@@ -442,7 +442,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
||||
assert all(
|
||||
parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
|
||||
|
||||
heading_tag_to_p_tag(body_tag)
|
||||
_heading_tag_to_p_tag(body_tag)
|
||||
|
||||
# wrap NavigableString with <p>
|
||||
for node in body_tag:
|
||||
@@ -500,7 +500,7 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
|
||||
return tags
|
||||
|
||||
|
||||
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
|
||||
def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
|
||||
"""Function wraps <block> with <table>"""
|
||||
table = main_tag.new_tag("table")
|
||||
table.attrs['border'] = border
|
||||
@@ -520,7 +520,7 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co
|
||||
return table
|
||||
|
||||
|
||||
def clean_wiley_block(block):
|
||||
def _clean_wiley_block(block):
|
||||
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
|
||||
for hr in hrs:
|
||||
hr.extract()
|
||||
@@ -530,30 +530,30 @@ def clean_wiley_block(block):
|
||||
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
|
||||
|
||||
|
||||
def preprocess_block_tags(chapter_tag):
|
||||
def _preprocess_block_tags(chapter_tag):
|
||||
"""Function preprocessing <block> tags"""
|
||||
for block in chapter_tag.find_all("blockquote"):
|
||||
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
||||
clean_wiley_block(block)
|
||||
_clean_wiley_block(block)
|
||||
|
||||
color = '#DDDDDD' if block.attrs.get(
|
||||
'class') == 'feature1' else None
|
||||
color = '#EEEEEE' if block.attrs.get(
|
||||
'class') == 'feature2' else color
|
||||
wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
|
||||
_wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
|
||||
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
|
||||
block.unwrap()
|
||||
|
||||
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
|
||||
clean_wiley_block(future_block)
|
||||
_clean_wiley_block(future_block)
|
||||
color = '#DDDDDD' if future_block.attrs.get(
|
||||
'class') == 'feature1' else None
|
||||
color = '#EEEEEE' if future_block.attrs.get(
|
||||
'class') == 'feature2' else color
|
||||
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
|
||||
_wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
|
||||
|
||||
|
||||
def prepare_formatted(text: str) -> str:
|
||||
def _prepare_formatted(text: str) -> str:
|
||||
"""Function replaces special symbols with their Unicode representation"""
|
||||
text = text.replace("<", "\x3C")
|
||||
text = text.replace(">", "\x3E")
|
||||
@@ -563,7 +563,7 @@ def prepare_formatted(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
|
||||
def _wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
|
||||
"""Function wraps <span> with <table>"""
|
||||
table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
|
||||
"tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
|
||||
@@ -577,7 +577,7 @@ def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
|
||||
return table
|
||||
|
||||
|
||||
def preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||
def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function preprocessing <pre> tags
|
||||
Parameters
|
||||
@@ -601,7 +601,7 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||
for child in copy_contents:
|
||||
# Navigable String
|
||||
if isinstance(child, NavigableString):
|
||||
cleaned_text = prepare_formatted(str(child))
|
||||
cleaned_text = _prepare_formatted(str(child))
|
||||
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
|
||||
for string in sub_strings[:-1]:
|
||||
new_tag.append(NavigableString(string))
|
||||
@@ -612,24 +612,24 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
|
||||
else:
|
||||
for sub_child in child.children:
|
||||
if isinstance(sub_child, NavigableString):
|
||||
cleaned_text = prepare_formatted(str(sub_child))
|
||||
cleaned_text = _prepare_formatted(str(sub_child))
|
||||
sub_child.replace_with(NavigableString(cleaned_text))
|
||||
else:
|
||||
sub_child.string = prepare_formatted(sub_child.text)
|
||||
sub_child.string = _prepare_formatted(sub_child.text)
|
||||
cleaned_tag = child.extract()
|
||||
new_tag.append(cleaned_tag)
|
||||
if to_add_br:
|
||||
new_tag.append(BeautifulSoup(
|
||||
features='lxml').new_tag('br'))
|
||||
pre.replace_with(new_tag)
|
||||
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
|
||||
table = _wrap_preformatted_span_with_table(chapter_tag, new_tag)
|
||||
# add <p> to save brs
|
||||
p_for_br = chapter_tag.new_tag("p")
|
||||
p_for_br.string = "\xa0"
|
||||
table.insert_after(p_for_br)
|
||||
|
||||
|
||||
def preprocess_code_tags(chapter_tag: BeautifulSoup):
|
||||
def _preprocess_code_tags(chapter_tag: BeautifulSoup):
|
||||
"""
|
||||
Function
|
||||
- transform <code>, <kbd>, <var> tags into span
|
||||
@@ -658,7 +658,7 @@ def prepare_title(title_of_chapter: str) -> str:
|
||||
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
|
||||
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
|
||||
title_str = re.sub(r' +', ' ', title_str).rstrip()
|
||||
title_str = clean_title_from_numbering(title_str)
|
||||
title_str = _clean_title_from_numbering(title_str)
|
||||
return title_str
|
||||
|
||||
|
||||
@@ -696,18 +696,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
|
||||
|
||||
# 2. heading removal
|
||||
if remove_title_from_chapter:
|
||||
clean_headings_content(content_tag, title_str)
|
||||
_clean_headings_content(content_tag, title_str)
|
||||
|
||||
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
|
||||
process_lists(content_tag)
|
||||
preprocess_table(content_tag)
|
||||
preprocess_code_tags(content_tag)
|
||||
preprocess_pre_tags(content_tag)
|
||||
preprocess_block_tags(content_tag)
|
||||
_process_lists(content_tag)
|
||||
_preprocess_table(content_tag)
|
||||
_preprocess_code_tags(content_tag)
|
||||
_preprocess_pre_tags(content_tag)
|
||||
_preprocess_block_tags(content_tag)
|
||||
|
||||
# 4. class removal
|
||||
for tag in content_tag.find_all(recursive=True):
|
||||
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
|
||||
'footnote-element']):
|
||||
del tag.attrs['class']
|
||||
return str(content_tag)
|
||||
return str(content_tag)
|
||||
|
||||
Reference in New Issue
Block a user