Function annotations

This commit is contained in:
Kiryl
2022-04-29 17:44:07 +03:00
parent 8de1d0d042
commit 37533e9b67
5 changed files with 187 additions and 130 deletions

View File

@@ -38,7 +38,7 @@ class NavPoint:
def flatten(x):
"""magic function from stackoverflow for list flattening"""
"""Magic function from stackoverflow for list flattening"""
atom = lambda i: not isinstance(i, list)
nil = lambda i: not i
car = lambda i: i[0]

View File

@@ -28,24 +28,27 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_tag_values(value):
"""Function 1. converts values of tags from em/%/pt to px
2. find closest font-size px
def convert_tag_values(value: str) -> str:
"""
Function
- converts values of tags from em/%/pt to px
- find closest font-size px
Parameters
----------
value: str
Returns
-------
converted value: str
"""
value: str
"""
def find_closest_size(value):
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr))
last_possible_size_index = sizes_pr.index(possible_sizes[-1])
return sizes_px[last_possible_size_index]
font_size_regexp = re.compile(r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
font_size_regexp = re.compile(
r'(^-*(\d*\.*\d+)%$)|(^-*(\d*\.*\d+)em$)|(^-*(\d*\.*\d+)pt$)')
has_style_attrs = re.search(font_size_regexp, value)
if has_style_attrs:
if has_style_attrs.group(1):
@@ -61,8 +64,7 @@ def convert_tag_values(value):
return value
"""
"""
Dictionary LIVECARTA_STYLE_ATTRS = { css property: value }
Style properties that can be used to fit livecarta css style convention.
If property has empty list, it means that any value can be converted.
@@ -164,17 +166,20 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
def check_style_to_be_tag(style) -> List[tuple]:
"""Function search style properties that can be converted to tags.
"""
Function searches style properties that can be converted to tags.
It searches for them and prepare list of properties to be removed from style string
Parameters
----------
style: str
<tag style="...">
Returns
-------
properties to remove: list
"""
to_remove: list
properties to remove
"""
to_remove = []
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style:
@@ -203,7 +208,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type):
def build_css_content(css_content):
""" Build css content with livecarta convention """
"""Build css content with livecarta convention"""
sheet = cssutils.parseString(css_content, validate=False)
for css_rule in sheet:
@@ -227,7 +232,7 @@ class TagStyleConverter:
@staticmethod
def remove_white_if_no_bgcolor(style_, tag):
""" Function remove white color if there is no text bg color """
"""Function removes white text color if there is no bg color"""
if 'background' in style_:
return style_
@@ -264,9 +269,11 @@ class TagStyleConverter:
item = item.split(':')
if item[0] in ['text-indent', 'margin-left', 'margin']:
if len(item[1].split(' ')) == 3:
item[1] = convert_tag_values(item[1].split(' ')[-2]) # split returns middle value
item[1] = convert_tag_values(item[1].split(
' ')[-2]) # split returns middle value
else:
item[1] = convert_tag_values(item[1].split(' ')[-1]) # split returns last value
item[1] = convert_tag_values(item[1].split(
' ')[-1]) # split returns last value
clean_style += item[0] + ': ' + item[1] + '; '
margin_left_regexp = re.compile(
@@ -360,7 +367,7 @@ class TagStyleConverter:
s = f'{attr}:{value};'
self.style = self.style.replace(s, '')
self.style = self.style.strip()
if i == 0:
if not i:
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(
attr, value)]
new_tags.append(self.tag_with_inline_style)
@@ -388,7 +395,7 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_p_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in p -> span """
"""Function designed to save style attrs that cannot be in p -> span"""
if tag.name == 'p' and tag.attrs.get('style'):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
@@ -402,7 +409,6 @@ class TagStyleConverter:
if has_p_style_attrs:
p_style += item + ';'
initial_style = initial_style.replace(item + ';', '')
# here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
'-' + attr not in initial_style) for attr in styles_cant_be_in_p]
@@ -410,30 +416,30 @@ class TagStyleConverter:
# if find styles that cannot be in <p> -> wrap them in span
tag.name = 'span'
p_tag = BeautifulSoup(features='lxml').new_tag('p')
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, initial_style)
span_style = initial_style if not has_li_style_attr else initial_style.replace(
has_li_style_attr.group(1), '')
p_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_p_style_attr = re.search(p_attrs_regexp, initial_style)
span_style = initial_style if not has_p_style_attr else initial_style.replace(
has_p_style_attr.group(1), '')
p_tag.attrs['style'] = p_style
tag.attrs['style'] = span_style
tag.wrap(p_tag)
else: tag.attrs['style'] = p_style
else:
tag.attrs['style'] = p_style
@staticmethod
def wrap_span_in_li_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in li -> span """
"""Function designed to save style attrs that cannot be in li -> span"""
if tag.name == 'li' and tag.attrs.get('style'):
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
attr not in ['text-align', 'list-style-type']]
styles_to_be_saved = [attr in tag.attrs.get(
styles_to_be_saved_in_span = [attr in tag.attrs.get(
'style') for attr in styles_cant_be_in_li]
if any(styles_to_be_saved):
if any(styles_to_be_saved_in_span):
tag.name = 'span'
li_tag = BeautifulSoup(features='lxml').new_tag('li')
span_style = tag.attrs['style']
li_style = ''
for possible_li_attrs_regexp in [re.compile(r'(text-align:(\w+);)'),
re.compile(r'(list-style-type:(\w+);)')]:
has_li_style_attrs = re.search(
@@ -442,39 +448,38 @@ class TagStyleConverter:
li_style += has_li_style_attrs.group(1)
span_style = span_style.replace(
has_li_style_attrs.group(1), '')
li_tag.attrs['style'] = li_style
tag.attrs['style'] = span_style
tag.wrap(li_tag)
@staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
    """
    Function designed to save style attrs that cannot be in ul/ol -> span

    The tag itself is renamed to <span> and keeps every style property
    except list-style-type; a fresh list tag of the same kind (<ul> or <ol>)
    is wrapped around it and receives the list-style-type declaration.
    Nothing happens for other tags, for tags without a style attribute, or
    when the style holds no property besides list-style-type.

    Parameters
    ----------
    tag: Tag
        tag with inline style, modified in place

    Returns
    -------
    None
    """
    if tag.name not in ['ul', 'ol'] or not tag.attrs.get('style'):
        return

    span_style = tag.attrs['style']
    styles_cant_be_in_ul_ol = [
        attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
    if not any(attr in span_style for attr in styles_cant_be_in_ul_ol):
        return

    # remember the list kind BEFORE renaming: creating the wrapper from
    # tag.name after the rename would produce <span> instead of <ul>/<ol>
    list_tag_name = tag.name
    tag.name = 'span'
    oul_tag = BeautifulSoup(features='lxml').new_tag(list_tag_name)

    has_uol_style_attrs = re.search(r'(list-style-type:(\w+);)', span_style)
    if has_uol_style_attrs:
        # move the list-style-type declaration from the span to the wrapper;
        # the wrapper gets a style only when there is something to move
        oul_style = has_uol_style_attrs.group(1)
        span_style = span_style.replace(oul_style, '')
        oul_tag.attrs['style'] = oul_style

    tag.attrs['style'] = span_style
    tag.wrap(oul_tag)
@staticmethod
def wrap_span_in_h_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in h -> span """
"""Function designed to save style attrs that cannot be in h -> span"""
h_regexp = re.compile('(^h[1-9]$)')
if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
@@ -482,10 +487,10 @@ class TagStyleConverter:
tag.name = 'span'
tag.wrap(h_tag)
style = tag.attrs['style']
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, style)
tag.attrs['style'] = style if not has_li_style_attr else style.replace(
has_li_style_attr.group(1), '')
h_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_h_style_attr = re.search(h_attrs_regexp, style)
tag.attrs['style'] = style if not has_h_style_attr else style.replace(
has_h_style_attr.group(1), '')
def convert_initial_tag(self):
self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
@@ -496,8 +501,8 @@ class TagStyleConverter:
return self.tag_with_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
""" Function adds styles from .css to inline style """
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str) -> BeautifulSoup:
"""Function adds styles from .css to inline style"""
css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = []

View File

@@ -1,7 +1,6 @@
import re
import json
import codecs
import logging
import os
from os.path import dirname, normpath, join
from itertools import chain
@@ -51,7 +50,8 @@ class EpubConverter:
# flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
# file path from <a> to generated aws path
self.book_image_src_path2aws_path = {}
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
@@ -116,7 +116,6 @@ class EpubConverter:
return nodes
def get_css_content(self, css_href, html_href):
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(
@@ -132,8 +131,8 @@ class EpubConverter:
The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value
"""
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
@@ -165,6 +164,7 @@ class EpubConverter:
"""
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
@@ -191,8 +191,8 @@ class EpubConverter:
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
:param lvl: level of depth
"""
"""
if isinstance(element, Link):
nav_point = NavPoint(element)
if nav_point.id:
@@ -215,7 +215,8 @@ class EpubConverter:
sub_nodes = []
for elem in second:
if ('section' in first.title.lower() or 'part' in first.title.lower()) and lvl == 1:
self.offset_sub_nodes.append(self.build_adjacency_list_from_toc(elem, lvl))
self.offset_sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl))
else:
sub_nodes.append(
self.build_adjacency_list_from_toc(elem, lvl + 1))
@@ -239,8 +240,8 @@ class EpubConverter:
else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self):
""" Function checks is toc empty """
def is_toc_empty(self) -> bool:
"""Function checks whether toc is empty"""
# there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True
@@ -258,7 +259,7 @@ class EpubConverter:
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
""" Function add files that not added to adjacency list """
"""Function adds files that were not added to adjacency list"""
for i, file in enumerate(not_added):
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
@@ -295,19 +296,26 @@ class EpubConverter:
new_anchor_span.string = "\xa0"
return new_anchor_span
def match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
def match_href_to_path_from_toc(self, cur_file_path: str, href_in_link: str, internal_link_tag: Tag) -> str:
"""
Function used to find full path to file that is parsed from tag link
TOC: a/b/c.xhtml
b/c.xhtml -> a/b/c.xhtml
c.xhtml -> a/b/c.xhtml
Parameters
----------
cur_file_path: str
path to current file with tag link
href_in_link: str
filename got from tag link, like file1.xhtml
internal_link_tag: Tag
tag object that is parsed now
Used to find full path to file that is parsed from tag link
Returns
-------
full_path[0]: str
full path to the file that the tag link points to
:param cur_file_path: path to current file with tag link
:param href_in_link: filename got from tag link, like file1.xhtml
:param internal_link_tag: tag object that is parsed now
:return:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(
@@ -331,6 +339,12 @@ class EpubConverter:
Function
- processing internal links in a book
- make ids unique
Steps
----------
1. rebuild ids to be unique in all documents
2a. process anchor which is a whole xhtml file
2b. process anchor which is an element in xhtml file
"""
# 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc:
@@ -344,7 +358,7 @@ class EpubConverter:
new_id = self.create_unique_id(toc_href, tag.attrs['id'])
tag.attrs['id'] = new_id
# 2.a) process anchor which is a whole xhtml file
# 2a. process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
for toc_href in self.hrefs_added_to_toc:
@@ -367,7 +381,7 @@ class EpubConverter:
del internal_link_tag.attrs['href']
# 2.b) process anchor which is an element in xhtml file
# 2b. process anchor which is an element in xhtml file
internal_link_reg2 = re.compile(r'(^.+\.(htm|html|xhtml)\#.+)|(^\#.+)')
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
@@ -418,9 +432,9 @@ class EpubConverter:
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}')
def build_one_chapter(self, nav_point):
def build_one_chapter(self, nav_point: NavPoint):
"""
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
Function updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
3 cases:
id wraps all chapter content,
@@ -429,7 +443,13 @@ class EpubConverter:
In all cases we know where chapter starts. Therefore chapter is all tags between chapter's id
and id of the next chapter/subchapter
Parameters
----------
nav_point: NavPoint
Returns
-------
None
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
@@ -446,7 +466,7 @@ class EpubConverter:
self.build_one_chapter(sub_node)
def define_chapters_content(self):
""" Function build chapters content starts from top level chapters """
"""Function build chapters content, starts from top level chapters"""
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
@@ -483,8 +503,8 @@ class EpubConverter:
self.logger.log(f'{indent}Chapter: {title} is prepared.')
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self):
""" Function which convert list of html nodes to appropriate json structure. """
def convert_to_dict(self) -> dict:
"""Function which convert list of html nodes to appropriate json structure"""
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -502,7 +522,7 @@ class EpubConverter:
if __name__ == "__main__":
filename = '9781641051217'
filename = '9781614382264'
logger_object = BookLogger(name='epub', book_id=filename)
json_converter = EpubConverter(f'../../epub/{filename}.epub',

View File

@@ -2,7 +2,7 @@ from src.book_solver import BookSolver
from src.epub_converter.epub_converter import EpubConverter
class EpubBook(BookSolver):
""" Class of .epub type book - child of BookSolver """
"""Class of .epub type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger)

View File

@@ -9,7 +9,7 @@ from src.access import Access
from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
def save_image_locally(img_file_path: str, img_content: bytes, book_id: str):
"""Function saves all images locally"""
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
@@ -24,19 +24,19 @@ def save_image_locally(img_file_path, img_content, book_id):
return new_img_path
def save_image_to_aws(access: Access, img_file_path: str, img_content: bytes, book_id: str):
    """
    Function saves image to Amazon web service

    Parameters
    ----------
    access: Access
        object whose send_image method performs the upload
    img_file_path: str
        passed to send_image as the first positional argument
    img_content: bytes
        passed to send_image as img_content
    book_id: str
        passed to send_image as doc_id

    Returns
    -------
    whatever access.send_image returns (used by callers as the link path)
    """
    return access.send_image(
        img_file_path,
        doc_id=book_id,
        img_content=img_content,
    )
def update_images_src_links(body_tag: Tag,
def update_images_src_links(body_tag: BeautifulSoup,
href2img_content: dict,
path_to_html,
path_to_html: str,
access=None,
path2aws_path=None,
book_id=None):
path2aws_path: dict = None,
book_id: str = None) -> dict:
"""Function makes dictionary image_src_path -> Amazon web service_path"""
img_tags = body_tag.find_all('img')
@@ -99,13 +99,22 @@ def preprocess_table(body_tag: BeautifulSoup):
table.attrs['border'] = '1'
def process_lists(body_tag):
def process_lists(body_tag: BeautifulSoup):
"""
Function to process tags <li>.
Unwrap <p> tags.
"""
li_tags = body_tag.find_all("li")
Function
- process tags <li>.
- unwrap <p> tags.
Parameters
----------
body_tag: Tag, soup object
Returns
-------
None
"""
li_tags = body_tag.find_all("li")
for li_tag in li_tags:
if li_tag.p:
li_tag.attrs.update(li_tag.p.attrs)
@@ -113,7 +122,7 @@ def process_lists(body_tag):
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
"""Function inserts span before tag to be removed(aren't supported by livecarta)"""
"""Function inserts span before a tag that isn't supported by livecarta"""
new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or ''
new_tag.attrs['class'] = class_ or ''
@@ -121,8 +130,8 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
tag.insert_before(new_tag)
def clean_headings_content(content: Tag, title: str):
def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
def clean_headings_content(content: BeautifulSoup, title: str):
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
@@ -194,6 +203,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(
@@ -258,21 +268,28 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag):
"""Main function that works with structure of html. Make changes inplace.
def unwrap_structural_tags(body_tag: BeautifulSoup):
"""
Main function that works with structure of html. Make changes inplace.
Parameters
----------
body_tag: Tag, soup object
Steps
----------
1. Extracts tags that are not needed
2. Checks that marks for pointing a start of a chapter are placed on one level in html tree.
Mark is tag with 'class': 'converter-chapter-mark'. Added while TOC was parsed.
This tag must have a body_tag as a parent.
Otherwise, it is wrapped with some tags. Like:
<p> <span id='123', class='converter-chapter-mark'> </span> </p>
3. Headings that are not supported by livecarta converts to <p>
4. Wrapping NavigableString
:param body_tag: Tag, soup object
:return: None
Returns
-------
None
"""
def preserve_class_in_aside_tag(tag_):
@@ -284,10 +301,18 @@ def unwrap_structural_tags(body_tag):
if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class
def preserve_class_in_section_tag(tag_) -> bool:
def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
"""
to save css style inherited from class, copy class to child <p>
Function saves css style inherited from class, copies class to child <p>
returns True, if <section> could be unwrapped
Parameters
----------
tag_: Tag, soup object
Returns
-------
bool: True, if <section> could be unwrapped
"""
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(
@@ -314,9 +339,11 @@ def unwrap_structural_tags(body_tag):
class_=tag_to_be_removed.attrs.get('class'))
def replace_div_tag_with_table():
"""Function replace <div> with <table>:
"""
Function replace <div> with <table>:
1. Convert div with certain classes to tables
2. Add background color to div with background-color
"""
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
@@ -431,22 +458,22 @@ def unwrap_structural_tags(body_tag):
return body_tag
def get_tags_between_chapter_marks(first_id, href, html_soup):
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""After processing on a first_id that corresponds to current chapter,
from initial html_soup all tags from current chapter are extracted
Parameters
----------
first_id :
first_id:
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
href :
href:
Name of current chapter's file
html_soup :
html_soup: Tag, soup object
Soup object of current file
Returns
-------
tags : list [Tag, NavigableString]
tags: list [Tag, NavigableString]
Chapter's tags
"""
@@ -536,37 +563,33 @@ def prepare_formatted(text: str) -> str:
return text
def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
    """
    Function wraps <span> with <table>

    Parameters
    ----------
    chapter_tag: soup object
        used only as a factory: new_tag is called on it to create the
        table, tbody, tr and td tags
    span_tag: Tag
        tag that ends up inside the single table cell

    Returns
    -------
    table: Tag
        outermost <table> tag that now contains span_tag
    """
    cell = chapter_tag.new_tag("td")
    cell.attrs['bgcolor'] = '#f5f5f5'
    span_tag.wrap(cell)

    # build the remaining levels from the inside out: td -> tr -> tbody -> table
    outermost = cell
    for level_name in ("tr", "tbody", "table"):
        parent = chapter_tag.new_tag(level_name)
        outermost.wrap(parent)
        outermost = parent

    outermost.attrs['border'] = '1px #ccc;'
    outermost.attrs['style'] = 'width:100%;'
    return outermost
def preprocess_pre_tags(chapter_tag):
"""Function preprocessing <pre> tags
def preprocess_pre_tags(chapter_tag: BeautifulSoup):
"""
Function preprocessing <pre> tags
Parameters
----------
chapter_tag: BeautifulSoup
chapter_tag: Tag, soup object
Steps
----------
1. cleaning \n
2. heading removal
3. processing tags
4. class removal
"""
1. Process NavigableString
2. Process Tags and their children
"""
for pre in chapter_tag.find_all("pre"):
new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy()
@@ -599,17 +622,26 @@ def preprocess_pre_tags(chapter_tag):
"font-size: 14px; white-space: nowrap;"
pre.replace_with(new_tag)
table = wrap_preformatted_span_with_table(chapter_tag, new_tag)
# add <p> to save brs
p_for_br = chapter_tag.new_tag("p")
p_for_br.string = "\xa0"
table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag: Tag):
"""Function that
- transform <code>, <kdb>, <var> tags into span
- add code style to this tags
def preprocess_code_tags(chapter_tag: BeautifulSoup):
"""
Function
- transform <code>, <kbd>, <var> tags into span
- add code style to this tags
Parameters
----------
chapter_tag: Tag, soup object
Returns
-------
None
"""
for code in chapter_tag.find_all(re.compile("code|kbd|var")):
code.name = "span"
if code.parent.name == "pre":
@@ -620,7 +652,6 @@ def preprocess_code_tags(chapter_tag: Tag):
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
def prepare_title(title_of_chapter: str) -> str:
"""Function finalise processing/cleaning title"""
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
@@ -631,18 +662,19 @@ def prepare_title(title_of_chapter: str) -> str:
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
"""Function finalise processing/cleaning content
"""
Function finalise processing/cleaning content
Parameters
----------
title_str: str
content_tag: BeautifulSoup
content_tag: Tag, soup object
remove_title_from_chapter: bool
Steps
----------
1. cleaning \n
1. find \n
2. heading removal
3. processing tags
4. class removal
@@ -651,9 +683,9 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
-------
content_tag: str
prepared content
"""
# 0. cleaning \n
"""
# 1. find \n
to_remove = []
for child in content_tag.contents:
if isinstance(child, NavigableString):
@@ -661,18 +693,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
if s == '':
to_remove.append(child)
# 1. heading removal
# 2. heading removal
if remove_title_from_chapter:
clean_headings_content(content_tag, title_str)
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
process_lists(content_tag)
preprocess_table(content_tag)
preprocess_code_tags(content_tag)
preprocess_pre_tags(content_tag)
preprocess_block_tags(content_tag)
# 3. class removal
# 4. class removal
for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']):