Formatting

This commit is contained in:
Kiryl
2022-06-01 16:23:53 +03:00
parent 5039417a0f
commit c0ef0b6d6e
13 changed files with 318 additions and 185 deletions

View File

@@ -201,4 +201,4 @@ class Access:
pass pass
else: else:
raise Exception( raise Exception(
f'{response.status_code} Bad request: {response.json()["message"]}.') f'{response.status_code} Bad request: {response.json()["message"]}.')

View File

@@ -29,12 +29,13 @@ class BookSolver:
self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}', self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
book_id=book_id, book_id=book_id,
main_logger=main_logger) main_logger=main_logger)
self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id) self.status_wrapper = BookStatusWrapper(
access, self.logger_object, book_id)
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \ assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels." "Length of headers doesn't match allowed levels."
def save_book_file(self, content): def save_book_file(self, content: str):
""" """
Function saves binary content of file to .docx/.epub Function saves binary content of file to .docx/.epub
Parameters Parameters
@@ -43,17 +44,21 @@ class BookSolver:
binary content of the file binary content of the file
""" """
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(
folder_path = os.path.join(folder_path, f'{self.book_type}/{self.book_id}') os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(
folder_path, f'{self.book_type}/{self.book_id}')
pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}') file_path = os.path.join(
folder_path, f'{self.book_id}.{self.book_type}')
try: try:
with open(file_path, 'wb+') as file: with open(file_path, 'wb+') as file:
file.write(content) file.write(content)
self.logger_object.log(f'File was saved to folder: {folder_path}.') self.logger_object.log(f'File was saved to folder: {folder_path}.')
except Exception as exc: except Exception as exc:
self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR) self.logger_object.log(
f"Error in writing {self.book_type} file.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
raise exc raise exc
@@ -62,12 +67,14 @@ class BookSolver:
def get_book_file(self): def get_book_file(self):
"""Method for getting and saving book from server""" """Method for getting and saving book from server"""
try: try:
self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') self.logger_object.log(f'Start receiving file from server. URL:'
f' {self.access.url}/doc-convert/{self.book_id}/file')
content = self.access.get_book(self.book_id) content = self.access.get_book(self.book_id)
self.logger_object.log('File was received from server.') self.logger_object.log('File was received from server.')
self.save_book_file(content) self.save_book_file(content)
except FileNotFoundError as f_err: except FileNotFoundError as f_err:
self.logger_object.log("Can't get file from server.", logging.ERROR) self.logger_object.log(
"Can't get file from server.", logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
raise f_err raise f_err
except Exception as exc: except Exception as exc:
@@ -75,14 +82,17 @@ class BookSolver:
def check_output_directory(self): def check_output_directory(self):
if self.output_path is None: if self.output_path is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(
output_path = os.path.join(folder_path, f'json/{self.book_id}.json') os.path.dirname(os.path.abspath(__file__)))
output_path = os.path.join(
folder_path, f'json/{self.book_id}.json')
self.output_path = output_path self.output_path = output_path
self.output_path = pathlib.Path(self.output_path) self.output_path = pathlib.Path(self.output_path)
self.logger_object.log(f'Output file path: {self.output_path}') self.logger_object.log(f'Output file path: {self.output_path}')
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True) pathlib.Path(self.output_path).parent.mkdir(
parents=True, exist_ok=True)
self.output_path.touch(exist_ok=True) self.output_path.touch(exist_ok=True)
def write_to_json(self, content: dict): def write_to_json(self, content: dict):
@@ -90,9 +100,11 @@ class BookSolver:
try: try:
with codecs.open(self.output_path, 'w', encoding='utf-8') as f: with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
json.dump(content, f, ensure_ascii=False) json.dump(content, f, ensure_ascii=False)
self.logger_object.log(f'Data has been saved to .json file: {self.output_path}') self.logger_object.log(
f'Data has been saved to .json file: {self.output_path}')
except Exception as exc: except Exception as exc:
self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR) self.logger_object.log(
'Error has occurred while writing .json file.' + str(exc), logging.ERROR)
def send_json_content_to_server(self, content: dict): def send_json_content_to_server(self, content: dict):
"""Function sends json_content to site""" """Function sends json_content to site"""
@@ -100,14 +112,15 @@ class BookSolver:
self.access.send_book(self.book_id, content) self.access.send_book(self.book_id, content)
self.logger_object.log(f'JSON data has been sent to server.') self.logger_object.log(f'JSON data has been sent to server.')
except Exception as exc: except Exception as exc:
self.logger_object.log('Error has occurred while sending json content.', logging.ERROR) self.logger_object.log(
'Error has occurred while sending json content.', logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
self.status_wrapper.set_error() self.status_wrapper.set_error()
raise exc raise exc
@abstractmethod @abstractmethod
def get_converted_book(self): def get_converted_book(self):
self.logger_object.log('Beginning of processing json output.') self.logger_object.log('Beginning of processing .json output.')
self.status_wrapper.set_generating() self.status_wrapper.set_generating()
return {} return {}
@@ -119,21 +132,24 @@ class BookSolver:
""" """
try: try:
self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.') self.logger_object.log(
f'Beginning of conversion from .{self.book_type} to .json.')
self.get_book_file() self.get_book_file()
self.status_wrapper.set_processing() self.status_wrapper.set_processing()
content_dict = self.get_converted_book() content_dict = self.get_converted_book()
self.status_wrapper.set_generating()
self.write_to_json(content_dict) self.write_to_json(content_dict)
self.send_json_content_to_server(content_dict) self.send_json_content_to_server(content_dict)
self.logger_object.log(f'End of the conversion to LiveCarta format. Check {self.output_path}.') self.logger_object.log(
f'End of the conversion to LiveCarta format. Check {self.output_path}.')
except Exception as exc: except Exception as exc:
self.status_wrapper.set_error() self.status_wrapper.set_error()
self.logger_object.log('Error has occurred while conversion.', logging.ERROR) self.logger_object.log(
'Error has occurred while conversion.', logging.ERROR)
self.logger_object.log_error_to_main_log(str(exc)) self.logger_object.log_error_to_main_log(str(exc))
raise exc raise exc
def conversion_local(self, file_name: str): def conversion_local(self, file_path: str):
""" """
Function Function
- without downloading book from server (local) - without downloading book from server (local)
@@ -141,13 +157,16 @@ class BookSolver:
""" """
try: try:
self.logger_object.log(f'Data has been downloaded from {file_name}.json file: ..\converter\json') self.logger_object.log(
f'Data has been downloaded from {file_path} file')
self.status_wrapper.set_processing() self.status_wrapper.set_processing()
with codecs.open(f'json/{file_name}.json', 'r', encoding='utf-8') as f_json: with codecs.open(file_path, 'r', encoding='utf-8') as f_json:
content_dict = json.load(f_json) content_dict = json.load(f_json)
self.status_wrapper.set_generating()
self.send_json_content_to_server(content_dict) self.send_json_content_to_server(content_dict)
self.logger_object.log(f'Sent a file to server. Check LiveCarta.') self.logger_object.log(f'Sent a file to server. Check LiveCarta.')
except Exception as exc: except Exception as exc:
self.status_wrapper.set_error() self.status_wrapper.set_error()
self.logger_object.log('Error has occurred while reading json file.' + str(exc), logging.ERROR) self.logger_object.log(
self.logger_object.log_error_to_main_log(str(exc)) 'Error has occurred while reading json file.' + str(exc), logging.ERROR)
self.logger_object.log_error_to_main_log(str(exc))

View File

@@ -88,4 +88,4 @@ class ChapterItem:
} }
def __str__(self): def __str__(self):
return '<Chapter: %s>' % self.title return '<Chapter: %s>' % self.title

View File

@@ -21,13 +21,22 @@ class HTMLDocxPreprocessor:
self.top_level_headers = None self.top_level_headers = None
self.content = list() self.content = list()
def _clean_tag(self, tag, attr_name, attr_value): def _clean_tag(self, tag: str, attr_name: str, attr_value: re):
""" """
Function to clean tags by its name and attribute value. Function to clean tags by its name and attribute value.
Parameters
----------
tag: str
tag name to clean
attr_name: str
attribute name
attr_value: [str,re]
attribute value
Returns
-------
clean tag
:param tag: Tag name to clean.
:param attr_name: Attribute name.
:param attr_value: Attribute value.
""" """
tags = self.body_tag.find_all(tag, {attr_name: attr_value}) tags = self.body_tag.find_all(tag, {attr_name: attr_value})
for tag in tags: for tag in tags:
@@ -56,12 +65,19 @@ class HTMLDocxPreprocessor:
return value return value
@classmethod @classmethod
def convert_font_pt_to_px(cls, style): def convert_font_pt_to_px(cls, style: str) -> str:
""" """
Method converts point in the font-size to pixels. Function converts point in the font-size to pixels.
Parameters
----------
style: str
str with style to process
Returns
-------
: str
str with converted style
:param style: Str with style to process.
:return: Str with converted style.
""" """
size = re.search(r"font-size: (\d{1,3})pt", style) size = re.search(r"font-size: (\d{1,3})pt", style)
@@ -77,7 +93,10 @@ class HTMLDocxPreprocessor:
return re.sub(size + "pt", str(new_size) + "px", style) return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self): def _font_to_span(self):
"""Function to convert <font> tag to <span>. If font style is default, then remove this tag.""" """
Function to convert <font> tag to <span>.
If font style is default, then remove this tag.
"""
fonts = self.body_tag.find_all("font") fonts = self.body_tag.find_all("font")
for font in fonts: for font in fonts:
face = font.get("face") face = font.get("face")
@@ -105,7 +124,8 @@ class HTMLDocxPreprocessor:
if len(font.attrs) == 0: if len(font.attrs) == 0:
font.unwrap() font.unwrap()
assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags # on this step there should be no more <font> tags
assert len(self.body_tag.find_all("font")) == 0
def delete_content_before_toc(self): def delete_content_before_toc(self):
# remove all tag upper the <TOC> only in content !!! body tag is not updated # remove all tag upper the <TOC> only in content !!! body tag is not updated
@@ -116,11 +136,15 @@ class HTMLDocxPreprocessor:
def clean_trash(self): def clean_trash(self):
"""Function to remove all styles and tags we don't need.""" """Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$')) self._clean_tag('span', 'style', re.compile(
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$')) # todo: check for another languages
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
self._clean_tag('span', 'style', re.compile(
'^letter-spacing: -?[\d\.]+pt$'))
self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$')) self._clean_tag('font', 'face', re.compile(
r'^Times New Roman[\w, ]+$'))
self._clean_tag("a", "name", "_GoBack") self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links() self._clean_underline_links()
@@ -128,7 +152,8 @@ class HTMLDocxPreprocessor:
self._font_to_span() self._font_to_span()
# replace toc with empty <TOC> tag # replace toc with empty <TOC> tag
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) tables = self.body_tag.find_all(
"div", id=re.compile(r'^Table of Contents\d+'))
for table in tables: for table in tables:
table.wrap(self.html_soup.new_tag("TOC")) table.wrap(self.html_soup.new_tag("TOC"))
table.decompose() table.decompose()
@@ -138,7 +163,7 @@ class HTMLDocxPreprocessor:
paragraphs = self.body_tag.find_all('p') paragraphs = self.body_tag.find_all('p')
for p in paragraphs: for p in paragraphs:
# libra converts some \n into <p> with 2 </br> # libre converts some \n into <p> with 2 </br>
# there we remove 1 unnecessary <br> # there we remove 1 unnecessary <br>
brs = p.find_all('br') brs = p.find_all('br')
text = p.text text = p.text
@@ -156,9 +181,11 @@ class HTMLDocxPreprocessor:
if style: if style:
indent = re.search(r'text-indent: ([\d\.]{1,4})in', style) indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style) margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
margin_right = re.search(r'margin-right: ([\d\.]{1,4})in', style) margin_right = re.search(
r'margin-right: ([\d\.]{1,4})in', style)
margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style) margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style) margin_bottom = re.search(
r'margin-bottom: ([\d\.]{1,4})in', style)
else: else:
indent = None indent = None
margin_left = None margin_left = None
@@ -195,6 +222,7 @@ class HTMLDocxPreprocessor:
def _process_tables(self): def _process_tables(self):
"""Function to process tables. Set "border" attribute.""" """Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table") tables = self.body_tag.find_all("table")
for table in tables: for table in tables:
tds = table.find_all("td") tds = table.find_all("td")
@@ -258,21 +286,24 @@ class HTMLDocxPreprocessor:
for x in has_i_tag_or_br] for x in has_i_tag_or_br]
if all(has_i_tag_or_br) and is_zero_border: if all(has_i_tag_or_br) and is_zero_border:
new_div = BeautifulSoup(features='lxml').new_tag('blockquote') new_div = BeautifulSoup(
features='lxml').new_tag('blockquote')
for p in paragraphs: for p in paragraphs:
new_div.append(p) new_div.append(p)
table.replaceWith(new_div) table.replaceWith(new_div)
def _process_hrefs(self): def _process_hrefs(self):
a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^.*http.+')}) a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^.*http.+')})
# remove char=end of file for some editors # remove char=end of file for some editors
for tag in a_tags_with_href: for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace('\u200c', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
a_tags_with_href = self.body_tag.find_all('a', {'href': re.compile('^(?!#sdfootnote)')}) a_tags_with_href = self.body_tag.find_all(
'a', {'href': re.compile('^(?!#sdfootnote)')})
for tag in a_tags_with_href: for tag in a_tags_with_href:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace('\u200c', '')
tag.string = tag.text.replace('\u200b', '') # zero-width-space tag.string = tag.text.replace('\u200b', '') # zero-width-space
@@ -286,23 +317,25 @@ class HTMLDocxPreprocessor:
def _process_footnotes(self): def _process_footnotes(self):
"""Function returns list of footnotes and delete them from html_soup.""" """Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) footnote_content = self.body_tag.find_all(
'div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors) footnote_amt = len(footnote_anchors)
assert footnote_amt == len(footnote_content), \ assert footnote_amt == len(footnote_content), \
'Something went wrong with footnotes after libra conversion' 'Something went wrong with footnotes after libre conversion'
footnotes = [] footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0] true_a_tag = cont_tag.find_all(
'a', class_=re.compile(r'^sdfootnote.+$'))[0]
if true_a_tag.attrs.get('href') is None: if true_a_tag.attrs.get('href') is None:
cont_tag.a.decompose() cont_tag.a.decompose()
continue continue
assert anc_tag['name'] == true_a_tag['href'][1:], \ assert anc_tag['name'] == true_a_tag['href'][1:], \
'Something went wrong with footnotes after libra conversion' 'Something went wrong with footnotes after libre conversion'
new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
@@ -355,8 +388,10 @@ class HTMLDocxPreprocessor:
if len(img_tags): if len(img_tags):
if access is None: if access is None:
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(
new_path = pathlib.Path(os.path.join(folder_path, f'json/img_{book_id}/')) os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'json/img_{book_id}/'))
new_path.mkdir(exist_ok=True) new_path.mkdir(exist_ok=True)
for img in img_tags: for img in img_tags:
@@ -370,10 +405,12 @@ class HTMLDocxPreprocessor:
if access is not None: if access is not None:
link = access.send_image(img_path, doc_id=book_id) link = access.send_image(img_path, doc_id=book_id)
img.attrs['src'] = link img.attrs['src'] = link
self.logger_object.log(f'{img_name} successfully uploaded.') self.logger_object.log(
f'{img_name} successfully uploaded.')
else: else:
img_size = os.path.getsize(img_path) img_size = os.path.getsize(img_path)
self.logger_object.log(f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG) self.logger_object.log(
f'{img_name} successfully loaded. Image size: {img_size}.', logging.DEBUG)
new_img_path = new_path / img_name new_img_path = new_path / img_name
copyfile(img_path, new_img_path) copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path) img.attrs["src"] = str(new_img_path)
@@ -408,7 +445,8 @@ class HTMLDocxPreprocessor:
def _process_toc_links(self): def _process_toc_links(self):
"""Function to extract nodes which contains TOC links, remove links from file and detect headers.""" """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')}) toc_links = self.body_tag.find_all(
"a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links] headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1> outline_level = "1" # All the unknown outlines will be predicted as <h1>
for tag in headers: for tag in headers:
@@ -418,7 +456,8 @@ class HTMLDocxPreprocessor:
elif tag.name == "p": elif tag.name == "p":
exist_in_toc = self._check_parent_link_exist_in_toc(tag) exist_in_toc = self._check_parent_link_exist_in_toc(tag)
if tag in self.body_tag.find_all("p") and exist_in_toc: if tag in self.body_tag.find_all("p") and exist_in_toc:
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level) new_tag = BeautifulSoup(
features="lxml").new_tag("h" + outline_level)
text = tag.text text = tag.text
tag.replaceWith(new_tag) tag.replaceWith(new_tag)
new_tag.string = text new_tag.string = text
@@ -440,14 +479,16 @@ class HTMLDocxPreprocessor:
@staticmethod @staticmethod
def clean_tag_from_tabs(tag: NavigableString): def clean_tag_from_tabs(tag: NavigableString):
cleaned = re.sub(r'(\s+)+', ' ', tag) cleaned = re.sub(r'(\s+)+', ' ', tag)
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this) tag.replace_with(this)
# print('input: ', repr(tag)) # print('input: ', repr(tag))
# print('test: ', repr(cleaned)) # print('test: ', repr(cleaned))
def clean_tag_from_numbering(self, tag): def clean_tag_from_numbering(self, tag):
cleaned = self.clean_title_from_numbering(tag) cleaned = self.clean_title_from_numbering(tag)
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
tag.replace_with(this) tag.replace_with(this)
# print('input: ', repr(tag)) # print('input: ', repr(tag))
# print('test: ', repr(cleaned)) # print('test: ', repr(cleaned))
@@ -484,7 +525,8 @@ class HTMLDocxPreprocessor:
""" """
headers_info = [] headers_info = []
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
headers_outline = [int(re.sub(r"^h", "", tag.name)) for tag in header_tags] headers_outline = [int(re.sub(r"^h", "", tag.name))
for tag in header_tags]
if headers_outline: if headers_outline:
top_level_outline = min(headers_outline) top_level_outline = min(headers_outline)
top_level_headers = [tag for tag in header_tags top_level_headers = [tag for tag in header_tags
@@ -518,13 +560,17 @@ class HTMLDocxPreprocessor:
Assume header(s) to be introduction if: Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header 1. one header not numbered, before 1 numbered header
2. it is first header from the top level list and it equals to 'introduction' 2. it is first header from the top level list and it equals to 'introduction'
Returns
-------
None
mark each top-level header with flag should_be_numbered = true/false
Result :
Mark each top-level header with flag should_be_numbered = true/false
""" """
is_numbered_header = [header['is_numbered'] for header in self.top_level_headers] is_numbered_header = [header['is_numbered']
is_title = [header['is_introduction'] for header in self.top_level_headers] for header in self.top_level_headers]
is_title = [header['is_introduction']
for header in self.top_level_headers]
first_not_numbered = is_numbered_header and is_numbered_header[0] == 0 first_not_numbered = is_numbered_header and is_numbered_header[0] == 0
second_is_numbered_or_not_exist = all(is_numbered_header[1:2]) second_is_numbered_or_not_exist = all(is_numbered_header[1:2])
@@ -539,7 +585,19 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self): def _process_headings(self):
"""Function to process tags <h>.""" """
Function to process tags <h>.
Steps
----------
1. remove <b>, <span>
2. clean text in header from numbering and \n
Returns
-------
None
processed <h> tags
"""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span> # 1. remove <b>, <span>
@@ -581,36 +639,52 @@ class HTMLDocxPreprocessor:
for i, item in enumerate(content): for i, item in enumerate(content):
if type(content[i]) is NavigableString: if type(content[i]) is NavigableString:
cleaned = re.sub(r'(\s+)+', ' ', content[i]) cleaned = re.sub(r'(\s+)+', ' ', content[i])
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[i].replace_with(this) content[i].replace_with(this)
content[i] = this content[i] = this
else: else:
self.apply_func_to_last_child(content[i], self.clean_tag_from_tabs) self.apply_func_to_last_child(
content[i], self.clean_tag_from_tabs)
content[0] = '' if content[0] == ' ' else content[0] content[0] = '' if content[0] == ' ' else content[0]
content = [item for item in content if item != ''] content = [item for item in content if item != '']
if type(content[0]) is NavigableString: if type(content[0]) is NavigableString:
cleaned = self.clean_title_from_numbering(content[0]) cleaned = self.clean_title_from_numbering(content[0])
this = BeautifulSoup.new_string(BeautifulSoup(features="lxml"), cleaned, NavigableString) this = BeautifulSoup.new_string(BeautifulSoup(
features="lxml"), cleaned, NavigableString)
content[0].replace_with(this) content[0].replace_with(this)
content[0] = this content[0] = this
else: else:
self.apply_func_to_last_child(content[0], self.clean_tag_from_numbering) self.apply_func_to_last_child(
content[0], self.clean_tag_from_numbering)
def _process_lists(self): def _process_lists(self):
""" """
Function to process tags <li>. Function
Unwrap <p> tags. - process tags <li>.
- unwrap <p> tags.
Parameters
----------
body_tag: Tag, soup object
Returns
-------
None
unwrap <p> tag with li
""" """
li_tags = self.body_tag.find_all("li") li_tags = self.body_tag.find_all("li")
for il_tag in li_tags: for li_tag in li_tags:
il_tag.attrs.update(il_tag.p.attrs) li_tag.attrs.update(li_tag.p.attrs)
il_tag.p.unwrap() li_tag.p.unwrap()
def process_html(self, access, html_path, book_id): def process_html(self, access=None, html_path='', book_id='local'):
"""Process html code to satisfy LiveCarta formatting.""" """Process html code to satisfy LiveCarta formatting."""
self.logger_object.log('Beginning of processing .html file.')
try: try:
self.logger_object.log(f'Processing TOC and headers.') self.logger_object.log(f'Processing TOC and headers.')
self._process_toc_links() self._process_toc_links()
@@ -628,18 +702,22 @@ class HTMLDocxPreprocessor:
self.logger_object.log('Tables processing.') self.logger_object.log('Tables processing.')
self._process_tables() self._process_tables()
self.logger_object.log(f'{self.tables_amount} tables have been processed.') self.logger_object.log(
f'{self.tables_amount} tables have been processed.')
self.logger_object.log('Hrefs processing.') self.logger_object.log('Hrefs processing.')
self._process_hrefs() self._process_hrefs()
self.logger_object.log('Footnotes processing.') self.logger_object.log('Footnotes processing.')
self._process_footnotes() self._process_footnotes()
self.logger_object.log(f'{len(self.footnotes)} footnotes have been processed.') self.logger_object.log(
f'{len(self.footnotes)} footnotes have been processed.')
self.logger_object.log('Image processing.') self.logger_object.log('Image processing.')
self._process_images(access=access, html_path=html_path, book_id=book_id) self._process_images(
self.logger_object.log(f'{len(self.images)} images have been processed.') access=access, html_path=html_path, book_id=book_id)
self.logger_object.log(
f'{len(self.images)} images have been processed.')
self._process_footer() self._process_footer()
self._process_div() self._process_div()
@@ -658,7 +736,8 @@ class HTMLDocxPreprocessor:
self.delete_content_before_toc() self.delete_content_before_toc()
except Exception as exc: except Exception as exc:
self.logger_object.log('Error has occurred while processing html.', logging.ERROR) self.logger_object.log(
'Error has occurred while processing html.', logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
if self.status_wrapper: if self.status_wrapper:
self.status_wrapper.set_error() self.status_wrapper.set_error()

View File

@@ -5,7 +5,7 @@ from copy import copy
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
class LibraHTML2JSONConverter: class LibreHTML2JSONConverter:
def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None): def __init__(self, content, footnotes, top_level_headers, logger_object, book_api_status=None):
self.content_dict = None self.content_dict = None
self.content = content self.content = content
@@ -15,12 +15,19 @@ class LibraHTML2JSONConverter:
self.book_api_status = book_api_status self.book_api_status = book_api_status
@staticmethod @staticmethod
def format_html(html_text): def format_html(html_text: str) -> str:
""" """
Function to remove useless symbols from html code. Function to remove useless symbols from html code.
Parameters
----------
html_text: str
text to process.
Returns
-------
new_text: str
cleaned text
:param html_text: Text to process.
:return: Cleaned text.
""" """
new_text = re.sub(r'([\n\t])', ' ', html_text) new_text = re.sub(r'([\n\t])', ' ', html_text)
return new_text return new_text
@@ -29,8 +36,15 @@ class LibraHTML2JSONConverter:
def header_to_livecarta_chapter_item(self, ind) -> (dict, int): def header_to_livecarta_chapter_item(self, ind) -> (dict, int):
""" """
Function process header and collects all content for it. Function process header and collects all content for it.
Parameters
----------
ind: int
index of header in content list.
Returns
-------
result, ind
:param ind: Index of header in content list.
""" """
if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS: if self.content[ind].name in LiveCartaConfig.SUPPORTED_HEADERS:
title = str(self.content[ind]) title = str(self.content[ind])
@@ -38,7 +52,8 @@ class LibraHTML2JSONConverter:
title = title.replace(f'</{self.content[ind].name}>', '') title = title.replace(f'</{self.content[ind].name}>', '')
title = re.sub(r'^\n', '', title) title = re.sub(r'^\n', '', title)
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag # extract outline from tag
curr_outline = int(re.sub(r"^h", "", self.content[ind].name))
result = { result = {
'title': f'{title}', 'title': f'{title}',
'contents': [], 'contents': [],
@@ -53,7 +68,8 @@ class LibraHTML2JSONConverter:
outline = int(re.sub(r"^h", "", self.content[ind].name)) outline = int(re.sub(r"^h", "", self.content[ind].name))
# - recursion step until h_i > h_initial # - recursion step until h_i > h_initial
if outline > curr_outline: if outline > curr_outline:
header_dict, ind = self.header_to_livecarta_chapter_item(ind) header_dict, ind = self.header_to_livecarta_chapter_item(
ind)
if ch_content: if ch_content:
result['contents'].append("".join(ch_content)) result['contents'].append("".join(ch_content))
ch_content = [] ch_content = []
@@ -108,7 +124,8 @@ class LibraHTML2JSONConverter:
chapter = [] chapter = []
while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS: while ind < len(self.content) and self.content[ind].name not in LiveCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]): if not self._is_empty_p_tag(self.content[ind]):
chapter.append(self.format_html(str(self.content[ind]))) chapter.append(self.format_html(
str(self.content[ind])))
ind += 1 ind += 1
if chapter: if chapter:
res = { res = {
@@ -121,9 +138,11 @@ class LibraHTML2JSONConverter:
if res: if res:
json_strc.append(res) json_strc.append(res)
ch_amt += 1 ch_amt += 1
self.logger_object.log(f'Chapter {ch_amt} has been added to structure.') self.logger_object.log(
f'Chapter {ch_amt} has been added to structure.')
except Exception as exc: except Exception as exc:
self.logger_object.log('Error has occurred while making json structure.', logging.ERROR) self.logger_object.log(
'Error has occurred while making json structure.', logging.ERROR)
self.logger_object.log_error_to_main_log() self.logger_object.log_error_to_main_log()
if self.book_api_status: if self.book_api_status:
self.book_api_status.set_error() self.book_api_status.set_error()

View File

@@ -14,21 +14,23 @@ from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL) cssutils.log.setLevel(CRITICAL)
sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, sizes_pr = [-100, -1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0,
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, 1.63, 1.69,
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38,
2.44, 2.5, 2.56, 2.63, 2.69, 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17px', '18px', '19px', '20px', '21px', sizes_px = ['0px', '10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px',
'22px', '23px', '24px', '25px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px', '17px', '18px', '19px', '20px', '21px', '22px', '23px', '24px', '25px',
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '26px', '27px', '28px', '29px', '30px', '31px', '32px', '33px', '34px',
'48px', '49px', '50px', '64px', '72px'] '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px',
'44px', '45px', '46px', '47px', '48px', '49px', '50px', '64px', '72px']
list_types = ['circle', 'disc', 'armenian', 'decimal', list_types = ['circle', 'disc', 'armenian', 'decimal',
'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin', 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none'] 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_tag_values(value: str) -> str: def convert_tag_style_values(value: str) -> str:
""" """
Function Function
- converts values of tags from em/%/pt to px - converts values of tags from em/%/pt to px
@@ -42,8 +44,8 @@ def convert_tag_values(value: str) -> str:
value: str value: str
""" """
def find_closest_size(value): def find_closest_size(size_value):
possible_sizes = list(takewhile(lambda x: value > x, sizes_pr)) possible_sizes = list(takewhile(lambda x: size_value > x, sizes_pr))
last_possible_size_index = sizes_pr.index(possible_sizes[-1]) last_possible_size_index = sizes_pr.index(possible_sizes[-1])
return sizes_px[last_possible_size_index] return sizes_px[last_possible_size_index]
@@ -122,12 +124,13 @@ Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING shou
to suit livecarta style convention. to suit livecarta style convention.
""" """
LIVECARTA_STYLE_ATTRS_MAPPING = { LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_tag_values, 'text-indent': convert_tag_style_values,
'font-variant': lambda x: x, 'font-variant': lambda x: x,
'text-align': lambda x: x, 'text-align': lambda x: x,
'font': lambda x: '', 'font': lambda x: '',
'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or LiveCartaConfig.font_correspondence_table.get(x.capitalize()), 'font-family': lambda x: LiveCartaConfig.font_correspondence_table.get(x) or
'font-size': convert_tag_values, LiveCartaConfig.font_correspondence_table.get(x.capitalize()),
'font-size': convert_tag_style_values,
'color': get_text_color, 'color': get_text_color,
'background-color': get_bg_color, 'background-color': get_bg_color,
'background': get_bg_color, 'background': get_bg_color,
@@ -140,9 +143,9 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
'border-bottom': lambda x: x if x != '0' else '', 'border-bottom': lambda x: x if x != '0' else '',
'list-style-type': lambda x: x if x in list_types else 'disc', 'list-style-type': lambda x: x if x in list_types else 'disc',
'list-style-image': lambda x: 'disc', 'list-style-image': lambda x: 'disc',
'margin-left': convert_tag_values, 'margin-left': convert_tag_style_values,
'margin-top': convert_tag_values, 'margin-top': convert_tag_style_values,
'margin': convert_tag_values, 'margin': convert_tag_style_values,
} }
""" """
@@ -269,10 +272,10 @@ class TagStyleConverter:
item = item.split(':') item = item.split(':')
if item[0] in ['text-indent', 'margin-left', 'margin']: if item[0] in ['text-indent', 'margin-left', 'margin']:
if len(item[1].split(' ')) == 3: if len(item[1].split(' ')) == 3:
item[1] = convert_tag_values(item[1].split( item[1] = convert_tag_style_values(item[1].split(
' ')[-2]) # split returns middle value ' ')[-2]) # split returns middle value
else: else:
item[1] = convert_tag_values(item[1].split( item[1] = convert_tag_style_values(item[1].split(
' ')[-1]) # split returns last value ' ')[-1]) # split returns last value
clean_style += item[0] + ': ' + item[1] + '; ' clean_style += item[0] + ': ' + item[1] + '; '
@@ -343,7 +346,8 @@ class TagStyleConverter:
split_inline_style: dict = remove_extra_spaces(inline_style) split_inline_style: dict = remove_extra_spaces(inline_style)
# repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css # repetition check - if the tag had already had inline style
# that isn't in the css styles, add this to style parsed from css
repeat_styles = list(set(split_ultimate_style.keys()) repeat_styles = list(set(split_ultimate_style.keys())
& set(split_inline_style.keys())) & set(split_inline_style.keys()))
@@ -409,7 +413,8 @@ class TagStyleConverter:
if has_p_style_attrs: if has_p_style_attrs:
p_style += item + ';' p_style += item + ';'
initial_style = initial_style.replace(item + ';', '') initial_style = initial_style.replace(item + ';', '')
# here check that this style i exactly the same. Not 'align' when we have 'text-align', or 'border' when we have 'border-top' # here check that this style i exactly the same.
# Not 'align' when we have 'text-align', or 'border' when we have 'border-top'
styles_to_be_saved_in_span = [((attr + ':') in initial_style) & ( styles_to_be_saved_in_span = [((attr + ':') in initial_style) & (
'-' + attr not in initial_style) for attr in styles_cant_be_in_p] '-' + attr not in initial_style) for attr in styles_cant_be_in_p]
if any(styles_to_be_saved_in_span): if any(styles_to_be_saved_in_span):
@@ -549,4 +554,4 @@ if __name__ == '__main__':
'pr01s05.xhtml').get_body_content().decode() 'pr01s05.xhtml').get_body_content().decode()
html_soup = BeautifulSoup(html_, features='lxml') html_soup = BeautifulSoup(html_, features='lxml')
print(convert_html_soup_with_css_style(html_soup, css_cleaned)) print(convert_html_soup_with_css_style(html_soup, css_cleaned))

View File

@@ -1,6 +1,7 @@
from src.book_solver import BookSolver from src.book_solver import BookSolver
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
class EpubBook(BookSolver): class EpubBook(BookSolver):
"""Class of .epub type book - child of BookSolver""" """Class of .epub type book - child of BookSolver"""
@@ -10,10 +11,19 @@ class EpubBook(BookSolver):
def get_converted_book(self): def get_converted_book(self):
""" """
1. Convert epub to html Function
2. Parse from line structure to nested structure Steps
----------
1. Converts .epub to .html
2. Parses from line structure to nested structure
Returns
----------
content_dict
json for LiveCarta platform
""" """
json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object) json_converter = EpubConverter(
self.file_path, access=self.access, logger=self.logger_object)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
self.status_wrapper.set_generating() return content_dict
return content_dict

View File

@@ -71,7 +71,7 @@ def update_images_src_links(body_tag: BeautifulSoup,
return path2aws_path return path2aws_path
def preprocess_table(body_tag: BeautifulSoup): def _preprocess_table(body_tag: BeautifulSoup):
"""Function to preprocess tables and tags(td|th|tr): style""" """Function to preprocess tables and tags(td|th|tr): style"""
tables = body_tag.find_all("table") tables = body_tag.find_all("table")
for table in tables: for table in tables:
@@ -99,7 +99,7 @@ def preprocess_table(body_tag: BeautifulSoup):
table.attrs['border'] = '1' table.attrs['border'] = '1'
def process_lists(body_tag: BeautifulSoup): def _process_lists(body_tag: BeautifulSoup):
""" """
Function Function
- process tags <li>. - process tags <li>.
@@ -121,7 +121,7 @@ def process_lists(body_tag: BeautifulSoup):
li_tag.p.unwrap() li_tag.p.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): def _insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
"""Function inserts span before tag aren't supported by livecarta""" """Function inserts span before tag aren't supported by livecarta"""
new_tag = main_tag.new_tag("span") new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or '' new_tag.attrs['id'] = id_ or ''
@@ -130,21 +130,21 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
tag.insert_before(new_tag) tag.insert_before(new_tag)
def clean_headings_content(content: BeautifulSoup, title: str): def _clean_headings_content(content: BeautifulSoup, title: str):
def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup): def add_span_to_save_ids_for_links(tag_to_be_removed: Tag, body_tag: BeautifulSoup):
if tag_to_be_removed.attrs.get('id'): if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag, _insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed, tag_to_be_removed,
id_=tag_to_be_removed.attrs.get( id_=tag_to_be_removed.attrs.get(
'id'), 'id'),
class_=tag_to_be_removed.attrs.get('class')) class_=tag_to_be_removed.attrs.get('class'))
for sub_tag in tag_to_be_removed.find_all(): for sub_tag in tag_to_be_removed.find_all():
if sub_tag.attrs.get('id'): if sub_tag.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag, _insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed, tag_to_be_removed,
id_=sub_tag.attrs['id'], id_=sub_tag.attrs['id'],
class_=sub_tag.attrs.get('class')) class_=sub_tag.attrs.get('class'))
title = title.lower() title = title.lower()
for child in content.contents: for child in content.contents:
@@ -165,7 +165,7 @@ def clean_headings_content(content: BeautifulSoup, title: str):
break break
def heading_tag_to_p_tag(body_tag): def _heading_tag_to_p_tag(body_tag):
"""Function to convert all lower level headings to p tags""" """Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern)) header_tags = body_tag.find_all(re.compile(pattern))
@@ -173,7 +173,7 @@ def heading_tag_to_p_tag(body_tag):
tag.name = 'p' tag.name = 'p'
def clean_title_from_numbering(title: str): def _clean_title_from_numbering(title: str):
"""Function removes numbering from titles""" """Function removes numbering from titles"""
title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(\s+)+', '', title)
# title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title # title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) # delete chapter numbering from the title
@@ -182,7 +182,7 @@ def clean_title_from_numbering(title: str):
return title return title
def replace_with_livecarta_anchor_tag(anchor, i): def _replace_with_livecarta_anchor_tag(anchor, i):
"""Function replace noteref_tag(anchor) with new livecarta tag""" """Function replace noteref_tag(anchor) with new livecarta tag"""
new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
@@ -257,7 +257,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote': if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
footnote_tag = footnote_tag.parent footnote_tag = footnote_tag.parent
new_noterefs_tags.append( new_noterefs_tags.append(
replace_with_livecarta_anchor_tag(noteref_tag, i)) _replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text content = footnote_tag.text
# footnote_tag.decompose() # footnote_tag.decompose()
footnotes.append(content) footnotes.append(content)
@@ -292,7 +292,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
""" """
def preserve_class_in_aside_tag(tag_): def _preserve_class_in_aside_tag(tag_):
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)""" """to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
# this is for Wiley books with boxes # this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance( tag_class = tag_.attrs['class'] if not isinstance(
@@ -301,7 +301,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
if not tag_.parent.attrs.get('class'): if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class tag_.parent.attrs['class'] = tag_class
def preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool: def _preserve_class_in_section_tag(tag_: BeautifulSoup) -> bool:
""" """
Function saves css style inherited from class, copies class to child <p> Function saves css style inherited from class, copies class to child <p>
returns True, if <section> could be unwrapped returns True, if <section> could be unwrapped
@@ -332,13 +332,13 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
else: else:
return True return True
def add_span_to_save_ids_for_links(tag_to_be_removed): def _add_span_to_save_ids_for_links(tag_to_be_removed):
if tag_to_be_removed.attrs.get('id'): if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed, _insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
id_=tag_to_be_removed.attrs['id'], id_=tag_to_be_removed.attrs['id'],
class_=tag_to_be_removed.attrs.get('class')) class_=tag_to_be_removed.attrs.get('class'))
def replace_div_tag_with_table(): def _replace_div_tag_with_table():
""" """
Function replace <div> with <table>: Function replace <div> with <table>:
1. Convert div with certain classes to tables 1. Convert div with certain classes to tables
@@ -350,11 +350,11 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
div_class = div.attrs['class'] if not isinstance( div_class = div.attrs['class'] if not isinstance(
div.attrs['class'], list) else div.attrs['class'][0] div.attrs['class'], list) else div.attrs['class'][0]
if div_class in ['C409', 'C409a']: if div_class in ['C409', 'C409a']:
wrap_block_tag_with_table( _wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9') body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
elif div_class in ['C441', 'C816']: elif div_class in ['C441', 'C816']:
wrap_block_tag_with_table( _wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8') body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
if div.attrs.get('style'): if div.attrs.get('style'):
@@ -363,7 +363,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
'background-color') + len('background-color') 'background-color') + len('background-color')
start_index_of_color = end_index + 2 start_index_of_color = end_index + 2
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7] bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
wrap_block_tag_with_table( _wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='', bg_color=bg_color) body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
elif div.attrs.get('style') == '': elif div.attrs.get('style') == '':
del div.attrs['style'] del div.attrs['style']
@@ -379,7 +379,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
if all(is_not_struct_tag): if all(is_not_struct_tag):
div.name = 'p' div.name = 'p'
continue continue
add_span_to_save_ids_for_links(div) _add_span_to_save_ids_for_links(div)
div.unwrap() div.unwrap()
# comments removal # comments removal
@@ -387,18 +387,18 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
for element in tag(text=lambda text: isinstance(text, Comment)): for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract() element.extract()
replace_div_tag_with_table() _replace_div_tag_with_table()
for s in body_tag.find_all("section"): for s in body_tag.find_all("section"):
could_be_unwrapped = True could_be_unwrapped = True
if s.attrs.get('class'): if s.attrs.get('class'):
could_be_unwrapped = preserve_class_in_section_tag(s) could_be_unwrapped = _preserve_class_in_section_tag(s)
add_span_to_save_ids_for_links(s) _add_span_to_save_ids_for_links(s)
if could_be_unwrapped: if could_be_unwrapped:
s.unwrap() s.unwrap()
for s in body_tag.find_all("article"): for s in body_tag.find_all("article"):
add_span_to_save_ids_for_links(s) _add_span_to_save_ids_for_links(s)
s.unwrap() s.unwrap()
for s in body_tag.find_all("figure"): for s in body_tag.find_all("figure"):
@@ -407,22 +407,22 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
s.attrs['style'] = "text-align: center;" s.attrs['style'] = "text-align: center;"
for s in body_tag.find_all("figcaption"): for s in body_tag.find_all("figcaption"):
add_span_to_save_ids_for_links(s) _add_span_to_save_ids_for_links(s)
s.unwrap() s.unwrap()
for s in body_tag.find_all("aside"): for s in body_tag.find_all("aside"):
s.name = 'blockquote' s.name = 'blockquote'
for s in body_tag.find_all("main"): for s in body_tag.find_all("main"):
add_span_to_save_ids_for_links(s) _add_span_to_save_ids_for_links(s)
s.unwrap() s.unwrap()
for s in body_tag.find_all("body"): for s in body_tag.find_all("body"):
add_span_to_save_ids_for_links(s) _add_span_to_save_ids_for_links(s)
s.unwrap() s.unwrap()
for s in body_tag.find_all("html"): for s in body_tag.find_all("html"):
add_span_to_save_ids_for_links(s) _add_span_to_save_ids_for_links(s)
s.unwrap() s.unwrap()
for s in body_tag.find_all("header"): for s in body_tag.find_all("header"):
@@ -442,7 +442,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
assert all( assert all(
parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.' parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
heading_tag_to_p_tag(body_tag) _heading_tag_to_p_tag(body_tag)
# wrap NavigableString with <p> # wrap NavigableString with <p>
for node in body_tag: for node in body_tag:
@@ -500,7 +500,7 @@ def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: Beautifu
return tags return tags
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): def _wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
"""Function wraps <block> with <table>""" """Function wraps <block> with <table>"""
table = main_tag.new_tag("table") table = main_tag.new_tag("table")
table.attrs['border'] = border table.attrs['border'] = border
@@ -520,7 +520,7 @@ def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_co
return table return table
def clean_wiley_block(block): def _clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs: for hr in hrs:
hr.extract() hr.extract()
@@ -530,30 +530,30 @@ def clean_wiley_block(block):
h.insert_before(BeautifulSoup(features='lxml').new_tag("br")) h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def preprocess_block_tags(chapter_tag): def _preprocess_block_tags(chapter_tag):
"""Function preprocessing <block> tags""" """Function preprocessing <block> tags"""
for block in chapter_tag.find_all("blockquote"): for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
clean_wiley_block(block) _clean_wiley_block(block)
color = '#DDDDDD' if block.attrs.get( color = '#DDDDDD' if block.attrs.get(
'class') == 'feature1' else None 'class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get( color = '#EEEEEE' if block.attrs.get(
'class') == 'feature2' else color 'class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, block, bg_color=color) _wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
block.insert_after(BeautifulSoup(features='lxml').new_tag("br")) block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
block.unwrap() block.unwrap()
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}): for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
clean_wiley_block(future_block) _clean_wiley_block(future_block)
color = '#DDDDDD' if future_block.attrs.get( color = '#DDDDDD' if future_block.attrs.get(
'class') == 'feature1' else None 'class') == 'feature1' else None
color = '#EEEEEE' if future_block.attrs.get( color = '#EEEEEE' if future_block.attrs.get(
'class') == 'feature2' else color 'class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color) _wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
def prepare_formatted(text: str) -> str: def _prepare_formatted(text: str) -> str:
"""Function replaces special symbols with their Unicode representation""" """Function replaces special symbols with their Unicode representation"""
text = text.replace("<", "\x3C") text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E") text = text.replace(">", "\x3E")
@@ -563,7 +563,7 @@ def prepare_formatted(text: str) -> str:
return text return text
def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag: def _wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
"""Function wraps <span> with <table>""" """Function wraps <span> with <table>"""
table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag( table, tbody, tr, td = chapter_tag.new_tag("table"), chapter_tag.new_tag(
"tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td") "tbody"), chapter_tag.new_tag("tr"), chapter_tag.new_tag("td")
@@ -577,7 +577,7 @@ def wrap_preformatted_span_with_table(chapter_tag: Tag, span_tag: Tag) -> Tag:
return table return table
def preprocess_pre_tags(chapter_tag: BeautifulSoup): def _preprocess_pre_tags(chapter_tag: BeautifulSoup):
""" """
Function preprocessing <pre> tags Function preprocessing <pre> tags
Parameters Parameters
@@ -601,7 +601,7 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
for child in copy_contents: for child in copy_contents:
# Navigable String # Navigable String
if isinstance(child, NavigableString): if isinstance(child, NavigableString):
cleaned_text = prepare_formatted(str(child)) cleaned_text = _prepare_formatted(str(child))
sub_strings = re.split('\r\n|\n|\r', cleaned_text) sub_strings = re.split('\r\n|\n|\r', cleaned_text)
for string in sub_strings[:-1]: for string in sub_strings[:-1]:
new_tag.append(NavigableString(string)) new_tag.append(NavigableString(string))
@@ -612,24 +612,24 @@ def preprocess_pre_tags(chapter_tag: BeautifulSoup):
else: else:
for sub_child in child.children: for sub_child in child.children:
if isinstance(sub_child, NavigableString): if isinstance(sub_child, NavigableString):
cleaned_text = prepare_formatted(str(sub_child)) cleaned_text = _prepare_formatted(str(sub_child))
sub_child.replace_with(NavigableString(cleaned_text)) sub_child.replace_with(NavigableString(cleaned_text))
else: else:
sub_child.string = prepare_formatted(sub_child.text) sub_child.string = _prepare_formatted(sub_child.text)
cleaned_tag = child.extract() cleaned_tag = child.extract()
new_tag.append(cleaned_tag) new_tag.append(cleaned_tag)
if to_add_br: if to_add_br:
new_tag.append(BeautifulSoup( new_tag.append(BeautifulSoup(
features='lxml').new_tag('br')) features='lxml').new_tag('br'))
pre.replace_with(new_tag) pre.replace_with(new_tag)
table = wrap_preformatted_span_with_table(chapter_tag, new_tag) table = _wrap_preformatted_span_with_table(chapter_tag, new_tag)
# add <p> to save brs # add <p> to save brs
p_for_br = chapter_tag.new_tag("p") p_for_br = chapter_tag.new_tag("p")
p_for_br.string = "\xa0" p_for_br.string = "\xa0"
table.insert_after(p_for_br) table.insert_after(p_for_br)
def preprocess_code_tags(chapter_tag: BeautifulSoup): def _preprocess_code_tags(chapter_tag: BeautifulSoup):
""" """
Function Function
- transform <code>, <kdb>, <var> tags into span - transform <code>, <kdb>, <var> tags into span
@@ -658,7 +658,7 @@ def prepare_title(title_of_chapter: str) -> str:
title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = BeautifulSoup(title_of_chapter, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip() title_str = re.sub(r' +', ' ', title_str).rstrip()
title_str = clean_title_from_numbering(title_str) title_str = _clean_title_from_numbering(title_str)
return title_str return title_str
@@ -696,18 +696,18 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
# 2. heading removal # 2. heading removal
if remove_title_from_chapter: if remove_title_from_chapter:
clean_headings_content(content_tag, title_str) _clean_headings_content(content_tag, title_str)
# 3. processing tags (<li>, <table>, <code>, <pre>, <block>) # 3. processing tags (<li>, <table>, <code>, <pre>, <block>)
process_lists(content_tag) _process_lists(content_tag)
preprocess_table(content_tag) _preprocess_table(content_tag)
preprocess_code_tags(content_tag) _preprocess_code_tags(content_tag)
preprocess_pre_tags(content_tag) _preprocess_pre_tags(content_tag)
preprocess_block_tags(content_tag) _preprocess_block_tags(content_tag)
# 4. class removal # 4. class removal
for tag in content_tag.find_all(recursive=True): for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']): 'footnote-element']):
del tag.attrs['class'] del tag.attrs['class']
return str(content_tag) return str(content_tag)

View File

@@ -1,6 +1,7 @@
import os import os
import argparse import argparse
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description="Utility for folders's clean up.") parser = argparse.ArgumentParser(description="Utility for folders's clean up.")
parser.add_argument('-f', '--folders', type=str, nargs='*', help='Names of the folders to be cleaned.') parser.add_argument('-f', '--folders', type=str, nargs='*', help='Names of the folders to be cleaned.')

View File

@@ -3,6 +3,7 @@ import sys
import argparse import argparse
import subprocess import subprocess
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description="Utility for checking installed packages.") parser = argparse.ArgumentParser(description="Utility for checking installed packages.")
parser.add_argument('-p', '--packages', type=str, nargs='*', help='Names of the packages.') parser.add_argument('-p', '--packages', type=str, nargs='*', help='Names of the packages.')

View File

@@ -4,7 +4,6 @@ from colorsys import hls_to_rgb
from webcolors import html4_hex_to_names, hex_to_rgb, rgb_to_name, rgb_percent_to_hex, rgb_to_hex, css3_names_to_hex from webcolors import html4_hex_to_names, hex_to_rgb, rgb_to_name, rgb_percent_to_hex, rgb_to_hex, css3_names_to_hex
def closest_colour_rgb(requested_color): def closest_colour_rgb(requested_color):
""" Function finds closes colour rgb """ """ Function finds closes colour rgb """
min_colours = {} min_colours = {}

View File

@@ -20,15 +20,15 @@ class ColoredFormatter(logging.Formatter):
def format(self, record): def format(self, record):
seq = self.MAPPING.get(record.levelname, 37) # default white seq = self.MAPPING.get(record.levelname, 37) # default white
record.levelname = ('{0}{1}m{2}{3}') \ record.levelname = '{0}{1}m{2}{3}' \
.format(self.PREFIX, seq, record.levelname, self.SUFFIX) .format(self.PREFIX, seq, record.levelname, self.SUFFIX)
return logging.Formatter.format(self, record) return logging.Formatter.format(self, record)
class BookLogger: class BookLogger:
def __init__(self, name, book_id, main_logger=None, def __init__(self, name, book_id, main_logger=None,
filemode='w+', logging_level=logging.INFO, logging_format= filemode='w+', logging_level=logging.INFO,
'%(asctime)s - %(levelname)s - %(message)s [%(filename)s:%(lineno)d in %(funcName)s]'): logging_format='%(asctime)s - %(levelname)s - %(message)s [%(filename)s:%(lineno)d in %(funcName)s]'):
""" """
Method for Logger configuration. Logger will write to file. Method for Logger configuration. Logger will write to file.
:param name: name of the Logger. :param name: name of the Logger.
@@ -107,4 +107,4 @@ class BookStatusWrapper:
self.set_status('[GENERATE]') self.set_status('[GENERATE]')
def set_error(self): def set_error(self):
self.set_status('[ERROR]') self.set_status('[ERROR]')

View File

@@ -82,7 +82,7 @@ def rgb2closest_html_color_name(color):
pass pass
if hue_diff in diff2base_color_dict: if hue_diff in diff2base_color_dict:
dist_cur_color =(hue_request - hue_html) ** 2 + (s_request - s_html) ** 2 + (v_request - v_html) ** 2 dist_cur_color = (hue_request - hue_html) ** 2 + (s_request - s_html) ** 2 + (v_request - v_html) ** 2
hue_prev, s_prev, v_prev = HTML_COLORS_HSV[diff2base_color_dict[hue_diff]] hue_prev, s_prev, v_prev = HTML_COLORS_HSV[diff2base_color_dict[hue_diff]]
dist_prev_color = (hue_request - hue_prev) ** 2 + (s_request - s_prev) ** 2 + (v_request - v_prev) ** 2 dist_prev_color = (hue_request - hue_prev) ** 2 + (s_request - s_prev) ** 2 + (v_request - v_prev) ** 2
if dist_cur_color < dist_prev_color: if dist_cur_color < dist_prev_color:
@@ -95,7 +95,7 @@ def rgb2closest_html_color_name(color):
if __name__ == '__main__': if __name__ == '__main__':
hex_colors = [ hex_colors = [
#'#945893', # '#945893',
# '#96F', # '#96F',
# '#000', # black # '#000', # black
# '#4C4C4C', # black # '#4C4C4C', # black
@@ -115,5 +115,5 @@ if __name__ == '__main__':
for c in hex_colors: for c in hex_colors:
n = rgb2closest_html_color_name(c) n = rgb2closest_html_color_name(c)
print(n) # "Actual colour:", c, ", closest colour name:", print(n) # "Actual colour:", c, ", closest colour name:",
# print() # print()