Wrote documentation for every func/class in .py

This commit is contained in:
Kiryl
2021-12-10 10:53:40 +03:00
parent ef3502cd0a
commit 4b1109e6b4
13 changed files with 198 additions and 172 deletions

View File

@@ -54,7 +54,6 @@ def convert_book(book_type: [DocxBook, EpubBook], params: dict, logger, book_id)
raise exc raise exc
logger.info(f'Book-{book_id} has been proceeded.') logger.info(f'Book-{book_id} has been proceeded.')
print('Book has been proceeded.')
def callback(ch, method, properties, body, logger, libra_locker): def callback(ch, method, properties, body, logger, libra_locker):

View File

@@ -1,10 +1,3 @@
""" This is Main Abstract class for solving a task of a book conversion
Having an id of coming book, gets book from server, runs conversion.
In parallel it updates status of a book conversion on admin panel.
Finally sends result to server.
Result is a json, JSON schema in book_schema.json
"""
import os import os
import json import json
import codecs import codecs
@@ -17,6 +10,14 @@ from src.util.helpers import BookLogger, BookStatusWrapper
class BookSolver: class BookSolver:
"""
This is Main Abstract class for solving a task of a book conversion
Having an id of coming book, gets book from server, runs conversion.
In parallel it updates status of a book conversion on admin panel.
Finally sends result to server.
Result is a json, JSON schema in book_schema.json
"""
__metaclass__ = ABCMeta __metaclass__ = ABCMeta
def __init__(self, book_id=0, access=None, main_logger=None): def __init__(self, book_id=0, access=None, main_logger=None):
@@ -55,9 +56,7 @@ class BookSolver:
self.file_path = pathlib.Path(file_path) self.file_path = pathlib.Path(file_path)
def get_book_file(self): def get_book_file(self):
""" """ Method for getting and saving book from server. """
Method for getting and saving book from server.
"""
try: try:
self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file') self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
content = self.access.get_book(self.book_id) content = self.access.get_book(self.book_id)
@@ -92,6 +91,7 @@ class BookSolver:
self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR) self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR)
def send_json_content_to_server(self, content: dict): def send_json_content_to_server(self, content: dict):
""" Function sends json_content to site """
try: try:
self.access.send_book(self.book_id, content) self.access.send_book(self.book_id, content)
self.logger_object.log(f'JSON data has been sent to server.') self.logger_object.log(f'JSON data has been sent to server.')
@@ -108,8 +108,10 @@ class BookSolver:
return {} return {}
def test_conversion(self): def test_conversion(self):
'''Function """
without sending to server''' Function
- without sending to server
"""
self.logger_object.log('Beginning of the test.') self.logger_object.log('Beginning of the test.')
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'{self.book_type}') folder_path = os.path.join(folder_path, f'{self.book_type}')
@@ -121,9 +123,11 @@ class BookSolver:
self.logger_object.log('End of the test.') self.logger_object.log('End of the test.')
def conversion(self): def conversion(self):
'''Function """
with downloading book from server Function
with sending to server''' - with downloading book from server
- with sending to server
"""
try: try:
self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.') self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
self.get_book_file() self.get_book_file()
@@ -140,9 +144,11 @@ class BookSolver:
raise exc raise exc
def conversion_local(self): def conversion_local(self):
'''Function """
without downloading book from server (local) Function
with sending to server''' - without downloading book from server (local)
- with sending to server
"""
try: try:
self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.file_path}') self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.file_path}')
with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json: with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json:

View File

@@ -2,21 +2,22 @@ import re
from typing import Union from typing import Union
from ebooklib.epub import Section, Link from ebooklib.epub import Section, Link
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
"""
These are data structures which form mapping from NCX to python data structures.
"""
class NavPoint: class NavPoint:
"""
Class - Navigation Point, - every html|xhtml from epub
These are data structures which form mapping from NCX to python data structures.
"""
def __init__(self, obj: Union[Link, Section] = None, ): def __init__(self, obj: Union[Link, Section] = None, ):
self.href, self.id = self.parse_href_id(obj) self.href, self.id = self.parse_href_id(obj)
self.title = obj.title self.title = obj.title
@staticmethod @staticmethod
def parse_href_id(item: Union[Link, Section]): def parse_href_id(item: Union[Link, Section]):
"""Function parses href & id from item.href"""
reg = r'(.+\..+\#)(.+)' reg = r'(.+\..+\#)(.+)'
match = re.search(reg, item.href) match = re.search(reg, item.href)
href, div_id = None, None href, div_id = None, None
@@ -36,13 +37,8 @@ class NavPoint:
return '<NavPoint: %s, %s>' % (self.href, self.id) return '<NavPoint: %s, %s>' % (self.href, self.id)
"""
These are data structures which form mapping to livecarta json structure.
"""
def flatten(x): def flatten(x):
""" magic function from stackoverflow for list flattening """ """magic function from stackoverflow for list flattening"""
atom = lambda i: not isinstance(i, list) atom = lambda i: not isinstance(i, list)
nil = lambda i: not i nil = lambda i: not i
car = lambda i: i[0] car = lambda i: i[0]
@@ -54,12 +50,18 @@ def flatten(x):
class ChapterItem: class ChapterItem:
"""
Class of Chapter that could have subchapters
These are data structures which form mapping to livecarta json structure.
"""
def __init__(self, title, content, sub_items): def __init__(self, title, content, sub_items):
self.title = title self.title = title
self.content = content self.content = content
self.sub_items = sub_items self.sub_items = sub_items
def to_dict(self, lvl=1): def to_dict(self, lvl=1):
"""Function returns dictionary of chapter"""
sub_dicts = [] sub_dicts = []
if self.sub_items: if self.sub_items:
for i in self.sub_items: for i in self.sub_items:

View File

@@ -12,6 +12,7 @@ from src.book_solver import BookSolver
class DocxBook(BookSolver): class DocxBook(BookSolver):
"""Class of .docx type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, html_path=None, def __init__(self, book_id=0, access=None, html_path=None,
main_logger=None, libra_locker=None): main_logger=None, libra_locker=None):
@@ -30,9 +31,7 @@ class DocxBook(BookSolver):
self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG) self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)
def convert_doc_to_html(self): def convert_doc_to_html(self):
""" """Method for convert .docx document to .html file."""
Method for convert .docx document to .html file.
"""
self.logger_object.log(f'File - {self.file_path}.') self.logger_object.log(f'File - {self.file_path}.')
print(f'{self.file_path}') print(f'{self.file_path}')
self.logger_object.log('Beginning of conversion from .docx to .html.') self.logger_object.log('Beginning of conversion from .docx to .html.')
@@ -92,9 +91,7 @@ class DocxBook(BookSolver):
self.logger_object.log(f'Input file path after conversion: {self.html_path}.') self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
def read_html(self): def read_html(self):
""" """Method for reading .html file into beautiful soup tag."""
Method for reading .html file into beautiful soup tag.
"""
try: try:
html_text = open(self.html_path, 'r', encoding='utf8').read() html_text = open(self.html_path, 'r', encoding='utf8').read()
self.logger_object.log('HTML for book has been loaded.') self.logger_object.log('HTML for book has been loaded.')
@@ -130,7 +127,6 @@ class DocxBook(BookSolver):
1. Convert docx to html with libra office 1. Convert docx to html with libra office
2. Parse and clean html, get list of tags, get footnotes 2. Parse and clean html, get list of tags, get footnotes
3. Parse from line structure to nested structure with JSONConverter 3. Parse from line structure to nested structure with JSONConverter
""" """
self.convert_doc_to_html() self.convert_doc_to_html()
self.check_output_directory() self.check_output_directory()

View File

@@ -35,9 +35,7 @@ class HTMLDocxPreprocessor:
tag.unwrap() tag.unwrap()
def _clean_underline_links(self): def _clean_underline_links(self):
""" """Function cleans meaningless <u> tags before links."""
Function cleans meaningless <u> tags before links.
"""
underlines = self.body_tag.find_all("u") underlines = self.body_tag.find_all("u")
for u in underlines: for u in underlines:
if u.find_all('a'): if u.find_all('a'):
@@ -79,9 +77,7 @@ class HTMLDocxPreprocessor:
return re.sub(size + "pt", str(new_size) + "px", style) return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self): def _font_to_span(self):
""" """Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
Function to convert <font> tag to <span>. If font style is default, then remove this tag.
"""
fonts = self.body_tag.find_all("font") fonts = self.body_tag.find_all("font")
for font in fonts: for font in fonts:
face = font.get("face") face = font.get("face")
@@ -119,9 +115,7 @@ class HTMLDocxPreprocessor:
self.content = self.content[ind:] self.content = self.content[ind:]
def clean_trash(self): def clean_trash(self):
""" """Function to remove all styles and tags we don't need."""
Function to remove all styles and tags we don't need.
"""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$')) self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$')) self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
@@ -140,9 +134,7 @@ class HTMLDocxPreprocessor:
table.decompose() table.decompose()
def _process_paragraph(self): def _process_paragraph(self):
""" """Function to process <p> tags (text-align and text-indent value)."""
Function to process <p> tags (text-align and text-indent value).
"""
paragraphs = self.body_tag.find_all('p') paragraphs = self.body_tag.find_all('p')
for p in paragraphs: for p in paragraphs:
@@ -193,9 +185,7 @@ class HTMLDocxPreprocessor:
p.attrs['style'] = style p.attrs['style'] = style
def _process_two_columns(self): def _process_two_columns(self):
""" """Function to process paragraphs which has two columns layout."""
Function to process paragraphs which has two columns layout.
"""
two_columns = self.body_tag.find_all("div", style="column-count: 2") two_columns = self.body_tag.find_all("div", style="column-count: 2")
for div in two_columns: for div in two_columns:
for child in div.children: for child in div.children:
@@ -204,9 +194,7 @@ class HTMLDocxPreprocessor:
div.unwrap() div.unwrap()
def _process_tables(self): def _process_tables(self):
""" """Function to process tables. Set "border" attribute."""
Function to process tables. Set "border" attribute.
"""
tables = self.body_tag.find_all("table") tables = self.body_tag.find_all("table")
for table in tables: for table in tables:
tds = table.find_all("td") tds = table.find_all("td")
@@ -296,9 +284,7 @@ class HTMLDocxPreprocessor:
return content.strip() return content.strip()
def _process_footnotes(self): def _process_footnotes(self):
""" """Function returns list of footnotes and delete them from html_soup."""
Function returns list of footnotes and delete them from html_soup.
"""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc') footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$')) footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors) footnote_amt = len(footnote_anchors)
@@ -404,9 +390,7 @@ class HTMLDocxPreprocessor:
div.decompose() div.decompose()
def _process_div(self): def _process_div(self):
""" """Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.
"""
divs = self.body_tag.find_all("div") divs = self.body_tag.find_all("div")
for div in divs: for div in divs:
@@ -423,9 +407,7 @@ class HTMLDocxPreprocessor:
return len(toc_links) > 0 return len(toc_links) > 0
def _process_toc_links(self): def _process_toc_links(self):
""" """Function to extract nodes which contains TOC links, remove links from file and detect headers."""
Function to extract nodes which contains TOC links, remove links from file and detect headers.
"""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')}) toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links] headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1> outline_level = "1" # All the unknown outlines will be predicted as <h1>
@@ -448,13 +430,11 @@ class HTMLDocxPreprocessor:
@staticmethod @staticmethod
def clean_title_from_numbering(title: str): def clean_title_from_numbering(title: str):
""" """Function to remove digits from headers."""
Function to remove digits from headers.
"""
title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title return title
@staticmethod @staticmethod
@@ -485,9 +465,7 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child(children[0], func) self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self): def _preprocessing_headings(self):
""" """Function to convert all lower level headings to p tags"""
Function to convert all lower level headings to p tags
"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern)) header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags: for tag in header_tags:
@@ -561,9 +539,7 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self): def _process_headings(self):
""" """Function to process tags <h>."""
Function to process tags <h>.
"""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$")) header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span> # 1. remove <b>, <span>
@@ -634,9 +610,7 @@ class HTMLDocxPreprocessor:
il_tag.p.unwrap() il_tag.p.unwrap()
def process_html(self, access, html_path, book_id): def process_html(self, access, html_path, book_id):
""" """Process html code to satisfy LiveCarta formatting."""
Process html code to satisfy LiveCarta formatting.
"""
try: try:
self.logger_object.log(f'Processing TOC and headers.') self.logger_object.log(f'Processing TOC and headers.')
self._process_toc_links() self._process_toc_links()

View File

@@ -90,9 +90,7 @@ class LibraHTML2JSONConverter:
return True return True
def convert_to_dict(self): def convert_to_dict(self):
""" """Function which convert list of html nodes to appropriate json structure."""
Function which convert list of html nodes to appropriate json structure.
"""
json_strc = [] json_strc = []
ind = 0 ind = 0
ch_num = 0 ch_num = 0

View File

@@ -11,9 +11,9 @@ from itertools import takewhile
from src.util.color_reader import str2hex from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL) cssutils.log.setLevel(CRITICAL)
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56, sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69, 1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0] 2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
@@ -29,6 +29,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
def convert_font_size(value): def convert_font_size(value):
""" Function converts font-size in mapping """
if 'pt' in value: if 'pt' in value:
if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE: if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
return '' return ''
@@ -58,6 +59,7 @@ def convert_font_size(value):
def convert_indents(value): def convert_indents(value):
""" Function converts text-indent and margin-left values to px """
# 30px = 3.2% = 1.25em = 23pt # 30px = 3.2% = 1.25em = 23pt
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)') text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)')
has_style_attrs = re.search(text_indent_regexp, value) has_style_attrs = re.search(text_indent_regexp, value)
@@ -115,13 +117,6 @@ LIVECARTA_STYLE_ATTRS = {
'margin-left': [] 'margin-left': []
} }
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention.
"""
def get_bg_color(x): def get_bg_color(x):
color = str2hex(x) color = str2hex(x)
@@ -135,6 +130,12 @@ def get_text_color(x):
return color return color
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention.
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents, 'text-indent': convert_indents,
'font-variant': lambda x: x, 'font-variant': lambda x: x,
@@ -178,8 +179,10 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
def check_style_to_be_tag(style) -> List[tuple]: def check_style_to_be_tag(style) -> List[tuple]:
""" Some css style properties converts to tags. """
Search for them and prepare list of properties to be removed from style string""" Some css style properties converts to tags.
Search for them and prepare list of properties to be removed from style string
"""
to_remove = [] to_remove = []
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG: for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style: if f'{k[0]}:{k[1]}' in style:
@@ -208,6 +211,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type):
def build_css_content(css_content): def build_css_content(css_content):
""" Build css content with livecarta convention """
sheet = cssutils.parseString(css_content, validate=False) sheet = cssutils.parseString(css_content, validate=False)
for css_rule in sheet: for css_rule in sheet:
@@ -231,6 +235,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def remove_white_if_no_bgcolor(style_, tag): def remove_white_if_no_bgcolor(style_, tag):
""" Function remove white color if there is no text bg color """
if 'background' in style_: if 'background' in style_:
return style_ return style_
@@ -260,8 +265,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def process_indents_to_px(split_style: list) -> str: def process_indents_to_px(split_style: list) -> str:
# clean with convert_indents() style string and make new clean_style """ Function cleans using convert_indents() style string and returns new clean_style """
clean_style = '' clean_style = ''
for item in split_style: for item in split_style:
item = item.split(':') item = item.split(':')
@@ -276,7 +280,7 @@ class TagStyleConverter:
has_margin_left = re.search(margin_left_regexp, clean_style) has_margin_left = re.search(margin_left_regexp, clean_style)
has_text_indent = re.search(text_indent_regexp, clean_style) has_text_indent = re.search(text_indent_regexp, clean_style)
#formula_of_indent: indent = abs(margin_left - text_indent) # formula_of_indent: indent = abs(margin_left - text_indent)
if has_margin_left: if has_margin_left:
num_ml = abs(int("".join( num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2)))))) filter(str.isdigit, str(has_margin_left.group(2))))))
@@ -302,6 +306,7 @@ class TagStyleConverter:
def preprocess_style(self): def preprocess_style(self):
def remove_extra_spaces(style: str) -> List: def remove_extra_spaces(style: str) -> List:
""" Function to remove extra spaces in style to process clean_style """
# replace all spaces between '; & letter' to ';' # replace all spaces between '; & letter' to ';'
style = re.sub(r"; *", ";", style) style = re.sub(r"; *", ";", style)
split_style = style.split(';') split_style = style.split(';')
@@ -381,7 +386,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_p_to_save_style_attrs(tag): def wrap_span_in_p_to_save_style_attrs(tag):
'''Function designed to save style attrs that cannot be in p -> span''' """ Function designed to save style attrs that cannot be in p -> span """
if tag.name == 'p' and tag.attrs.get('style'): if tag.name == 'p' and tag.attrs.get('style'):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']] if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
@@ -414,6 +419,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_li_to_save_style_attrs(tag): def wrap_span_in_li_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in li -> span """
if tag.name == 'li' and tag.attrs.get('style'): if tag.name == 'li' and tag.attrs.get('style'):
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
attr not in ['text-align', 'list-style-type']] attr not in ['text-align', 'list-style-type']]
@@ -441,6 +447,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag): def wrap_span_in_ul_ol_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in ul/ol -> span """
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'): if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
styles_cant_be_in_ul_ol = [ styles_cant_be_in_ul_ol = [
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']] attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
@@ -465,6 +472,7 @@ class TagStyleConverter:
@staticmethod @staticmethod
def wrap_span_in_h_to_save_style_attrs(tag): def wrap_span_in_h_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in h -> span """
h_regexp = re.compile('(^h[1-9]$)') h_regexp = re.compile('(^h[1-9]$)')
if re.search(h_regexp, tag.name) and tag.attrs.get('style'): if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
@@ -487,6 +495,7 @@ class TagStyleConverter:
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str): def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
""" Function adds styles from .css to inline style """
css_text = css_text.replace( css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '') '@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []

View File

@@ -20,7 +20,7 @@ from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \ from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
update_src_links_in_images, preprocess_footnotes update_images_src_links, preprocess_footnotes
class EpubConverter: class EpubConverter:
@@ -48,7 +48,7 @@ class EpubConverter:
# flag to be updated while ebooklib.toc is parsed # flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes self.img_href2img_bytes = {} # file path to bytes
self.old_image_path2aws_path = {} # file path from <a> to generated aws path self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote self.footnotes: List[Tag] = [] # end of the footnote
@@ -124,12 +124,12 @@ class EpubConverter:
return css_content return css_content
def build_html_and_css_relations(self): def build_html_and_css_relations(self):
''' """
This function is designed to get 2 dictionaries: This function is designed to get 2 dictionaries:
The first is css_href2css_content. It is created to connect href of css to content of css The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value ...2... = key2value
''' """
# dictionary: href of html to related css files # dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list) html_href2css_href: defaultdict = defaultdict(list)
@@ -159,10 +159,10 @@ class EpubConverter:
return html_href2css_href, css_href2css_content, return html_href2css_href, css_href2css_content,
def add_css_styles_to_html_soup(self): def add_css_styles_to_html_soup(self):
''' """
This function is designed to update html_href2html_body_soup This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content And add to html_inline_style css_style_content
''' """
for html_href in self.html_href2html_body_soup: for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href): if self.html_href2css_href.get(html_href):
css = '' css = ''
@@ -179,6 +179,7 @@ class EpubConverter:
return links return links
# t_nodes = []
def build_adjacency_list_from_toc(self, element, lvl=0): def build_adjacency_list_from_toc(self, element, lvl=0):
""" """
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
@@ -211,25 +212,31 @@ class EpubConverter:
sub_nodes = [] sub_nodes = []
for i in second: for i in second:
# if 'chapter' in (i.title.lower() if isinstance(i, Link) else i[0].title.lower()):
# self.t_nodes.append(self.build_adjacency_list_from_toc(i, lvl))
# else:
sub_nodes.append( sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1)) self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[nav_point] = sub_nodes self.adjacency_list[nav_point] = sub_nodes
self.hrefs_added_to_toc.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
return nav_point return nav_point
elif isinstance(element, list) and (lvl == 0): elif isinstance(element, list) and (lvl == 0):
sub_nodes = [] nodes = []
for i in element: for i in element:
sub_nodes.append( nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1)) self.build_adjacency_list_from_toc(i, lvl + 1))
# for j in self.t_nodes:
self.adjacency_list[-1] = sub_nodes # nodes.append(j)
# self.t_nodes = []
#
# self.adjacency_list[-1] = nodes
else: else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}' assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self): def is_toc_empty(self):
""" Function checks is toc empty """
# there is no toc in ebook or no top chapters # there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True return True
@@ -247,6 +254,7 @@ class EpubConverter:
self.hrefs_added_to_toc.add(nav_point.href) self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added): def add_not_added_files_to_adjacency_list(self, not_added):
""" Function add files that not added to adjacency list """
for i, file in enumerate(not_added): for i, file in enumerate(not_added):
nav_point = NavPoint( nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file)) Section(f'To check #{i}, filename: {file}', file))
@@ -315,6 +323,11 @@ class EpubConverter:
return full_path[0] return full_path[0]
def process_internal_links(self): def process_internal_links(self):
"""
Function
- processing internal links in a book
- make ids unique
"""
# 1. rebuild ids to be unique in all documents # 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc: for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}): for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
@@ -429,6 +442,7 @@ class EpubConverter:
self.build_one_chapter(sub_node) self.build_one_chapter(sub_node)
def define_chapters_content(self): def define_chapters_content(self):
""" Function build chapters content starts from top level chapters """
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points: for point in top_level_nav_points:
@@ -441,12 +455,12 @@ class EpubConverter:
nav_point.href, nav_point.id)] nav_point.href, nav_point.id)]
else: else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href] content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.old_image_path2aws_path = update_src_links_in_images(content, self.book_image_src_path2aws_path = update_images_src_links(content,
self.img_href2img_bytes, self.img_href2img_bytes,
path_to_html=nav_point.href, path_to_html=nav_point.href,
access=self.access, access=self.access,
path2aws_path=self.old_image_path2aws_path, path2aws_path=self.book_image_src_path2aws_path,
book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id') book_id=self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title) title_preprocessed = prepare_title(title)
@@ -466,6 +480,7 @@ class EpubConverter:
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes) return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self): def convert_to_dict(self):
""" Function which convert list of html nodes to appropriate json structure. """
top_level_nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = [] top_level_chapters = []
@@ -491,7 +506,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0) logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../../epub/9781641051217.epub', json_converter = EpubConverter('../../epub/9781614382263.epub',
logger=logger_object) logger=logger_object)
tmp = json_converter.convert_to_dict() tmp = json_converter.convert_to_dict()

View File

@@ -2,12 +2,17 @@ from src.book_solver import BookSolver
from src.epub_converter.epub_converter import EpubConverter from src.epub_converter.epub_converter import EpubConverter
class EpubBook(BookSolver): class EpubBook(BookSolver):
""" Class of .epub type book - child of BookSolver """
def __init__(self, book_id=0, access=None, main_logger=None): def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger) super().__init__(book_id, access, main_logger)
self.book_type = 'epub' self.book_type = 'epub'
def get_converted_book(self): def get_converted_book(self):
"""
1. Convert epub to html
2. Parse from line structure to nested structure
"""
json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object) json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object)
content_dict = json_converter.convert_to_dict() content_dict = json_converter.convert_to_dict()
self.status_wrapper.set_generating() self.status_wrapper.set_generating()

View File

@@ -10,6 +10,7 @@ from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id): def save_image_locally(img_file_path, img_content, book_id):
""" Function saves all images locally """
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join( new_path = pathlib.Path(os.path.join(
folder_path, f'../json/img_{book_id}/')) folder_path, f'../json/img_{book_id}/'))
@@ -24,17 +25,19 @@ def save_image_locally(img_file_path, img_content, book_id):
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id): def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
link = access.send_image( """ Function saves all images to Amazon web service """
link_path = access.send_image(
img_file_path, doc_id=book_id, img_content=img_content) img_file_path, doc_id=book_id, img_content=img_content)
return link return link_path
def update_src_links_in_images(body_tag: Tag, def update_images_src_links(body_tag: Tag,
href2img_content: dict, href2img_content: dict,
path_to_html, path_to_html,
access=None, access=None,
path2aws_path=None, path2aws_path=None,
book_id=None): book_id=None):
""" Function makes dictionary image_src_path -> Amazon web service_path """
img_tags = body_tag.find_all('img') img_tags = body_tag.find_all('img')
for img in img_tags: for img in img_tags:
@@ -65,16 +68,16 @@ def update_src_links_in_images(body_tag: Tag,
del img.attrs['height'] del img.attrs['height']
if img.attrs.get('style'): if img.attrs.get('style'):
del img.attrs['style'] del img.attrs['style']
return path2aws_path return path2aws_path
def preprocess_table(body_tag: BeautifulSoup): def preprocess_table(body_tag: BeautifulSoup):
""" Function to preprocess tables and tags(td|th|tr): style """
tables = body_tag.find_all("table") tables = body_tag.find_all("table")
for table in tables: for table in tables:
tds = table.find_all(re.compile("td|th|tr")) ts = table.find_all(re.compile("td|th|tr"))
for td in tds: for t_tag in ts:
style = td.get('style') style = t_tag.get('style')
width = '' width = ''
if style: if style:
width_match = re.search( width_match = re.search(
@@ -84,13 +87,13 @@ def preprocess_table(body_tag: BeautifulSoup):
units = width_match.group(2) units = width_match.group(2)
width = size+'px' width = size+'px'
td.attrs['width'] = td.get('width') or width t_tag.attrs['width'] = t_tag.get('width') or width
if td.attrs.get('style'): if t_tag.attrs.get('style'):
td.attrs['style'] = td.attrs['style'].replace('border:0;', '') t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '')
if td.attrs.get('style') == '': elif t_tag.attrs.get('style') == '':
del td.attrs['style'] del t_tag.attrs['style']
if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']: if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
table.attrs['border'] = '1' table.attrs['border'] = '1'
@@ -110,6 +113,7 @@ def process_lists(body_tag):
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_): def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
""" Function inserts span before tag to be removed(aren't supported by livecarta) """
new_tag = main_tag.new_tag("span") new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or '' new_tag.attrs['id'] = id_ or ''
new_tag.attrs['class'] = class_ or '' new_tag.attrs['class'] = class_ or ''
@@ -153,9 +157,7 @@ def clean_headings_content(content: Tag, title: str):
def heading_tag_to_p_tag(body_tag): def heading_tag_to_p_tag(body_tag):
""" """ Function to convert all lower level headings to p tags """
Function to convert all lower level headings to p tags
"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern)) header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags: for tag in header_tags:
@@ -163,17 +165,16 @@ def heading_tag_to_p_tag(body_tag):
def clean_title_from_numbering(title: str): def clean_title_from_numbering(title: str):
""" """ Function removes numbering from titles """
Function to remove digits from headers.
"""
title = re.sub(r'^(\s+)+', '', title) title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title) title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title # title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title return title
def replace_with_livecarta_anchor_tag(anchor, i): def replace_with_livecarta_anchor_tag(anchor, i):
""" Function replace noteref_tag(anchor) with new livecarta tag """
new_tag = BeautifulSoup(features='lxml').new_tag('sup') new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element' new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1 new_tag['data-id'] = i + 1
@@ -188,11 +189,11 @@ def replace_with_livecarta_anchor_tag(anchor, i):
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]: -> Tuple[list, list, list]:
""" """
This function preprocessing footnotes
This function should be earlier that adding fonts in pipeline. This function should be earlier that adding fonts in pipeline.
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p> <p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside> <aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
""" """
footnotes = [] footnotes = []
noterefs_tags = source_html_tag.find_all( noterefs_tags = source_html_tag.find_all(
@@ -205,12 +206,14 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
new_footnotes_tags = [] new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags] [tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str): def parse_a_tag_href(s: str) -> Tuple[str, str]:
""" Returns name of file & id of an anchor """
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.' assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
f, id_ = s.split('#') f, id_ = s.split('#')
return f, id_ return f, id_
def verify_footnote_tag(tags: list): def verify_footnote_tag(tags: list):
""" Function verifies is tag - footnote """
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}' assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0: if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id)) anchored_tags = list(target_html_tag.find_all(id=element_id))
@@ -275,7 +278,7 @@ def unwrap_structural_tags(body_tag):
""" """
def _preserve_class_in_aside_tag(tag_): def _preserve_class_in_aside_tag(tag_):
# to save css style inherited from class, copy class to aside tag (which is parent to tag_) """ to save css style inherited from class, copy class to aside tag (which is parent to tag_) """
# this is for Wiley books with boxes # this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance( tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0] tag_.attrs['class'], list) else tag_.attrs['class'][0]
@@ -284,10 +287,11 @@ def unwrap_structural_tags(body_tag):
tag_.parent.attrs['class'] = tag_class tag_.parent.attrs['class'] = tag_class
def preserve_class_in_section_tag(tag_) -> bool: def preserve_class_in_section_tag(tag_) -> bool:
# to save css style inherited from class, copy class to child <p> """
to save css style inherited from class, copy class to child <p>
returns True, if <section> could be unwrapped
"""
# this is for Wiley books with boxes # this is for Wiley books with boxes
# returns True, if <section> could be unwrapped
tag_class = tag_.attrs['class'] if not isinstance( tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0] tag_.attrs['class'], list) else tag_.attrs['class'][0]
if 'feature' not in tag_class: if 'feature' not in tag_class:
@@ -312,6 +316,10 @@ def unwrap_structural_tags(body_tag):
class_=tag_to_be_removed.attrs.get('class')) class_=tag_to_be_removed.attrs.get('class'))
def replace_div_tag_with_table(): def replace_div_tag_with_table():
"""Function replace <div> with <table>:
1. Convert div with certain classes to tables
2. Add background color to div with background-color
"""
for div in body_tag.find_all("div"): for div in body_tag.find_all("div"):
if div.attrs.get('class'): if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance( div_class = div.attrs['class'] if not isinstance(
@@ -348,12 +356,12 @@ def unwrap_structural_tags(body_tag):
continue continue
add_span_to_save_ids_for_links(div) add_span_to_save_ids_for_links(div)
div.unwrap() div.unwrap()
# comments removal # comments removal
for tag in body_tag.find_all(): for tag in body_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)): for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract() element.extract()
replace_div_tag_with_table() replace_div_tag_with_table()
for s in body_tag.find_all("section"): for s in body_tag.find_all("section"):
@@ -458,23 +466,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
return tags return tags
def wrap_preformatted_span_with_table(main_tag, old_tag):
table = main_tag.new_tag("table")
table.attrs['border'] = '1px #ccc;'
table.attrs['style'] = 'width:100%;'
tbody = main_tag.new_tag("tbody")
tr = main_tag.new_tag("tr")
td = main_tag.new_tag("td")
td.attrs['bgcolor'] = '#f5f5f5'
# td.attrs['border-radius'] = '4px'
old_tag.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
return table
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None): def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
""" Function wraps <block> with <table> """
table = main_tag.new_tag("table") table = main_tag.new_tag("table")
table.attrs['border'] = border table.attrs['border'] = border
table.attrs['align'] = 'center' table.attrs['align'] = 'center'
@@ -497,7 +490,6 @@ def clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")}) hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs: for hr in hrs:
hr.extract() hr.extract()
print(hr)
h = block.find(re.compile("h[1-9]")) h = block.find(re.compile("h[1-9]"))
if h: if h:
h.name = "p" h.name = "p"
@@ -505,6 +497,7 @@ def clean_wiley_block(block):
def preprocess_block_tags(chapter_tag): def preprocess_block_tags(chapter_tag):
""" Function preprocessing <block> tags """
for block in chapter_tag.find_all("blockquote"): for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']: if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
clean_wiley_block(block) clean_wiley_block(block)
@@ -527,7 +520,7 @@ def preprocess_block_tags(chapter_tag):
def prepare_formatted(text): def prepare_formatted(text):
# replace <,> to save them as is in html code """ Function replaces special symbols with their Unicode representation """
text = text.replace("<", "\x3C") text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E") text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp; text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
@@ -536,7 +529,25 @@ def prepare_formatted(text):
return text return text
def wrap_preformatted_span_with_table(main_tag, old_tag):
""" Function wraps <span> with <table> """
table = main_tag.new_tag("table")
table.attrs['border'] = '1px #ccc;'
table.attrs['style'] = 'width:100%;'
tbody = main_tag.new_tag("tbody")
tr = main_tag.new_tag("tr")
td = main_tag.new_tag("td")
td.attrs['bgcolor'] = '#f5f5f5'
# td.attrs['border-radius'] = '4px'
old_tag.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
return table
def preprocess_pre_tags(chapter_tag): def preprocess_pre_tags(chapter_tag):
""" Function preprocessing <pre> tags """
for pre in chapter_tag.find_all("pre"): for pre in chapter_tag.find_all("pre"):
new_tag = BeautifulSoup(features='lxml').new_tag("span") new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy() new_tag.attrs = pre.attrs.copy()
@@ -575,7 +586,7 @@ def preprocess_pre_tags(chapter_tag):
def preprocess_code_tags(chapter_tag): def preprocess_code_tags(chapter_tag):
# function that emulates style of <code>, <kdb>, <var> """ Function that emulates style of <code>, <kdb>, <var> """
for code in chapter_tag.find_all(re.compile("code|kdb|var")): for code in chapter_tag.find_all(re.compile("code|kdb|var")):
code.name = 'span' code.name = 'span'
if code.parent.name == "pre": if code.parent.name == "pre":
@@ -584,9 +595,7 @@ def preprocess_code_tags(chapter_tag):
def prepare_title(title_of_chapter: str) -> str: def prepare_title(title_of_chapter: str) -> str:
""" """ Function finalise processing/cleaning title """
Final processing/cleaning function.
"""
title_str = BeautifulSoup(title_of_chapter, features='lxml').string title_str = BeautifulSoup(title_of_chapter, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str) title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip() title_str = re.sub(r' +', ' ', title_str).rstrip()
@@ -596,7 +605,11 @@ def prepare_title(title_of_chapter: str) -> str:
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str: def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
""" """
Final processing/cleaning function. Function finalise processing/cleaning content
1. cleaning \n
2. heading removal
3. processing tags
4. class removal
""" """
# 0. cleaning \n # 0. cleaning \n
to_remove = [] to_remove = []
@@ -609,13 +622,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
# 1. heading removal # 1. heading removal
if remove_title_from_chapter: if remove_title_from_chapter:
clean_headings_content(content_tag, title_str) clean_headings_content(content_tag, title_str)
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
process_lists(content_tag) process_lists(content_tag)
preprocess_table(content_tag) preprocess_table(content_tag)
preprocess_code_tags(content_tag) preprocess_code_tags(content_tag)
preprocess_pre_tags(content_tag) preprocess_pre_tags(content_tag)
preprocess_block_tags(content_tag) preprocess_block_tags(content_tag)
# 2. class removal # 3. class removal
for tag in content_tag.find_all(recursive=True): for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor', if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']): 'footnote-element']):

View File

@@ -1,5 +1,5 @@
class LiveCartaConfig: class LiveCartaConfig:
"""Class of values that LiveCarta platform using and supports"""
SUPPORTED_LEVELS = 5 SUPPORTED_LEVELS = 5
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"} SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"} HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}

View File

@@ -6,6 +6,7 @@ from webcolors import html4_hex_to_names, hex_to_rgb, rgb_to_name, rgb_percent_t
def closest_colour_rgb(requested_color): def closest_colour_rgb(requested_color):
""" Function finds closes colour rgb """
min_colours = {} min_colours = {}
for key, name in html4_hex_to_names.items(): for key, name in html4_hex_to_names.items():
r_c, g_c, b_c = hex_to_rgb(key) r_c, g_c, b_c = hex_to_rgb(key)
@@ -18,6 +19,7 @@ def closest_colour_rgb(requested_color):
def rgb2color_name(color): def rgb2color_name(color):
""" Transform rgb -> color name """
try: try:
closest_name = actual_name = rgb_to_name(color, 'html4') closest_name = actual_name = rgb_to_name(color, 'html4')
except ValueError: except ValueError:
@@ -30,6 +32,7 @@ def rgb2color_name(color):
def hex2color_name(color): def hex2color_name(color):
""" Transform hex -> color name """
try: try:
color = hex_to_rgb(color) color = hex_to_rgb(color)
except ValueError: except ValueError:
@@ -47,6 +50,7 @@ def hex2color_name(color):
def str2closest_html_color_name(s: str): def str2closest_html_color_name(s: str):
""" Transform str -> closest color name """
if 'rgb' in s: if 'rgb' in s:
rgb_str = 'rgba' if ('rgba' in s) else 'rgb' rgb_str = 'rgba' if ('rgba' in s) else 'rgb'
s = s.replace(rgb_str, '').replace('(', '').replace(')', '') s = s.replace(rgb_str, '').replace('(', '').replace(')', '')
@@ -80,6 +84,7 @@ def str2closest_html_color_name(s: str):
def rgba2rgb(r, g, b, alpha): def rgba2rgb(r, g, b, alpha):
""" Transform rgba -> rgb """
r_background, g_background, b_background = 255, 255, 255 r_background, g_background, b_background = 255, 255, 255
r_new = int((1 - alpha) * r_background + alpha * r) r_new = int((1 - alpha) * r_background + alpha * r)
g_new = int((1 - alpha) * g_background + alpha * g) g_new = int((1 - alpha) * g_background + alpha * g)
@@ -88,6 +93,7 @@ def rgba2rgb(r, g, b, alpha):
def str2hex(s: str): def str2hex(s: str):
""" Transform str -> hex """
if '#' in s and (len(s) <= 7): if '#' in s and (len(s) <= 7):
return s.lower() return s.lower()

View File

@@ -3,6 +3,7 @@ import logging
class ColoredFormatter(logging.Formatter): class ColoredFormatter(logging.Formatter):
""" Class to prettify logger and command line output """
MAPPING = { MAPPING = {
'DEBUG': 37, # white 'DEBUG': 37, # white
'INFO': 36, # cyan 'INFO': 36, # cyan
@@ -61,9 +62,7 @@ class BookLogger:
self.logger.log(msg=message, level=logging_level, stacklevel=2) self.logger.log(msg=message, level=logging_level, stacklevel=2)
def log_error_to_main_log(self, message=''): def log_error_to_main_log(self, message=''):
""" """ Method for logging error to main log file. """
Method for logging error to main log file.
"""
if self.main_logger: if self.main_logger:
if not message: if not message:
message = f'Error in book conversion. Check log file.' message = f'Error in book conversion. Check log file.'
@@ -71,6 +70,8 @@ class BookLogger:
class BookStatusWrapper: class BookStatusWrapper:
"""Class sets/updates statuses of Converter on Platform"""
def __init__(self, access, logger_object, book_id=0): def __init__(self, access, logger_object, book_id=0):
self.access = access self.access = access
self.logger_object = logger_object self.logger_object = logger_object