Wrote documentation for every function/class in the .py files

This commit is contained in:
Kiryl
2021-12-10 10:53:40 +03:00
parent ef3502cd0a
commit 4b1109e6b4
13 changed files with 198 additions and 172 deletions

View File

@@ -54,7 +54,6 @@ def convert_book(book_type: [DocxBook, EpubBook], params: dict, logger, book_id)
raise exc
logger.info(f'Book-{book_id} has been proceeded.')
print('Book has been proceeded.')
def callback(ch, method, properties, body, logger, libra_locker):

View File

@@ -1,10 +1,3 @@
""" This is Main Abstract class for solving a task of a book conversion
Having an id of coming book, gets book from server, runs conversion.
In parallel it updates status of a book conversion on admin panel.
Finally sends result to server.
Result is a json, JSON schema in book_schema.json
"""
import os
import json
import codecs
@@ -17,6 +10,14 @@ from src.util.helpers import BookLogger, BookStatusWrapper
class BookSolver:
"""
This is Main Abstract class for solving a task of a book conversion
Having an id of coming book, gets book from server, runs conversion.
In parallel it updates status of a book conversion on admin panel.
Finally sends result to server.
Result is a json, JSON schema in book_schema.json
"""
__metaclass__ = ABCMeta
def __init__(self, book_id=0, access=None, main_logger=None):
@@ -55,9 +56,7 @@ class BookSolver:
self.file_path = pathlib.Path(file_path)
def get_book_file(self):
"""
Method for getting and saving book from server.
"""
""" Method for getting and saving book from server. """
try:
self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
content = self.access.get_book(self.book_id)
@@ -92,6 +91,7 @@ class BookSolver:
self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR)
def send_json_content_to_server(self, content: dict):
""" Function sends json_content to site """
try:
self.access.send_book(self.book_id, content)
self.logger_object.log(f'JSON data has been sent to server.')
@@ -108,8 +108,10 @@ class BookSolver:
return {}
def test_conversion(self):
'''Function
without sending to server'''
"""
Function
- without sending to server
"""
self.logger_object.log('Beginning of the test.')
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'{self.book_type}')
@@ -121,9 +123,11 @@ class BookSolver:
self.logger_object.log('End of the test.')
def conversion(self):
'''Function
with downloading book from server
with sending to server'''
"""
Function
- with downloading book from server
- with sending to server
"""
try:
self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
self.get_book_file()
@@ -140,9 +144,11 @@ class BookSolver:
raise exc
def conversion_local(self):
'''Function
without downloading book from server (local)
with sending to server'''
"""
Function
- without downloading book from server (local)
- with sending to server
"""
try:
self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.file_path}')
with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json:

View File

@@ -2,21 +2,22 @@ import re
from typing import Union
from ebooklib.epub import Section, Link
from src.livecarta_config import LiveCartaConfig
"""
These are data structures which form mapping from NCX to python data structures.
"""
class NavPoint:
"""
Class - Navigation Point, - every html|xhtml from epub
These are data structures which form mapping from NCX to python data structures.
"""
def __init__(self, obj: Union[Link, Section] = None, ):
self.href, self.id = self.parse_href_id(obj)
self.title = obj.title
@staticmethod
def parse_href_id(item: Union[Link, Section]):
"""Function parses href & id from item.href"""
reg = r'(.+\..+\#)(.+)'
match = re.search(reg, item.href)
href, div_id = None, None
@@ -36,13 +37,8 @@ class NavPoint:
return '<NavPoint: %s, %s>' % (self.href, self.id)
"""
These are data structures which form mapping to livecarta json structure.
"""
def flatten(x):
""" magic function from stackoverflow for list flattening """
"""magic function from stackoverflow for list flattening"""
atom = lambda i: not isinstance(i, list)
nil = lambda i: not i
car = lambda i: i[0]
@@ -54,12 +50,18 @@ def flatten(x):
class ChapterItem:
"""
Class of Chapter that could have subchapters
These are data structures which form mapping to livecarta json structure.
"""
def __init__(self, title, content, sub_items):
self.title = title
self.content = content
self.sub_items = sub_items
def to_dict(self, lvl=1):
"""Function returns dictionary of chapter"""
sub_dicts = []
if self.sub_items:
for i in self.sub_items:
@@ -86,4 +88,4 @@ class ChapterItem:
}
def __str__(self):
return '<Chapter: %s>' % self.title
return '<Chapter: %s>' % self.title

View File

@@ -12,6 +12,7 @@ from src.book_solver import BookSolver
class DocxBook(BookSolver):
"""Class of .docx type book - child of BookSolver"""
def __init__(self, book_id=0, access=None, html_path=None,
main_logger=None, libra_locker=None):
@@ -30,9 +31,7 @@ class DocxBook(BookSolver):
self.logger_object.log(f'Any error while libra conversion for book_{self.book_id}: {result.stderr}', logging.DEBUG)
def convert_doc_to_html(self):
"""
Method for convert .docx document to .html file.
"""
"""Method for convert .docx document to .html file."""
self.logger_object.log(f'File - {self.file_path}.')
print(f'{self.file_path}')
self.logger_object.log('Beginning of conversion from .docx to .html.')
@@ -92,9 +91,7 @@ class DocxBook(BookSolver):
self.logger_object.log(f'Input file path after conversion: {self.html_path}.')
def read_html(self):
"""
Method for reading .html file into beautiful soup tag.
"""
"""Method for reading .html file into beautiful soup tag."""
try:
html_text = open(self.html_path, 'r', encoding='utf8').read()
self.logger_object.log('HTML for book has been loaded.')
@@ -130,7 +127,6 @@ class DocxBook(BookSolver):
1. Convert docx to html with libra office
2. Parse and clean html, get list of tags, get footnotes
3. Parse from line structure to nested structure with JSONConverter
"""
self.convert_doc_to_html()
self.check_output_directory()

View File

@@ -35,9 +35,7 @@ class HTMLDocxPreprocessor:
tag.unwrap()
def _clean_underline_links(self):
"""
Function cleans meaningless <u> tags before links.
"""
"""Function cleans meaningless <u> tags before links."""
underlines = self.body_tag.find_all("u")
for u in underlines:
if u.find_all('a'):
@@ -79,9 +77,7 @@ class HTMLDocxPreprocessor:
return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self):
"""
Function to convert <font> tag to <span>. If font style is default, then remove this tag.
"""
"""Function to convert <font> tag to <span>. If font style is default, then remove this tag."""
fonts = self.body_tag.find_all("font")
for font in fonts:
face = font.get("face")
@@ -119,9 +115,7 @@ class HTMLDocxPreprocessor:
self.content = self.content[ind:]
def clean_trash(self):
"""
Function to remove all styles and tags we don't need.
"""
"""Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
@@ -140,9 +134,7 @@ class HTMLDocxPreprocessor:
table.decompose()
def _process_paragraph(self):
"""
Function to process <p> tags (text-align and text-indent value).
"""
"""Function to process <p> tags (text-align and text-indent value)."""
paragraphs = self.body_tag.find_all('p')
for p in paragraphs:
@@ -193,9 +185,7 @@ class HTMLDocxPreprocessor:
p.attrs['style'] = style
def _process_two_columns(self):
"""
Function to process paragraphs which has two columns layout.
"""
"""Function to process paragraphs which has two columns layout."""
two_columns = self.body_tag.find_all("div", style="column-count: 2")
for div in two_columns:
for child in div.children:
@@ -204,9 +194,7 @@ class HTMLDocxPreprocessor:
div.unwrap()
def _process_tables(self):
"""
Function to process tables. Set "border" attribute.
"""
"""Function to process tables. Set "border" attribute."""
tables = self.body_tag.find_all("table")
for table in tables:
tds = table.find_all("td")
@@ -296,9 +284,7 @@ class HTMLDocxPreprocessor:
return content.strip()
def _process_footnotes(self):
"""
Function returns list of footnotes and delete them from html_soup.
"""
"""Function returns list of footnotes and delete them from html_soup."""
footnote_anchors = self.body_tag.find_all('a', class_='sdfootnoteanc')
footnote_content = self.body_tag.find_all('div', id=re.compile(r'^sdfootnote\d+$'))
footnote_amt = len(footnote_anchors)
@@ -404,9 +390,7 @@ class HTMLDocxPreprocessor:
div.decompose()
def _process_div(self):
"""
Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay.
"""
"""Function to process <div> tags. All the tags will be deleted from file, all content of the tags will stay."""
divs = self.body_tag.find_all("div")
for div in divs:
@@ -423,9 +407,7 @@ class HTMLDocxPreprocessor:
return len(toc_links) > 0
def _process_toc_links(self):
"""
Function to extract nodes which contains TOC links, remove links from file and detect headers.
"""
"""Function to extract nodes which contains TOC links, remove links from file and detect headers."""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
@@ -448,13 +430,11 @@ class HTMLDocxPreprocessor:
@staticmethod
def clean_title_from_numbering(title: str):
"""
Function to remove digits from headers.
"""
"""Function to remove digits from headers."""
title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
@staticmethod
@@ -485,9 +465,7 @@ class HTMLDocxPreprocessor:
self.apply_func_to_last_child(children[0], func)
def _preprocessing_headings(self):
"""
Function to convert all lower level headings to p tags
"""
"""Function to convert all lower level headings to p tags"""
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags:
@@ -561,9 +539,7 @@ class HTMLDocxPreprocessor:
self.top_level_headers[i]['should_be_numbered'] = True
def _process_headings(self):
"""
Function to process tags <h>.
"""
"""Function to process tags <h>."""
header_tags = self.body_tag.find_all(re.compile("^h[1-9]$"))
# 1. remove <b>, <span>
@@ -634,9 +610,7 @@ class HTMLDocxPreprocessor:
il_tag.p.unwrap()
def process_html(self, access, html_path, book_id):
"""
Process html code to satisfy LiveCarta formatting.
"""
"""Process html code to satisfy LiveCarta formatting."""
try:
self.logger_object.log(f'Processing TOC and headers.')
self._process_toc_links()

View File

@@ -90,9 +90,7 @@ class LibraHTML2JSONConverter:
return True
def convert_to_dict(self):
"""
Function which convert list of html nodes to appropriate json structure.
"""
"""Function which convert list of html nodes to appropriate json structure."""
json_strc = []
ind = 0
ch_num = 0

View File

@@ -11,9 +11,9 @@ from itertools import takewhile
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL)
sizes_pr = [-1, 0.5, 0.56, 0.63, 0.69, 0.75, 0.81, 0.88, 0.94, 1.0, 1.06, 1.13, 1.19, 1.25, 1.31, 1.38, 1.44, 1.5, 1.56,
1.63, 1.69, 1.75, 1.81, 1.88, 1.94, 2.0, 2.06, 2.13, 2.19, 2.25, 2.31, 2.38, 2.44, 2.5, 2.56, 2.63, 2.69,
2.75, 2.81, 2.88, 2.94, 3.0, 4.0, 5.0]
@@ -29,6 +29,7 @@ list_types = ['circle', 'disc', 'armenian', 'decimal',
def convert_font_size(value):
""" Function converts font-size in mapping """
if 'pt' in value:
if int(value.replace('pt', '')) == LiveCartaConfig.LIVECARTA_DEFAULT_FONT_SIZE:
return ''
@@ -58,6 +59,7 @@ def convert_font_size(value):
def convert_indents(value):
""" Function converts text-indent and margin-left values to px """
# 30px = 3.2% = 1.25em = 23pt
text_indent_regexp = re.compile(r'(-*\w+%)|((-*\w*).*em)|(-*\w+pt)')
has_style_attrs = re.search(text_indent_regexp, value)
@@ -115,13 +117,6 @@ LIVECARTA_STYLE_ATTRS = {
'margin-left': []
}
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention.
"""
def get_bg_color(x):
color = str2hex(x)
@@ -135,6 +130,12 @@ def get_text_color(x):
return color
"""
LIVECARTA_STYLE_ATTRS_MAPPING = { property: mapping function }
Warning, if LIVECARTA_STYLE_ATTRS is changed, LIVECARTA_STYLE_ATTRS_MAPPING should be updated
to suit livecarta style convention.
"""
LIVECARTA_STYLE_ATTRS_MAPPING = {
'text-indent': convert_indents,
'font-variant': lambda x: x,
@@ -178,8 +179,10 @@ LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
def check_style_to_be_tag(style) -> List[tuple]:
""" Some css style properties converts to tags.
Search for them and prepare list of properties to be removed from style string"""
"""
Some css style properties converts to tags.
Search for them and prepare list of properties to be removed from style string
"""
to_remove = []
for k in LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG:
if f'{k[0]}:{k[1]}' in style:
@@ -208,6 +211,7 @@ def update_css_style_types_to_livecarta_convention(css_rule, style_type):
def build_css_content(css_content):
""" Build css content with livecarta convention """
sheet = cssutils.parseString(css_content, validate=False)
for css_rule in sheet:
@@ -231,6 +235,7 @@ class TagStyleConverter:
@staticmethod
def remove_white_if_no_bgcolor(style_, tag):
""" Function remove white color if there is no text bg color """
if 'background' in style_:
return style_
@@ -260,8 +265,7 @@ class TagStyleConverter:
@staticmethod
def process_indents_to_px(split_style: list) -> str:
# clean with convert_indents() style string and make new clean_style
""" Function cleans using convert_indents() style string and returns new clean_style """
clean_style = ''
for item in split_style:
item = item.split(':')
@@ -276,7 +280,7 @@ class TagStyleConverter:
has_margin_left = re.search(margin_left_regexp, clean_style)
has_text_indent = re.search(text_indent_regexp, clean_style)
#formula_of_indent: indent = abs(margin_left - text_indent)
# formula_of_indent: indent = abs(margin_left - text_indent)
if has_margin_left:
num_ml = abs(int("".join(
filter(str.isdigit, str(has_margin_left.group(2))))))
@@ -302,6 +306,7 @@ class TagStyleConverter:
def preprocess_style(self):
def remove_extra_spaces(style: str) -> List:
""" Function to remove extra spaces in style to process clean_style """
# replace all spaces between '; & letter' to ';'
style = re.sub(r"; *", ";", style)
split_style = style.split(';')
@@ -381,7 +386,7 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_p_to_save_style_attrs(tag):
'''Function designed to save style attrs that cannot be in p -> span'''
""" Function designed to save style attrs that cannot be in p -> span """
if tag.name == 'p' and tag.attrs.get('style'):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom', 'border-top']]
@@ -414,6 +419,7 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_li_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in li -> span """
if tag.name == 'li' and tag.attrs.get('style'):
styles_cant_be_in_li = [attr for attr in LIVECARTA_STYLE_ATTRS if
attr not in ['text-align', 'list-style-type']]
@@ -441,6 +447,7 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in ul/ol -> span """
if tag.name in ['ul', 'ol'] and tag.attrs.get('style'):
styles_cant_be_in_ul_ol = [
attr for attr in LIVECARTA_STYLE_ATTRS if attr not in ['list-style-type']]
@@ -465,6 +472,7 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_h_to_save_style_attrs(tag):
""" Function designed to save style attrs that cannot be in h -> span """
h_regexp = re.compile('(^h[1-9]$)')
if re.search(h_regexp, tag.name) and tag.attrs.get('style'):
@@ -487,6 +495,7 @@ class TagStyleConverter:
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
""" Function adds styles from .css to inline style """
css_text = css_text.replace(
'@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = []

View File

@@ -20,7 +20,7 @@ from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
update_src_links_in_images, preprocess_footnotes
update_images_src_links, preprocess_footnotes
class EpubConverter:
@@ -48,7 +48,7 @@ class EpubConverter:
# flag to be updated while ebooklib.toc is parsed
self.id_anchor_exist_in_nav_points = False
self.img_href2img_bytes = {} # file path to bytes
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
self.book_image_src_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
@@ -124,12 +124,12 @@ class EpubConverter:
return css_content
def build_html_and_css_relations(self):
'''
"""
This function is designed to get 2 dictionaries:
The first is css_href2css_content. It is created to connect href of css to content of css
The second is html_href2css_href. It is created to connect href of html to css files(hrefs of them) which are used on this html
...2... = key2value
'''
"""
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
@@ -159,10 +159,10 @@ class EpubConverter:
return html_href2css_href, css_href2css_content,
def add_css_styles_to_html_soup(self):
'''
"""
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
'''
"""
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
css = ''
@@ -179,6 +179,7 @@ class EpubConverter:
return links
# t_nodes = []
def build_adjacency_list_from_toc(self, element, lvl=0):
"""
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
@@ -211,25 +212,31 @@ class EpubConverter:
sub_nodes = []
for i in second:
# if 'chapter' in (i.title.lower() if isinstance(i, Link) else i[0].title.lower()):
# self.t_nodes.append(self.build_adjacency_list_from_toc(i, lvl))
# else:
sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[nav_point] = sub_nodes
self.hrefs_added_to_toc.add(nav_point.href)
return nav_point
elif isinstance(element, list) and (lvl == 0):
sub_nodes = []
nodes = []
for i in element:
sub_nodes.append(
nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[-1] = sub_nodes
# for j in self.t_nodes:
# nodes.append(j)
# self.t_nodes = []
#
# self.adjacency_list[-1] = nodes
else:
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self):
""" Function checks is toc empty """
# there is no toc in ebook or no top chapters
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return True
@@ -247,6 +254,7 @@ class EpubConverter:
self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
""" Function add files that not added to adjacency list """
for i, file in enumerate(not_added):
nav_point = NavPoint(
Section(f'To check #{i}, filename: {file}', file))
@@ -315,6 +323,11 @@ class EpubConverter:
return full_path[0]
def process_internal_links(self):
"""
Function
- processing internal links in a book
- make ids unique
"""
# 1. rebuild ids to be unique in all documents
for toc_href in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={'id': re.compile(r'.+')}):
@@ -429,6 +442,7 @@ class EpubConverter:
self.build_one_chapter(sub_node)
def define_chapters_content(self):
""" Function build chapters content starts from top level chapters """
top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points:
for point in top_level_nav_points:
@@ -441,12 +455,12 @@ class EpubConverter:
nav_point.href, nav_point.id)]
else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.old_image_path2aws_path = update_src_links_in_images(content,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.old_image_path2aws_path,
book_id=lambda x: self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
self.book_image_src_path2aws_path = update_images_src_links(content,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.book_image_src_path2aws_path,
book_id=self.file.stem if hasattr(self.file, self.file.stem) else 'book_id')
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed = prepare_title(title)
@@ -466,6 +480,7 @@ class EpubConverter:
return ChapterItem(title_preprocessed, content_preprocessed, sub_nodes)
def convert_to_dict(self):
""" Function which convert list of html nodes to appropriate json structure. """
top_level_nav_points = self.adjacency_list[-1]
top_level_chapters = []
@@ -491,7 +506,7 @@ if __name__ == "__main__":
logger_object = BookLogger(name=f'epub', main_logger=logger, book_id=0)
json_converter = EpubConverter('../../epub/9781641051217.epub',
json_converter = EpubConverter('../../epub/9781614382263.epub',
logger=logger_object)
tmp = json_converter.convert_to_dict()

View File

@@ -2,12 +2,17 @@ from src.book_solver import BookSolver
from src.epub_converter.epub_converter import EpubConverter
class EpubBook(BookSolver):
""" Class of .epub type book - child of BookSolver """
def __init__(self, book_id=0, access=None, main_logger=None):
super().__init__(book_id, access, main_logger)
self.book_type = 'epub'
def get_converted_book(self):
"""
1. Convert epub to html
2. Parse from line structure to nested structure
"""
json_converter = EpubConverter(self.file_path, access=self.access, logger=self.logger_object)
content_dict = json_converter.convert_to_dict()
self.status_wrapper.set_generating()

View File

@@ -10,6 +10,7 @@ from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
""" Function saves all images locally """
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(
folder_path, f'../json/img_{book_id}/'))
@@ -24,17 +25,19 @@ def save_image_locally(img_file_path, img_content, book_id):
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
link = access.send_image(
""" Function saves all images to Amazon web service """
link_path = access.send_image(
img_file_path, doc_id=book_id, img_content=img_content)
return link
return link_path
def update_src_links_in_images(body_tag: Tag,
href2img_content: dict,
path_to_html,
access=None,
path2aws_path=None,
book_id=None):
def update_images_src_links(body_tag: Tag,
href2img_content: dict,
path_to_html,
access=None,
path2aws_path=None,
book_id=None):
""" Function makes dictionary image_src_path -> Amazon web service_path """
img_tags = body_tag.find_all('img')
for img in img_tags:
@@ -65,16 +68,16 @@ def update_src_links_in_images(body_tag: Tag,
del img.attrs['height']
if img.attrs.get('style'):
del img.attrs['style']
return path2aws_path
def preprocess_table(body_tag: BeautifulSoup):
""" Function to preprocess tables and tags(td|th|tr): style """
tables = body_tag.find_all("table")
for table in tables:
tds = table.find_all(re.compile("td|th|tr"))
for td in tds:
style = td.get('style')
ts = table.find_all(re.compile("td|th|tr"))
for t_tag in ts:
style = t_tag.get('style')
width = ''
if style:
width_match = re.search(
@@ -84,13 +87,13 @@ def preprocess_table(body_tag: BeautifulSoup):
units = width_match.group(2)
width = size+'px'
td.attrs['width'] = td.get('width') or width
t_tag.attrs['width'] = t_tag.get('width') or width
if td.attrs.get('style'):
td.attrs['style'] = td.attrs['style'].replace('border:0;', '')
if t_tag.attrs.get('style'):
t_tag.attrs['style'] = t_tag.attrs['style'].replace('border:0;', '')
if td.attrs.get('style') == '':
del td.attrs['style']
elif t_tag.attrs.get('style') == '':
del t_tag.attrs['style']
if not table.attrs.get('border') or table.attrs.get('border') in ['0', '0px']:
table.attrs['border'] = '1'
@@ -110,6 +113,7 @@ def process_lists(body_tag):
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
""" Function inserts span before tag to be removed(aren't supported by livecarta) """
new_tag = main_tag.new_tag("span")
new_tag.attrs['id'] = id_ or ''
new_tag.attrs['class'] = class_ or ''
@@ -153,9 +157,7 @@ def clean_headings_content(content: Tag, title: str):
def heading_tag_to_p_tag(body_tag):
"""
Function to convert all lower level headings to p tags
"""
""" Function to convert all lower level headings to p tags """
pattern = f'^h[{LiveCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = body_tag.find_all(re.compile(pattern))
for tag in header_tags:
@@ -163,17 +165,16 @@ def heading_tag_to_p_tag(body_tag):
def clean_title_from_numbering(title: str):
"""
Function to remove digits from headers.
"""
""" Function removes numbering from titles """
title = re.sub(r'^(\s+)+', '', title)
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title) # delete chapter I, (ABC) from the title
return title
def replace_with_livecarta_anchor_tag(anchor, i):
""" Function replace noteref_tag(anchor) with new livecarta tag """
new_tag = BeautifulSoup(features='lxml').new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i + 1
@@ -188,11 +189,11 @@ def replace_with_livecarta_anchor_tag(anchor, i):
def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, noteref_attr_name='epub:type') \
-> Tuple[list, list, list]:
"""
This function preprocessing footnotes
This function should be earlier that adding fonts in pipeline.
<p>Here is an example footnote<sup><a epub:type="noteref" href="#n1">1</a></sup></p>
<aside epub:type="footnote" id="n1"><p>With a footnote here.</p></aside>
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(
@@ -205,12 +206,14 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
def parse_a_tag_href(s: str):
def parse_a_tag_href(s: str) -> Tuple[str, str]:
""" Returns name of file & id of an anchor """
assert '#' in s, f'Error. Unexpected href: {s} in a tag. Href must contain an id.'
f, id_ = s.split('#')
return f, id_
def verify_footnote_tag(tags: list):
""" Function verifies is tag - footnote """
assert len(tags) <= 1, f'Error, Multiple id: {href}.\n{tags}'
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
@@ -275,7 +278,7 @@ def unwrap_structural_tags(body_tag):
"""
def _preserve_class_in_aside_tag(tag_):
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
""" to save css style inherited from class, copy class to aside tag (which is parent to tag_) """
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0]
@@ -284,10 +287,11 @@ def unwrap_structural_tags(body_tag):
tag_.parent.attrs['class'] = tag_class
def preserve_class_in_section_tag(tag_) -> bool:
# to save css style inherited from class, copy class to child <p>
"""
to save css style inherited from class, copy class to child <p>
returns True, if <section> could be unwrapped
"""
# this is for Wiley books with boxes
# returns True, if <section> could be unwrapped
tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0]
if 'feature' not in tag_class:
@@ -312,6 +316,10 @@ def unwrap_structural_tags(body_tag):
class_=tag_to_be_removed.attrs.get('class'))
def replace_div_tag_with_table():
"""Function replace <div> with <table>:
1. Convert div with certain classes to tables
2. Add background color to div with background-color
"""
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(
@@ -348,12 +356,12 @@ def unwrap_structural_tags(body_tag):
continue
add_span_to_save_ids_for_links(div)
div.unwrap()
# comments removal
for tag in body_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
replace_div_tag_with_table()
for s in body_tag.find_all("section"):
@@ -458,23 +466,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
return tags
def wrap_preformatted_span_with_table(main_tag, old_tag):
table = main_tag.new_tag("table")
table.attrs['border'] = '1px #ccc;'
table.attrs['style'] = 'width:100%;'
tbody = main_tag.new_tag("tbody")
tr = main_tag.new_tag("tr")
td = main_tag.new_tag("td")
td.attrs['bgcolor'] = '#f5f5f5'
# td.attrs['border-radius'] = '4px'
old_tag.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
return table
def wrap_block_tag_with_table(main_tag, old_tag, width='95', border='1px', bg_color=None):
""" Function wraps <block> with <table> """
table = main_tag.new_tag("table")
table.attrs['border'] = border
table.attrs['align'] = 'center'
@@ -497,7 +490,6 @@ def clean_wiley_block(block):
hrs = block.find_all("p", attrs={"class": re.compile(".+ hr")})
for hr in hrs:
hr.extract()
print(hr)
h = block.find(re.compile("h[1-9]"))
if h:
h.name = "p"
@@ -505,6 +497,7 @@ def clean_wiley_block(block):
def preprocess_block_tags(chapter_tag):
""" Function preprocessing <block> tags """
for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
clean_wiley_block(block)
@@ -527,7 +520,7 @@ def preprocess_block_tags(chapter_tag):
def prepare_formatted(text):
# replace <,> to save them as is in html code
""" Function replaces special symbols with their Unicode representation """
text = text.replace("<", "\x3C")
text = text.replace(">", "\x3E")
text = text.replace('\t', "\xa0 \xa0 ") # &nbsp; &nbsp;
@@ -536,7 +529,25 @@ def prepare_formatted(text):
return text
def wrap_preformatted_span_with_table(main_tag, old_tag):
""" Function wraps <span> with <table> """
table = main_tag.new_tag("table")
table.attrs['border'] = '1px #ccc;'
table.attrs['style'] = 'width:100%;'
tbody = main_tag.new_tag("tbody")
tr = main_tag.new_tag("tr")
td = main_tag.new_tag("td")
td.attrs['bgcolor'] = '#f5f5f5'
# td.attrs['border-radius'] = '4px'
old_tag.wrap(td)
td.wrap(tr)
tr.wrap(tbody)
tbody.wrap(table)
return table
def preprocess_pre_tags(chapter_tag):
""" Function preprocessing <pre> tags """
for pre in chapter_tag.find_all("pre"):
new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy()
@@ -575,7 +586,7 @@ def preprocess_pre_tags(chapter_tag):
def preprocess_code_tags(chapter_tag):
# function that emulates style of <code>, <kdb>, <var>
""" Function that emulates style of <code>, <kdb>, <var> """
for code in chapter_tag.find_all(re.compile("code|kdb|var")):
code.name = 'span'
if code.parent.name == "pre":
@@ -584,9 +595,7 @@ def preprocess_code_tags(chapter_tag):
def prepare_title(title_of_chapter: str) -> str:
"""
Final processing/cleaning function.
"""
""" Function finalise processing/cleaning title """
title_str = BeautifulSoup(title_of_chapter, features='lxml').string
title_str = re.sub(r'([\n\t\xa0])', ' ', title_str)
title_str = re.sub(r' +', ' ', title_str).rstrip()
@@ -596,7 +605,11 @@ def prepare_title(title_of_chapter: str) -> str:
def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
"""
Final processing/cleaning function.
Function finalise processing/cleaning content
1. cleaning \n
2. heading removal
3. processing tags
4. class removal
"""
# 0. cleaning \n
to_remove = []
@@ -609,13 +622,15 @@ def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_fro
# 1. heading removal
if remove_title_from_chapter:
clean_headings_content(content_tag, title_str)
# 2. processing tags (<li>, <table>, <code>, <pre>, <block>)
process_lists(content_tag)
preprocess_table(content_tag)
preprocess_code_tags(content_tag)
preprocess_pre_tags(content_tag)
preprocess_block_tags(content_tag)
# 2. class removal
# 3. class removal
for tag in content_tag.find_all(recursive=True):
if hasattr(tag, 'attrs') and tag.attrs.get('class') and (tag.attrs.get('class') not in ['link-anchor',
'footnote-element']):

View File

@@ -1,5 +1,5 @@
class LiveCartaConfig:
"""Class of values that LiveCarta platform using and supports"""
SUPPORTED_LEVELS = 5
SUPPORTED_HEADERS = {"h1", "h2", "h3", "h4", "h5"}
HEADERS_LEVELS = {"h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9"}

View File

@@ -6,6 +6,7 @@ from webcolors import html4_hex_to_names, hex_to_rgb, rgb_to_name, rgb_percent_t
def closest_colour_rgb(requested_color):
""" Function finds closes colour rgb """
min_colours = {}
for key, name in html4_hex_to_names.items():
r_c, g_c, b_c = hex_to_rgb(key)
@@ -18,6 +19,7 @@ def closest_colour_rgb(requested_color):
def rgb2color_name(color):
""" Transform rgb -> color name """
try:
closest_name = actual_name = rgb_to_name(color, 'html4')
except ValueError:
@@ -30,6 +32,7 @@ def rgb2color_name(color):
def hex2color_name(color):
""" Transform hex -> color name """
try:
color = hex_to_rgb(color)
except ValueError:
@@ -47,6 +50,7 @@ def hex2color_name(color):
def str2closest_html_color_name(s: str):
""" Transform str -> closest color name """
if 'rgb' in s:
rgb_str = 'rgba' if ('rgba' in s) else 'rgb'
s = s.replace(rgb_str, '').replace('(', '').replace(')', '')
@@ -80,6 +84,7 @@ def str2closest_html_color_name(s: str):
def rgba2rgb(r, g, b, alpha):
""" Transform rgba -> rgb """
r_background, g_background, b_background = 255, 255, 255
r_new = int((1 - alpha) * r_background + alpha * r)
g_new = int((1 - alpha) * g_background + alpha * g)
@@ -88,6 +93,7 @@ def rgba2rgb(r, g, b, alpha):
def str2hex(s: str):
""" Transform str -> hex """
if '#' in s and (len(s) <= 7):
return s.lower()

View File

@@ -3,6 +3,7 @@ import logging
class ColoredFormatter(logging.Formatter):
""" Class to prettify logger and command line output """
MAPPING = {
'DEBUG': 37, # white
'INFO': 36, # cyan
@@ -61,9 +62,7 @@ class BookLogger:
self.logger.log(msg=message, level=logging_level, stacklevel=2)
def log_error_to_main_log(self, message=''):
"""
Method for logging error to main log file.
"""
""" Method for logging error to main log file. """
if self.main_logger:
if not message:
message = f'Error in book conversion. Check log file.'
@@ -71,6 +70,8 @@ class BookLogger:
class BookStatusWrapper:
"""Class sets/updates statuses of Converter on Platform"""
def __init__(self, access, logger_object, book_id=0):
self.access = access
self.logger_object = logger_object