Make todos & refactor code

This commit is contained in:
Kiryl
2021-11-02 12:06:34 +03:00
parent 8c37482616
commit 479695e185
5 changed files with 314 additions and 242 deletions

View File

@@ -95,19 +95,19 @@ class Access:
else:
raise Exception(f'{response.status_code}')
def get_book(self, book_id):
    """Fetch the file stored on the server for ``book_id``.

    The token is refreshed first when ``is_time_for_refreshing()`` says so,
    then the call waits on ``self.refreshing`` before issuing the request.

    :param book_id: identifier placed into the ``/doc-convert/<id>/file`` url
    :return: ``response.content`` of a 200 response
    :raises FileNotFoundError: when the server answers 404
    :raises Exception: for any other status code
    """
    if self.is_time_for_refreshing():
        self.refresh_token()
    # NOTE(review): indentation was lost in the reviewed text; the wait is
    # assumed to run unconditionally (outside the ``if``) - confirm.
    self.refreshing.wait()

    file_url = f'{self.url}/doc-convert/{book_id}/file'
    response = requests.get(file_url, headers=self.headers)

    if response.status_code == 200:
        return response.content
    if response.status_code == 404:
        raise FileNotFoundError('404 Not Found: file have not found.')
    raise Exception(f'Error in getting doc from url: {file_url}, '
                    f'status code:{response.status_code}')

View File

@@ -5,11 +5,10 @@ In parallel it updates status of a book conversion on admin panel.
Finally sends result to server.
Result is a json, JSON schema in book_schema.json
"""
import codecs
import json
import logging
import os
import json
import codecs
import logging
import pathlib
from abc import abstractmethod, ABCMeta
@@ -61,11 +60,11 @@ class BookSolver:
"""
try:
self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
content = self.access.get_doc(self.book_id)
content = self.access.get_book(self.book_id)
self.logger_object.log('File was received from server.')
self.save_book_file(content)
except FileNotFoundError as f_err:
self.logger_object.log("Can't get docx from server.", logging.ERROR)
self.logger_object.log("Can't get file from server.", logging.ERROR)
self.logger_object.log_error_to_main_log()
raise f_err
except Exception as exc:
@@ -109,8 +108,9 @@ class BookSolver:
return {}
def test_conversion(self):
'''Function
without sending to server'''
self.logger_object.log('Beginning of the test.')
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.join(folder_path, f'{self.book_type}')
file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
@@ -121,6 +121,9 @@ class BookSolver:
self.logger_object.log('End of the test.')
def conversion(self):
'''Function
with downloading book from server
with sending to server'''
try:
self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
self.get_book_file()
@@ -137,14 +140,14 @@ class BookSolver:
raise exc
def conversion_local(self):
    """Send a locally stored conversion result to the server.

    Nothing is downloaded: the content is read from ``json/tmp.json``
    (relative to the current working directory), parsed as JSON and handed
    to ``send_json_content_to_server``.

    Any exception raised while reading or sending is logged with
    ``logging.ERROR`` and is not re-raised.
    """
    try:
        with codecs.open('json/tmp.json', 'r', encoding='utf-8') as f_json:
            content_dict = json.load(f_json)
        # Report the load only after it has really succeeded, otherwise the
        # log claims success right before a read error is reported.
        self.logger_object.log(f'Data has been downloaded from tmp.json file: {self.output_path}')
        self.send_json_content_to_server(content_dict)
    except Exception as exc:
        # Keep a separator between the fixed text and the exception text.
        self.logger_object.log(f'Error has occurred while reading json file. {exc}', logging.ERROR)

View File

@@ -8,8 +8,9 @@ from bs4 import BeautifulSoup
from premailer import transform
from itertools import takewhile
from src.livecarta_config import LiveCartaConfig
from src.util.color_reader import str2hex
from src.livecarta_config import LiveCartaConfig
cssutils.log.setLevel(CRITICAL)
@@ -211,9 +212,9 @@ def build_css_content(css_content):
class TagStyleConverter:
def __init__(self, tag_with_initial_style, tag_with_ultimate_style):
self.tag_with_initial_style = tag_with_initial_style # tag with inline style to be updated with style attribute
self.tag_initial_name = tag_with_initial_style.name
def __init__(self, tag_with_inline_style, tag_with_ultimate_style):
self.tag_with_inline_style = tag_with_inline_style # tag with inline style to be updated with style attribute
self.tag_initial_name = tag_with_inline_style.name
self.tag_with_ultimate_style = tag_with_ultimate_style # tag with inline style + style parsed from css file
self.style = self.preprocess_style()
@@ -293,32 +294,39 @@ class TagStyleConverter:
ultimate_style = ultimate_style.replace('background:', 'background-color:')
ultimate_style = ultimate_style.replace('list-style-image', 'list-style-type')
split_ultimate_style = ultimate_style.split(';') # make for repetition check and convert to px
split_ultimate_style = ultimate_style.replace('; ',';').split(';')
# check for another ; in style string in preprocess_style()
# when we split style by ; and we have at the end ; that's why we have '' in list
while '' in split_ultimate_style:
split_ultimate_style.remove('')
ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
if self.tag_with_initial_style.attrs.get('style'):
# replace all spaces between ': & letter' to ':'
split_ultimate_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_ultimate_style]
initial_style = self.tag_with_initial_style.attrs['style']
split_initial_style = initial_style.split(';')
if self.tag_with_inline_style.attrs.get('style'):
inline_style = self.tag_with_inline_style.attrs['style']
# check for another ; in style string in preprocess_style()
while '' in split_initial_style:
split_initial_style.remove('')
split_inline_style = inline_style.replace('; ',';').split(';')
# repetition check - if tag had already had inline style, add this to style parsed from css
repeat_styles = list(set(split_ultimate_style) & set(split_initial_style))
# when we split style by ; and we have at the end ; that's why we have '' in list
while '' in split_inline_style:
split_inline_style.remove('')
# replace all spaces between ': & letter' to ':'
split_inline_style = [el.replace(re.search(r'(:\s*)', el).group(1), ':') for el in split_inline_style]
# repetition check - if the tag had already had inline style that isn't in the css styles, add this to style parsed from css
repeat_styles = list(set(split_ultimate_style) & set(split_inline_style))
for item in repeat_styles:
split_initial_style.remove(item)
split_inline_style.remove(item)
if split_initial_style:
# if initial style is not empty - start convert and add to ultimate style
if split_inline_style:
# if inline style is not empty - start convert and add to ultimate style
print('we enter repetition check', '\n')
initial_style: str = self.process_indents_to_px(split_initial_style)
ultimate_style += initial_style
inline_style: str = self.process_indents_to_px(split_inline_style)
ultimate_style += inline_style
ultimate_style: str = self.process_indents_to_px(split_ultimate_style)
return ultimate_style
def change_attrs_with_corresponding_tags(self):
@@ -330,15 +338,15 @@ class TagStyleConverter:
self.style = self.style.replace(s, '')
self.style = self.style.strip()
if i == 0:
self.tag_with_initial_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tags.append(self.tag_with_initial_style)
self.tag_with_inline_style.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tags.append(self.tag_with_inline_style)
else:
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tag = BeautifulSoup(features='lxml').new_tag(name)
new_tags[-1].wrap(new_tag)
new_tags.append(new_tag)
top_tag = self.tag_with_initial_style
top_tag = self.tag_with_inline_style
if new_tags:
tmp_attrs = top_tag.attrs.copy()
@@ -355,10 +363,12 @@ class TagStyleConverter:
@staticmethod
def wrap_span_in_p_to_save_style_attrs(tag):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom']]
'''Function designed to save style attrs that cannot be in p -> span
that cannot be in span -> p'''
if tag.name == 'p' and tag.attrs.get('style'):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent', 'border-bottom']]
styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
if any(styles_to_be_saved):
tag.name = 'span'
@@ -388,83 +398,81 @@ class TagStyleConverter:
tag.wrap(p_tag)
@staticmethod
def wrap_span_in_li_to_save_style_attrs(tag):
    """Move styles that an <li> cannot carry onto an inner <span>.

    When ``tag`` is an <li> whose style mentions any LIVECARTA_STYLE_ATTRS
    entry other than ``text-align`` / ``list-style-type``, the tag is renamed
    to <span> and wrapped into a new <li>. Declarations written exactly as
    ``text-align:<word>;`` or ``list-style-type:<word>;`` are moved from the
    span style to the new <li>; the rest stays on the span.
    """
    style = tag.attrs.get('style')
    if tag.name != 'li' or not style:
        return

    allowed_in_li = ['text-align', 'list-style-type']
    needs_span = any(attr in style
                     for attr in LIVECARTA_STYLE_ATTRS
                     if attr not in allowed_in_li)
    if not needs_span:
        return

    tag.name = 'span'
    li_tag = BeautifulSoup(features='lxml').new_tag('li')
    span_style = tag.attrs['style']
    moved_to_li = []
    for pattern in (r'(text-align:(\w+);)', r'(list-style-type:(\w+);)'):
        found = re.search(pattern, span_style)
        if found:
            declaration = found.group(1)
            moved_to_li.append(declaration)
            # str.replace drops every occurrence of the declaration
            span_style = span_style.replace(declaration, '')

    li_tag.attrs['style'] = ''.join(moved_to_li)
    tag.attrs['style'] = span_style
    tag.wrap(li_tag)
@staticmethod
def wrap_span_in_ul_ol_to_save_style_attrs(tag):
    """Move styles that a list tag cannot carry onto an inner <span>.

    When ``tag`` is <ul>/<ol> and its style mentions any
    LIVECARTA_STYLE_ATTRS entry other than ``list-style-type``, the tag is
    renamed to <span> and wrapped into a new list tag of the same kind.
    A declaration written exactly as ``list-style-type:<word>;`` is moved
    from the span style to the new list tag.
    """
    style = tag.attrs.get('style')
    if tag.name not in ['ul', 'ol'] or not style:
        return

    needs_span = any(attr in style
                     for attr in LIVECARTA_STYLE_ATTRS
                     if attr not in ['list-style-type'])
    if not needs_span:
        return

    # Create the wrapper from the original name before renaming, so that an
    # <ol> is not silently turned into a <ul>.
    list_tag = BeautifulSoup(features='lxml').new_tag(tag.name)
    tag.name = 'span'

    span_style = tag.attrs['style']
    found = re.search(r'(list-style-type:(\w+);)', span_style)
    if found:
        list_style = found.group(1)
        span_style = span_style.replace(list_style, '')
        list_tag.attrs['style'] = list_style

    tag.attrs['style'] = span_style
    tag.wrap(list_tag)
@staticmethod
def wrap_span_in_h_to_save_style_attrs(tag):
    """Keep the style of a heading by moving it onto an inner <span>.

    A styled h1..h9 tag is renamed to <span> and wrapped into a new tag with
    the original heading name. A ``list-style-type:<word>;`` declaration, if
    present, is removed from the span style.
    """
    if not (re.search('(^h[1-9]$)', tag.name) and tag.attrs.get('style')):
        return

    heading = BeautifulSoup(features='lxml').new_tag(tag.name)
    tag.name = 'span'
    tag.wrap(heading)

    style = tag.attrs['style']
    list_style = re.search(r'(list-style-type:(\w+);)', style)
    if list_style:
        style = style.replace(list_style.group(1), '')
    tag.attrs['style'] = style
def convert_initial_tag(self):
    """Run the whole style conversion for the stored tag and return it.

    ``change_attrs_with_corresponding_tags`` produces the new value of
    ``self.tag_with_inline_style``; afterwards the p / li / ul-ol / heading
    wrappers are applied to it, in that order.
    """
    self.tag_with_inline_style = self.change_attrs_with_corresponding_tags()
    wrappers = (
        self.wrap_span_in_p_to_save_style_attrs,
        self.wrap_span_in_li_to_save_style_attrs,
        self.wrap_span_in_ul_ol_to_save_style_attrs,
        self.wrap_span_in_h_to_save_style_attrs,
    )
    for wrap in wrappers:
        wrap(self.tag_with_inline_style)
    return self.tag_with_inline_style
def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = []
h_regex = f'(^h[1-9]$)'
could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
could_have_style_in_livecarta_regexp = re.compile('(^div$)|(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|(^h[1-9]$)')
tags_with_possible_style_attr = html_soup.find_all(could_have_style_in_livecarta_regexp)
for i, x in enumerate(tags_with_possible_style_attr):
x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i)
# here we add css styles to inline style
# sometimes in html_with_css_styles
html_with_css_styles: str = transform(str(html_soup), css_text=css_text,
remove_classes=False,
external_styles=False,
@@ -474,6 +482,7 @@ def convert_html_soup_with_css_style(html_soup: BeautifulSoup, css_text: str):
inline_soup = BeautifulSoup(html_with_css_styles, features='lxml')
# go through tags with possible style attrs
for i in livecarta_tmp_ids:
tag_with_initial_style = html_soup.find(attrs={'livecarta_id': i})
tag_with_ultimate_style = inline_soup.find(attrs={'livecarta_id': i})

View File

@@ -1,27 +1,28 @@
import os
import re
import json
import codecs
import logging
import os
from os.path import dirname, normpath, join
from itertools import chain
from collections import defaultdict
from typing import Dict, Union, List
from os.path import dirname, normpath, join
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup, Tag
from ebooklib.epub import Link, Section
from bs4 import BeautifulSoup, Tag
from src.util.helpers import BookLogger
from src.livecarta_config import LiveCartaConfig
from src.data_objects import ChapterItem, NavPoint
from src.epub_converter.css_reader import build_css_content, convert_html_soup_with_css_style
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title_and_content, \
from src.epub_converter.html_epub_preprocessor import unwrap_structural_tags, get_tags_between_chapter_marks, prepare_title, prepare_content, \
update_src_links_in_images, preprocess_footnotes
class EpubConverter:
def __init__(self, file, access=None, logger=None):
self.file = file
@@ -29,9 +30,9 @@ class EpubConverter:
self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file)
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.html_href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
self.hrefs_added_to_toc = set() # enumerate all file paths that where added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
@@ -42,8 +43,8 @@ class EpubConverter:
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.internal_anchors = set()
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.href2img_bytes = {} # file path to bytes
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.img_href2img_bytes = {} # file path to bytes
self.old_image_path2aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
@@ -54,11 +55,11 @@ class EpubConverter:
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name
content = x.content
self.href2img_bytes[file_name] = content
self.img_href2img_bytes[file_name] = content
self.logger.log('HTML files reading.')
self.html_href2html_body_soup: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.html_href2html_body_soup: Dict[str,
BeautifulSoup] = self.build_href2soup_content()
self.logger.log('CSS files processing.')
self.html_href2css_href, self.css_href2css_content = self.build_html_and_css_relations()
@@ -84,12 +85,14 @@ class EpubConverter:
# build simple toc from spine if needed
if self.is_toc_empty():
self.build_adjacency_list_from_spine()
not_added = [x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
not_added = [
x for x in self.html_href2html_body_soup if x not in self.hrefs_added_to_toc]
self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.')
self.label_chapters_ids_with_tmp_id()
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
# used only after parsed toc, ids from toc needed
self.process_html_soup_structure_to_line()
self.process_internal_links()
self.logger.log(f'Building chapters content.')
self.define_chapters_content()
@@ -110,7 +113,8 @@ class EpubConverter:
path_to_css_from_html = css_href
html_folder = dirname(html_href)
path_to_css_from_root = normpath(join(html_folder, path_to_css_from_html)).replace('\\', '/')
path_to_css_from_root = normpath(
join(html_folder, path_to_css_from_html)).replace('\\', '/')
css_obj = self.ebooklib_book.get_item_with_href(path_to_css_from_root)
assert css_obj, f'Css style {css_href} was not in manifest.'
css_content: str = css_obj.get_content().decode()
@@ -124,14 +128,16 @@ class EpubConverter:
...2... = key2value
'''
html_href2css_href: defaultdict = defaultdict(list) # dictionary: href of html to related css files
# dictionary: href of html to related css files
html_href2css_href: defaultdict = defaultdict(list)
css_href2css_content: dict = {}
for item in self.ebooklib_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
html_content = item.content
html_href = item.file_name
soup_html_content = BeautifulSoup(html_content, features='lxml')
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}): #check if file links to css file
# check if file links to css file
for tag in soup_html_content.find_all('link', attrs={"type": "text/css"}):
if tag.attrs.get('rel') and ('alternate' in tag.attrs['rel']):
continue
css_href = tag.attrs.get('href')
@@ -144,7 +150,8 @@ class EpubConverter:
for i, tag in enumerate(soup_html_content.find_all('style')):
css_content = tag.string
html_href2css_href[html_href].append(f'href{i}')
css_href2css_content[f'href{i}'] = build_css_content(css_content)
css_href2css_content[f'href{i}'] = build_css_content(
css_content)
return html_href2css_href, css_href2css_content,
@@ -153,14 +160,14 @@ class EpubConverter:
This function is designed to update html_href2html_body_soup
And add to html_inline_style css_style_content
'''
for href in self.html_href2html_body_soup:
if self.html_href2css_href.get(href):
css =''
for key in self.html_href2css_href[href]:
css += self.css_href2css_content[key]
content: BeautifulSoup = self.html_href2html_body_soup[href]
for html_href in self.html_href2html_body_soup:
if self.html_href2css_href.get(html_href):
css = ''
for css_href in self.html_href2css_href[html_href]:
css += self.css_href2css_content[css_href]
content: BeautifulSoup = self.html_href2html_body_soup[html_href]
content = convert_html_soup_with_css_style(content, css)
self.html_href2html_body_soup[href] = content
self.html_href2html_body_soup[html_href] = content
def build_manifest_id2html_href(self):
links = dict()
@@ -173,18 +180,18 @@ class EpubConverter:
"""
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib.toc
key = -1 if root, value = None if leaf
key = -1 if root(top chapters),
value = None if leaf(least chapters)
:param element: [Link, tuple, list] - element that appears in TOC( usually parsed from nav.ncx)
:param lvl: level of depth
:param element: [Link, tuple, list] - element that appears in TOC(usually parsed from nav.ncx)
:param lvl: level of depth
"""
if isinstance(element, Link):
# todo: check if link exists
nav_point = NavPoint(element)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
self.adjacency_list[nav_point] = None
self.hrefs_added_to_toc.add(nav_point.href)
return nav_point
@@ -195,11 +202,12 @@ class EpubConverter:
nav_point = NavPoint(first)
if nav_point.id:
self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
self.html_href2subchapter_ids[nav_point.href].append(nav_point.id)
sub_nodes = []
for i in second:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[nav_point] = sub_nodes
self.hrefs_added_to_toc.add(nav_point.href)
@@ -208,39 +216,43 @@ class EpubConverter:
elif isinstance(element, list) and (lvl == 0):
sub_nodes = []
for i in element:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
sub_nodes.append(
self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[-1] = sub_nodes
else:
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
assert 0, f'Error. Element is not tuple/Link/list instance: {type(element)}'
def is_toc_empty(self):
    """Tell whether the book has no usable table of contents.

    :return: True when ``self.ebooklib_book.toc`` is None, or when
        ``self.adjacency_list`` has no top-level chapters under key ``-1``
        (the key is missing, maps to None, or maps to an empty list)
    """
    if self.ebooklib_book.toc is None:
        return True
    # Truthiness also catches an empty list of top chapters, which the
    # previous ``is None`` comparison reported as a non-empty TOC.
    return not self.adjacency_list.get(-1)
def build_adjacency_list_from_spine(self):
    """Replace ``self.adjacency_list`` with a flat list built from the spine.

    Every spine entry becomes one top-level NavPoint (stored under key -1)
    made from a Section whose two arguments are both the html href of that
    entry; the href of each NavPoint is recorded in ``hrefs_added_to_toc``.
    """
    id2html_href = self.build_manifest_id2html_href()
    self.adjacency_list = {-1: []}
    top_level = self.adjacency_list[-1]

    for id_, _ in self.ebooklib_book.spine:
        html_href = id2html_href[id_]
        nav_point = NavPoint(Section(html_href, html_href))
        top_level.append(nav_point)
        self.hrefs_added_to_toc.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added):
    """Append files missing from the TOC as extra top-level chapters.

    :param not_added: iterable of file names; each one gets a NavPoint with
        a 'To check #<index>, filename: <file>' title under key -1 and is
        recorded in ``hrefs_added_to_toc``
    """
    for index, file_name in enumerate(not_added):
        placeholder = Section(f'To check #{index}, filename: {file_name}', file_name)
        self.adjacency_list[-1].append(NavPoint(placeholder))
        self.hrefs_added_to_toc.add(file_name)
def label_chapters_ids_with_tmp_id(self):
for href in self.html_href2html_body_soup:
ids = self.href2subchapter_ids[href]
for html_href in self.html_href2html_body_soup:
ids = self.html_href2subchapter_ids[html_href]
for i in ids:
soup = self.html_href2html_body_soup[href]
soup = self.html_href2html_body_soup[html_href]
tag = soup.find(id=i)
new_h = soup.new_tag('tmp')
new_h.attrs['class'] = 'converter-chapter-mark'
@@ -249,9 +261,9 @@ class EpubConverter:
def process_html_soup_structure_to_line(self):
    """Replace every stored html soup with the result of
    ``unwrap_structural_tags`` applied to it."""
    soups = self.html_href2html_body_soup
    # only values of existing keys are reassigned, so iterating is safe
    for html_href, soup in soups.items():
        soups[html_href] = unwrap_structural_tags(soup)
@staticmethod
def create_unique_id(href, id_):
@@ -280,8 +292,10 @@ class EpubConverter:
:return:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
full_path = [path for path in self.hrefs_added_to_toc if normed_path in path]
normed_path = os.path.normpath(os.path.join(
dir_name, href_in_link)).replace('\\', '/')
full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path]
if not full_path:
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.')
@@ -291,7 +305,7 @@ class EpubConverter:
if len(full_path) > 1:
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.')
return full_path[0]
def process_internal_links(self):
@@ -308,13 +322,15 @@ class EpubConverter:
tag.attrs['id'] = new_id
# 2.a) process anchor which is a whole xhtml file
internal_link_reg1 = re.compile(r'(^(?!https?://).+\.(htm|html|xhtml)$)')
internal_link_reg1 = re.compile(
r'(^(?!https?://).+\.(htm|html|xhtml)$)')
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href']
# find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, '')
@@ -322,7 +338,8 @@ class EpubConverter:
if new_id not in self.internal_anchors:
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
new_anchor_span = self.create_new_anchor_span(soup, new_id)
anchor_soup.insert(0, new_anchor_span) # insert a new span to the begin of the file
# insert a new span to the begin of the file
anchor_soup.insert(0, new_anchor_span)
self.internal_anchors.add(new_id)
del internal_link_tag.attrs['href']
@@ -332,20 +349,26 @@ class EpubConverter:
for toc_href in self.hrefs_added_to_toc:
soup = self.html_href2html_body_soup[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split(
'#')
# find full path
if a_tag_href:
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag)
else:
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
a_tag_href_matched_to_toc = os.path.normpath(
toc_href).replace('\\', '/')
if not a_tag_href_matched_to_toc:
continue
new_id = self.create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
anchor_tags = anchor_soup.find_all(attrs={'id': new_id})
anchor_tags = anchor_tags or anchor_soup.find_all(attrs={'id': a_tag_id}) # if link is a footnote
anchor_tags = anchor_soup.find_all(attrs={'id': new_id, })
anchor_tags = anchor_tags or anchor_soup.find_all(
attrs={'id': a_tag_id}) # if link is a footnote
if anchor_tags:
if len(anchor_tags) > 1:
@@ -359,7 +382,8 @@ class EpubConverter:
internal_link_tag.attrs['placeholder'] = '{{tempStyleToAnchor-' + new_id + '}}'
# create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs['id'] not in self.internal_anchors:
new_anchor_span = self.create_new_anchor_span(soup, new_id)
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs['id']
@@ -386,11 +410,13 @@ class EpubConverter:
"""
if nav_point.id:
soup = self.html_href2html_body_soup[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
chapter_tags = get_tags_between_chapter_marks(
first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser')
for tag in chapter_tags:
new_tree.append(tag)
self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)] = new_tree
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
@@ -405,25 +431,27 @@ class EpubConverter:
def node_to_livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
title = nav_point.title
if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
content: BeautifulSoup = self.href_chapter_id2soup_html[(
nav_point.href, nav_point.id)]
else:
content: BeautifulSoup = self.html_href2html_body_soup[nav_point.href]
self.old_image_path2aws_path = update_src_links_in_images(content,
self.href2img_bytes,
self.img_href2img_bytes,
path_to_html=nav_point.href,
access=self.access,
path2aws_path=self.old_image_path2aws_path)
is_chapter = lvl <= LiveCartaConfig.SUPPORTED_LEVELS
title_preprocessed, content_preprocessed = prepare_title_and_content(title, content,
remove_title_from_chapter=is_chapter)
title_preprocessed = prepare_title(title)
content_preprocessed = prepare_content(title_preprocessed, content,
remove_title_from_chapter=is_chapter)
sub_nodes = []
# warning! not EpubHtmlItems won;t be added to chapter
# warning! not EpubHtmlItems won't be added to chapter
if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node_to_livecarta_chapter_item(sub_node, lvl + 1)
sub_chapter_item = self.node_to_livecarta_chapter_item(
sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item)
if self.logger:
@@ -451,16 +479,16 @@ class EpubConverter:
if __name__ == "__main__":
    # Manual run: convert one local epub and dump the result as json.
    logger = logging.getLogger('epub')
    # log both to the console and to a file that is rewritten on every run
    logger.addHandler(logging.StreamHandler())
    logger.addHandler(logging.FileHandler('../../epub.log', mode='w+'))

    logger_object = BookLogger(name='epub', main_logger=logger, book_id=0)
    json_converter = EpubConverter('../../epub/9781634259804.epub', logger=logger_object)
    tmp = json_converter.convert_to_dict()

    with codecs.open('../../json/tmp.json', 'w', encoding='utf-8') as f:
        json.dump(tmp, f, ensure_ascii=False)

View File

@@ -11,7 +11,8 @@ from src.livecarta_config import LiveCartaConfig
def save_image_locally(img_file_path, img_content, book_id):
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
new_path = pathlib.Path(os.path.join(folder_path, f'../json/img_{book_id}/'))
new_path = pathlib.Path(os.path.join(
folder_path, f'../json/img_{book_id}/'))
new_path.mkdir(exist_ok=True)
new_img_path = new_path / os.path.basename(img_file_path)
@@ -23,7 +24,8 @@ def save_image_locally(img_file_path, img_content, book_id):
def save_image_to_aws(access: Access, img_file_path, img_content: bytes, book_id):
    """Send one image through ``access.send_image`` and return its result.

    :param access: object providing ``send_image``
    :param img_file_path: path of the image, passed through unchanged
    :param img_content: image bytes
    :param book_id: forwarded as the ``doc_id`` keyword argument
    :return: whatever ``access.send_image`` returns
    """
    return access.send_image(img_file_path, doc_id=book_id, img_content=img_content)
@@ -37,7 +39,8 @@ def update_src_links_in_images(body_tag: Tag,
for img in img_tags:
path_to_img_from_html = img.attrs.get('src')
html_folder = os.path.dirname(path_to_html)
path_to_img_from_root = os.path.normpath(os.path.join(html_folder, path_to_img_from_html)).replace('\\', '/')
path_to_img_from_root = os.path.normpath(os.path.join(
html_folder, path_to_img_from_html)).replace('\\', '/')
assert path_to_img_from_root in href2img_content, \
f'Image {path_to_img_from_html} in file {path_to_html} was not added to manifest.'
@@ -47,10 +50,12 @@ def update_src_links_in_images(body_tag: Tag,
if path_to_img_from_root in path2aws_path:
new_folder = path2aws_path[path_to_img_from_root]
else:
new_folder = save_image_to_aws(access, path_to_img_from_root, img_content, 'book_id')
new_folder = save_image_to_aws(
access, path_to_img_from_root, img_content, 'book_id')
path2aws_path[path_to_img_from_root] = new_folder
else:
new_folder = save_image_locally(path_to_img_from_root, img_content, 'book_id')
new_folder = save_image_locally(
path_to_img_from_root, img_content, 'book_id')
img.attrs['src'] = str(new_folder)
if img.attrs.get('width'):
@@ -71,7 +76,8 @@ def preprocess_table(body_tag: BeautifulSoup):
style = td.get('style')
width = ''
if style:
width_match = re.search(r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
width_match = re.search(
r"[^-]width: ?(\d+\.?\d*)(p[tx])", style)
if width_match:
size = width_match.group(1)
units = width_match.group(2)
@@ -96,10 +102,10 @@ def process_lists(body_tag):
"""
li_tags = body_tag.find_all("li")
for il_tag in li_tags:
if il_tag.p:
il_tag.attrs.update(il_tag.p.attrs)
il_tag.p.unwrap()
for li_tag in li_tags:
if li_tag.p:
li_tag.attrs.update(li_tag.p.attrs)
li_tag.p.unwrap()
def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
@@ -111,11 +117,12 @@ def insert_span_with_attrs_before_tag(main_tag, tag, id_, class_):
def clean_headings_content(content: Tag, title: str):
def _add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
def add_span_to_save_ids_for_links(tag_to_be_removed, body_tag):
if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(body_tag,
tag_to_be_removed,
id_=tag_to_be_removed.attrs.get('id'),
id_=tag_to_be_removed.attrs.get(
'id'),
class_=tag_to_be_removed.attrs.get('class'))
for sub_tag in tag_to_be_removed.find_all():
@@ -136,10 +143,10 @@ def clean_headings_content(content: Tag, title: str):
text = re.sub(r' +', ' ', text).strip()
text = text.lower()
if title == text:
_add_span_to_save_ids_for_links(child, content)
add_span_to_save_ids_for_links(child, content)
child.extract()
elif (title in text) and (child.name in ['h1', 'h2', 'h3']):
_add_span_to_save_ids_for_links(child, content)
add_span_to_save_ids_for_links(child, content)
child.extract()
break
@@ -187,9 +194,12 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
"""
footnotes = []
noterefs_tags = source_html_tag.find_all(attrs={noteref_attr_name: 'noteref'})
bad_noterefs_tags = set([tag for tag in noterefs_tags if not tag.attrs.get('href')])
noterefs_tags = [tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
noterefs_tags = source_html_tag.find_all(
attrs={noteref_attr_name: 'noteref'})
bad_noterefs_tags = set(
[tag for tag in noterefs_tags if not tag.attrs.get('href')])
noterefs_tags = [
tag for tag in noterefs_tags if tag not in bad_noterefs_tags]
new_noterefs_tags = []
new_footnotes_tags = []
[tag.decompose() for tag in bad_noterefs_tags]
@@ -204,7 +214,8 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
if len(tags) == 0:
anchored_tags = list(target_html_tag.find_all(id=element_id))
if len(anchored_tags):
print(f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
print(
f'Warning. Href for tag is detected as footnote:\n{noteref_tag}')
return anchored_tags
else:
assert 0, f'Error, No element with id: {href} found.'
@@ -219,7 +230,8 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
else:
target_html_tag = href2soup_html.get(file)
if not target_html_tag:
print(f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
print(
f'Error while footnotes processing. For {noteref_tag} invalid path: {file}.')
continue
possible_footnote = 'note|footnote|endnote|rearenote'
@@ -230,11 +242,13 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
footnote_tag = expected_footnote_tags[0]
if footnote_tag.parent.attrs.get('role') and footnote_tag.parent.attrs.get('role') == 'doc-endnote':
footnote_tag = footnote_tag.parent
new_noterefs_tags.append(replace_with_livecarta_anchor_tag(noteref_tag, i))
new_noterefs_tags.append(
replace_with_livecarta_anchor_tag(noteref_tag, i))
content = footnote_tag.text
# footnote_tag.decompose()
footnotes.append(content)
footnote_tag = footnote_tag.find(attrs={'role': 'doc-backlink'}) or footnote_tag
footnote_tag = footnote_tag.find(
attrs={'role': 'doc-backlink'}) or footnote_tag
new_footnotes_tags.append(footnote_tag)
return footnotes, new_noterefs_tags, new_footnotes_tags
@@ -262,7 +276,8 @@ def unwrap_structural_tags(body_tag):
def _preserve_class_in_aside_tag(tag_):
# to save css style inherited from class, copy class to aside tag (which is parent to tag_)
# this is for Wiley books with boxes
tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0]
if tag_.parent.name == 'aside':
if not tag_.parent.attrs.get('class'):
tag_.parent.attrs['class'] = tag_class
@@ -272,7 +287,8 @@ def unwrap_structural_tags(body_tag):
# this is for Wiley books with boxes
# returns True, if <section> could be unwrapped
tag_class = tag_.attrs['class'] if not isinstance(tag_.attrs['class'], list) else tag_.attrs['class'][0]
tag_class = tag_.attrs['class'] if not isinstance(
tag_.attrs['class'], list) else tag_.attrs['class'][0]
if 'feature' not in tag_class:
return True
child_p_tags = tag_.find_all("p")
@@ -288,51 +304,56 @@ def unwrap_structural_tags(body_tag):
else:
return True
def add_table_to_abc_books(tag_, border, bg_color):
wrap_block_tag_with_table(body_tag, old_tag=tag_, width='100', border=border, bg_color=bg_color)
def add_span_to_save_ids_for_links(tag_to_be_removed):
if tag_to_be_removed.attrs.get('id'):
insert_span_with_attrs_before_tag(main_tag=body_tag, tag=tag_to_be_removed,
id_=tag_to_be_removed.attrs['id'],
class_=tag_to_be_removed.attrs.get('class'))
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p'
]
def replace_div_tag_with_table():
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(
div.attrs['class'], list) else div.attrs['class'][0]
if div_class in ['C409', 'C409a']:
wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid 3px', bg_color='#e7e7e9')
elif div_class in ['C441', 'C816']:
wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='solid #6e6e70 1px', bg_color='#e7e7e8')
if div.attrs.get('style'):
if 'background-color' in div.attrs['style']:
end_index = div.attrs['style'].find(
'background-color') + len('background-color')
start_index_of_color = end_index + 2
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color + 7]
wrap_block_tag_with_table(
body_tag, old_tag=div, width='100', border='', bg_color=bg_color)
elif div.attrs.get('style') == '':
del div.attrs['style']
structural_tags_names = [
'div', 'section', 'article', 'main', 'body', 'html', 'aside', 'canvas', 'data',
'figure', 'footer', 'iframe', 'span', 'p'
]
if div.contents:
is_not_struct_tag = [
child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
add_span_to_save_ids_for_links(div)
div.unwrap()
# comments removal
for tag in body_tag.find_all():
for element in tag(text=lambda text: isinstance(text, Comment)):
element.extract()
for div in body_tag.find_all("div"):
if div.attrs.get('class'):
div_class = div.attrs['class'] if not isinstance(div.attrs['class'], list) else div.attrs['class'][0]
if div_class in ['C409', 'C409a']:
add_table_to_abc_books(div, border='solid 3px', bg_color='#e7e7e9')
elif div_class in ['C441', 'C816']:
add_table_to_abc_books(div, border='solid #6e6e70 1px', bg_color='#e7e7e8')
if div.attrs.get('style'):
if 'background-color' in div.attrs['style']:
end_index = div.attrs['style'].find('background-color') + len('background-color')
start_index_of_color = end_index + 2
bg_color = div.attrs['style'][start_index_of_color:start_index_of_color+7]
add_table_to_abc_books(div, border='', bg_color=bg_color)
if div.attrs.get('style') == '':
del div.attrs['style']
if div.contents:
is_not_struct_tag = [child.name not in structural_tags_names for child in div.contents]
if all(is_not_struct_tag):
div.name = 'p'
continue
add_span_to_save_ids_for_links(div)
div.unwrap()
replace_div_tag_with_table()
for s in body_tag.find_all("section"):
could_be_unwrapped = True
@@ -348,7 +369,8 @@ def unwrap_structural_tags(body_tag):
for s in body_tag.find_all("figure"):
s.name = 'p'
s.attrs['style'] = "text-align: center;" # to center image inside this tag
# to center image inside this tag
s.attrs['style'] = "text-align: center;"
for s in body_tag.find_all("figcaption"):
add_span_to_save_ids_for_links(s)
@@ -383,7 +405,8 @@ def unwrap_structural_tags(body_tag):
x.parent.unwrap() # todo warning! could reflect on formatting/internal links in some cases
parents_marks_are_body = [x.parent == body_tag for x in marks]
assert all(parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
assert all(
parents_marks_are_body), 'Anchor for chapter is deeper than 2 level. Chapters can not be parsed.'
heading_tag_to_p_tag(body_tag)
@@ -411,7 +434,8 @@ def get_tags_between_chapter_marks(first_id, href, html_soup):
:param html_soup: soup object of current file
:return: list [Tag, NavigableString]; chapter's tags
"""
marked_tags = html_soup.find(attrs={'id': first_id, 'class': 'converter-chapter-mark'})
marked_tags = html_soup.find(
attrs={'id': first_id, 'class': 'converter-chapter-mark'})
if marked_tags:
next_tag = marked_tags.next_sibling
tags = []
@@ -484,16 +508,20 @@ def preprocess_block_tags(chapter_tag):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
clean_wiley_block(block)
color = '#DDDDDD' if block.attrs.get('class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get('class') == 'feature2' else color
color = '#DDDDDD' if block.attrs.get(
'class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get(
'class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
block.unwrap()
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
clean_wiley_block(future_block)
color = '#DDDDDD' if future_block.attrs.get('class') == 'feature1' else None
color = '#EEEEEE' if future_block.attrs.get('class') == 'feature2' else color
color = '#DDDDDD' if future_block.attrs.get(
'class') == 'feature1' else None
color = '#EEEEEE' if future_block.attrs.get(
'class') == 'feature2' else color
wrap_block_tag_with_table(chapter_tag, future_block, bg_color=color)
@@ -512,7 +540,8 @@ def preprocess_pre_tags(chapter_tag):
new_tag = BeautifulSoup(features='lxml').new_tag("span")
new_tag.attrs = pre.attrs.copy()
spans = pre.find_all("span")
to_add_br = len(spans) > 1 # if in <pre> there are multiple <span>, we need to add <br> after each content
# if in <pre> there are multiple <span>, we need to add <br> after each content
to_add_br = len(spans) > 1
for child in pre.children:
if isinstance(child, NavigableString):
@@ -520,7 +549,8 @@ def preprocess_pre_tags(chapter_tag):
sub_strings = re.split('\r\n|\n|\r', cleaned_text)
for string in sub_strings:
new_tag.append(NavigableString(string))
new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
new_tag.append(BeautifulSoup(
features='lxml').new_tag('br'))
else:
for sub_child in child.children:
if isinstance(sub_child, NavigableString):
@@ -531,7 +561,8 @@ def preprocess_pre_tags(chapter_tag):
cleaned_tag = child.extract()
new_tag.append(cleaned_tag)
if to_add_br:
new_tag.append(BeautifulSoup(features='lxml').new_tag('br'))
new_tag.append(BeautifulSoup(
features='lxml').new_tag('br'))
new_tag.attrs['style'] = "font-family: courier new,courier,monospace; " \
"font-size: 14px; white-space: nowrap;"
@@ -551,40 +582,41 @@ def preprocess_code_tags(chapter_tag):
code.attrs['style'] = 'color:#c7254e; font-size: 14px; font-family: courier new,courier,monospace;'
def _normalize_title_text(raw_title: str) -> str:
    """Strip markup from a chapter title and collapse its whitespace (numbering is kept)."""
    soup = BeautifulSoup(raw_title, features='lxml')
    # ``.string`` is None when the parsed title is empty or has several child
    # nodes (e.g. inline markup); fall back to the concatenated text so that
    # re.sub below does not fail with a TypeError.
    text = soup.string if soup.string is not None else soup.get_text()
    text = re.sub(r'([\n\t\xa0])', ' ', text)
    return re.sub(r' +', ' ', text).rstrip()


def prepare_title(title_of_chapter: str) -> str:
    """
    Final processing/cleaning function for a chapter title.

    :param title_of_chapter: title of the chapter, may contain markup
    :return: plain-text title with newlines/tabs/nbsp replaced by single spaces,
             trailing whitespace removed and passed through ``clean_title_from_numbering``
    """
    return clean_title_from_numbering(_normalize_title_text(title_of_chapter))


def prepare_content(title_str: str, content_tag: BeautifulSoup, remove_title_from_chapter: bool) -> str:
    """
    Final processing/cleaning function for a chapter body.

    The soup object is modified in place.

    :param title_str: cleaned title, used to find the heading to remove
    :param content_tag: soup object with the chapter content
    :param remove_title_from_chapter: if True, the heading repeating the title is removed
    :return: the processed ``content_tag`` serialized with ``str()``
    """
    # 0. cleaning \n: collect first, extract afterwards, so the list being
    # scanned is not modified while it is iterated
    blank_nodes = [
        child for child in content_tag.contents
        if isinstance(child, NavigableString)
        and re.sub(r'([\n\t])', '', child.string) == ''
    ]
    for node in blank_nodes:
        node.extract()

    # 1. heading removal
    if remove_title_from_chapter:
        clean_headings_content(content_tag, title_str)

    process_lists(content_tag)
    preprocess_table(content_tag)
    preprocess_code_tags(content_tag)
    preprocess_pre_tags(content_tag)
    preprocess_block_tags(content_tag)

    # 2. class removal
    # NOTE(review): the comparison only spares a tag whose ``class`` is stored
    # as one of these plain strings; a list-valued ``class`` never matches and
    # is always removed - confirm this is intended.
    kept_classes = ['link-anchor', 'footnote-element']
    for tag in content_tag.find_all(recursive=True):
        tag_class = tag.attrs.get('class')
        if tag_class and tag_class not in kept_classes:
            del tag.attrs['class']

    return str(content_tag)


def prepare_title_and_content(title, chapter_tag: BeautifulSoup, remove_title_from_chapter) -> Tuple[str, str]:
    """
    Backward-compatible wrapper around ``prepare_title`` / ``prepare_content``.

    :param title: title of the chapter
    :param chapter_tag: soup object
    :param remove_title_from_chapter: bool
    :return: tuple[str, str] - cleaned title and serialized chapter content
    """
    # The heading lookup uses the title *before* numbering is stripped, the
    # returned title has the numbering stripped.
    title_str = _normalize_title_text(title)
    content_str = prepare_content(title_str, chapter_tag, remove_title_from_chapter)
    return clean_title_from_numbering(title_str), content_str