forked from LiveCarta/BookConverter
Synchronize with dev branch
This commit is contained in:
@@ -132,6 +132,8 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
|
||||||
|
|
||||||
|
<p style="font-weight:600"> foo </p> -> <p><strong>foo</strong></p>
|
||||||
"""
|
"""
|
||||||
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
|
||||||
('font-weight', 'bold'): 'strong',
|
('font-weight', 'bold'): 'strong',
|
||||||
@@ -231,22 +233,22 @@ class TagStyleConverter:
|
|||||||
# if tag had already had inline style, add this to style parsed from css
|
# if tag had already had inline style, add this to style parsed from css
|
||||||
if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style:
|
if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style:
|
||||||
style += self.tag.attrs['style']
|
style += self.tag.attrs['style']
|
||||||
print(style)
|
|
||||||
return style
|
return style
|
||||||
|
|
||||||
def change_attrs_with_corresponding_tags(self):
|
def change_attrs_with_corresponding_tags(self):
|
||||||
# adds <b>, <u>, <sup>, etc
|
# adds <b>, <u>, <sup>, etc
|
||||||
to_remove = check_style_to_be_tag(self.style)
|
to_remove = check_style_to_be_tag(self.style)
|
||||||
new_tags = []
|
new_tags = []
|
||||||
for i, (p, v) in enumerate(to_remove):
|
for i, (attr, value) in enumerate(to_remove):
|
||||||
s = f'{p}:{v};'
|
s = f'{attr}:{value};'
|
||||||
self.style = self.style.replace(s, '')
|
self.style = self.style.replace(s, '')
|
||||||
self.style = self.style.strip()
|
self.style = self.style.strip()
|
||||||
if i == 0:
|
if i == 0:
|
||||||
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)]
|
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||||
new_tags.append(self.tag)
|
new_tags.append(self.tag)
|
||||||
else:
|
else:
|
||||||
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)]
|
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
|
||||||
new_tag = BeautifulSoup(features='lxml').new_tag(name)
|
new_tag = BeautifulSoup(features='lxml').new_tag(name)
|
||||||
new_tags[-1].wrap(new_tag)
|
new_tags[-1].wrap(new_tag)
|
||||||
new_tags.append(new_tag)
|
new_tags.append(new_tag)
|
||||||
@@ -267,34 +269,34 @@ class TagStyleConverter:
|
|||||||
return top_tag
|
return top_tag
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def wrap_p_to_save_style_attrs(t):
|
def wrap_span_in_p_to_save_style_attrs(tag):
|
||||||
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
|
||||||
if attr not in ['text-align', 'text-indent']]
|
if attr not in ['text-align', 'text-indent']]
|
||||||
|
|
||||||
if t.name == 'p' and t.attrs.get('style'):
|
if tag.name == 'p' and tag.attrs.get('style'):
|
||||||
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_p]
|
styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
|
||||||
if any(check):
|
if any(styles_to_be_saved):
|
||||||
t.name = 'span'
|
tag.name = 'span'
|
||||||
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
p_tag = BeautifulSoup(features='lxml').new_tag('p')
|
||||||
old_style = t.attrs['style']
|
span_style = tag.attrs['style']
|
||||||
new_style = ''
|
p_style = ''
|
||||||
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
|
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
|
||||||
has_p_style_attrs = re.search(possible_p_attrs_regexp, old_style)
|
has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
|
||||||
if has_p_style_attrs:
|
if has_p_style_attrs:
|
||||||
if has_p_style_attrs.group(1):
|
if has_p_style_attrs.group(1):
|
||||||
new_style += has_p_style_attrs.group(1)
|
p_style += has_p_style_attrs.group(1)
|
||||||
old_style = old_style.replace(has_p_style_attrs.group(1), '')
|
span_style = span_style.replace(has_p_style_attrs.group(1), '')
|
||||||
if has_p_style_attrs.group(3):
|
if has_p_style_attrs.group(3):
|
||||||
new_style += has_p_style_attrs.group(3)
|
p_style += has_p_style_attrs.group(3)
|
||||||
old_style = old_style.replace(has_p_style_attrs.group(3), '')
|
span_style = span_style.replace(has_p_style_attrs.group(3), '')
|
||||||
|
|
||||||
p_tag.attrs['style'] = new_style
|
p_tag.attrs['style'] = p_style
|
||||||
|
|
||||||
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
|
||||||
has_li_style_attr = re.search(li_attrs_regexp, old_style)
|
has_li_style_attr = re.search(li_attrs_regexp, span_style)
|
||||||
old_style = old_style if not has_li_style_attr else old_style.replace(has_li_style_attr.group(1), '')
|
span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '')
|
||||||
t.attrs['style'] = old_style
|
tag.attrs['style'] = span_style
|
||||||
t.wrap(p_tag)
|
tag.wrap(p_tag)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def add_span_to_save_style_attrs_in_li(t):
|
def add_span_to_save_style_attrs_in_li(t):
|
||||||
@@ -354,25 +356,24 @@ class TagStyleConverter:
|
|||||||
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
|
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
|
||||||
|
|
||||||
def convert_initial_tag(self):
|
def convert_initial_tag(self):
|
||||||
del self.tag.attrs['livecarta_id']
|
|
||||||
self.tag = self.change_attrs_with_corresponding_tags()
|
self.tag = self.change_attrs_with_corresponding_tags()
|
||||||
self.wrap_p_to_save_style_attrs(self.tag)
|
self.wrap_span_in_p_to_save_style_attrs(self.tag)
|
||||||
self.add_span_to_save_style_attrs_in_li(self.tag)
|
self.add_span_to_save_style_attrs_in_li(self.tag)
|
||||||
self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
|
self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
|
||||||
self.add_span_to_save_style_attrs(self.tag)
|
self.add_span_to_save_style_attrs(self.tag)
|
||||||
return self.tag
|
return self.tag
|
||||||
|
|
||||||
|
|
||||||
def add_inline_style_to_html_soup(soup1, css_text):
|
def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str):
|
||||||
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
|
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
|
||||||
livecarta_tmp_ids = []
|
livecarta_tmp_ids = []
|
||||||
h_regex = f'(^h[1-9]$)'
|
h_regex = f'(^h[1-9]$)'
|
||||||
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
|
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
|
||||||
elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
|
tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
|
||||||
for i, x in enumerate(elements_with_possible_style_attr):
|
for i, x in enumerate(tags_with_possible_style_attr):
|
||||||
x.attrs['livecarta_id'] = i
|
x.attrs['livecarta_id'] = i
|
||||||
livecarta_tmp_ids.append(i)
|
livecarta_tmp_ids.append(i)
|
||||||
html_with_inline_style = transform(str(soup1), css_text=css_text,
|
html_with_inline_style: str = transform(str(soup1), css_text=css_text,
|
||||||
remove_classes=False,
|
remove_classes=False,
|
||||||
external_styles=False,
|
external_styles=False,
|
||||||
allow_network=False,
|
allow_network=False,
|
||||||
@@ -382,11 +383,11 @@ def add_inline_style_to_html_soup(soup1, css_text):
|
|||||||
for i in livecarta_tmp_ids:
|
for i in livecarta_tmp_ids:
|
||||||
tag = soup1.find(attrs={'livecarta_id': i})
|
tag = soup1.find(attrs={'livecarta_id': i})
|
||||||
tag_with_style = soup2.find(attrs={'livecarta_id': i})
|
tag_with_style = soup2.find(attrs={'livecarta_id': i})
|
||||||
|
del tag.attrs['livecarta_id']
|
||||||
if tag_with_style.attrs.get('style'):
|
if tag_with_style.attrs.get('style'):
|
||||||
style_converter = TagStyleConverter(tag, tag_with_style)
|
style_converter = TagStyleConverter(tag, tag_with_style)
|
||||||
style_converter.convert_initial_tag()
|
style_converter.convert_initial_tag()
|
||||||
else:
|
|
||||||
del tag.attrs['livecarta_id']
|
|
||||||
return soup1
|
return soup1
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from threading import Event
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from html_docx_preprocessor import HTMLDocxPreprocessor
|
from html_docx_preprocessor import HTMLDocxPreprocessor
|
||||||
from libra_html2json_converter import LibraHTML2JSONConverter
|
from libra_html2json_converter import LibraHTML2JSONConverter
|
||||||
from src.book_solver import BookSolver
|
from solver import BookSolver
|
||||||
|
|
||||||
|
|
||||||
class DocxBook(BookSolver):
|
class DocxBook(BookSolver):
|
||||||
|
|||||||
@@ -28,10 +28,28 @@ class EpubConverter:
|
|||||||
self.access = access
|
self.access = access
|
||||||
self.logger: BookLogger = logger
|
self.logger: BookLogger = logger
|
||||||
self.ebooklib_book = epub.read_epub(file)
|
self.ebooklib_book = epub.read_epub(file)
|
||||||
|
|
||||||
|
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
|
||||||
|
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
|
||||||
|
self.added_to_toc_hrefs = set() # enumerate all file paths that where added to TOC
|
||||||
|
|
||||||
|
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
|
||||||
|
# key = -1 for top level NavPoints
|
||||||
|
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
|
||||||
|
|
||||||
|
# container for all chapters soup objects
|
||||||
|
# here soup object is only part of the .xhtml file
|
||||||
|
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
|
||||||
|
|
||||||
self.internal_anchors = set()
|
self.internal_anchors = set()
|
||||||
|
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
|
||||||
|
self.href2img_bytes = {} # file path to bytes
|
||||||
|
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path
|
||||||
|
self.footnotes_contents: List[str] = [] # to be sent on server as is
|
||||||
|
self.noterefs: List[Tag] = [] # start of the footnote
|
||||||
|
self.footnotes: List[Tag] = [] # end of the footnote
|
||||||
|
|
||||||
self.logger.log('Image processing.')
|
self.logger.log('Image processing.')
|
||||||
self.href2img_bytes = {}
|
|
||||||
self.old_image_path2_aws_path = {}
|
|
||||||
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
|
||||||
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
|
||||||
file_name = x.file_name
|
file_name = x.file_name
|
||||||
@@ -39,8 +57,7 @@ class EpubConverter:
|
|||||||
self.href2img_bytes[file_name] = content
|
self.href2img_bytes[file_name] = content
|
||||||
|
|
||||||
self.logger.log('HTML files reading.')
|
self.logger.log('HTML files reading.')
|
||||||
self.id_anchor_exist_in_nav_points = False
|
self.href2soup_html = self.build_href2soup_content()
|
||||||
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
|
|
||||||
|
|
||||||
self.logger.log('CSS files processing.')
|
self.logger.log('CSS files processing.')
|
||||||
self.css_href2content, self.html_href2css_href = self.build_css_content()
|
self.css_href2content, self.html_href2css_href = self.build_css_content()
|
||||||
@@ -48,9 +65,6 @@ class EpubConverter:
|
|||||||
self.add_css_styles2soup()
|
self.add_css_styles2soup()
|
||||||
|
|
||||||
self.logger.log('Footnotes processing.')
|
self.logger.log('Footnotes processing.')
|
||||||
self.footnotes_contents: List[str] = []
|
|
||||||
self.noterefs = []
|
|
||||||
self.footnotes: List[Tag] = []
|
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
|
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
|
||||||
self.href2soup_html)
|
self.href2soup_html)
|
||||||
@@ -65,19 +79,18 @@ class EpubConverter:
|
|||||||
|
|
||||||
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
|
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
|
||||||
self.logger.log('TOC processing.')
|
self.logger.log('TOC processing.')
|
||||||
self.href2subchapter_ids = defaultdict(list)
|
|
||||||
self.added_to_toc_hrefs = set()
|
|
||||||
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points
|
|
||||||
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
|
||||||
# build simple toc from spine if needed
|
# build simple toc from spine if needed
|
||||||
if not self.is_toc_valid():
|
if self.is_toc_empty():
|
||||||
self.build_adjacency_list_from_spine()
|
self.build_adjacency_list_from_spine()
|
||||||
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
|
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
|
||||||
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
self.logger.log(f'Html documents not added to TOC: {not_added}.')
|
||||||
self.add_not_added_files_to_adjacency_list(not_added)
|
self.add_not_added_files_to_adjacency_list(not_added)
|
||||||
|
self.logger.log(f'Html internal links and structure processing.')
|
||||||
|
self.label_chapters_ids_with_tmp_id()
|
||||||
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
|
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
|
||||||
self.process_internal_links()
|
self.process_internal_links()
|
||||||
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
|
self.logger.log(f'Building chapters content.')
|
||||||
self.define_chapters_content()
|
self.define_chapters_content()
|
||||||
|
|
||||||
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
|
||||||
@@ -129,7 +142,7 @@ class EpubConverter:
|
|||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
if self.html_href2css_href.get(href):
|
if self.html_href2css_href.get(href):
|
||||||
css: str = self.css_href2content[self.html_href2css_href[href]]
|
css: str = self.css_href2content[self.html_href2css_href[href]]
|
||||||
content = self.href2soup_html[href]
|
content: BeautifulSoup = self.href2soup_html[href]
|
||||||
content = add_inline_style_to_html_soup(content, css)
|
content = add_inline_style_to_html_soup(content, css)
|
||||||
self.href2soup_html[href] = content
|
self.href2soup_html[href] = content
|
||||||
|
|
||||||
@@ -142,7 +155,7 @@ class EpubConverter:
|
|||||||
|
|
||||||
def build_adjacency_list_from_toc(self, element, lvl=0):
|
def build_adjacency_list_from_toc(self, element, lvl=0):
|
||||||
"""
|
"""
|
||||||
self.adjacency_list builds based on TOC nested structure
|
self.adjacency_list builds based on TOC nested structure, got from self.ebooklib_book.toc
|
||||||
|
|
||||||
key = -1 if root, value = None if leaf
|
key = -1 if root, value = None if leaf
|
||||||
|
|
||||||
@@ -152,29 +165,29 @@ class EpubConverter:
|
|||||||
|
|
||||||
if isinstance(element, Link):
|
if isinstance(element, Link):
|
||||||
# todo: check if link exists
|
# todo: check if link exists
|
||||||
node = NavPoint(element)
|
nav_point = NavPoint(element)
|
||||||
if node.id:
|
if nav_point.id:
|
||||||
self.id_anchor_exist_in_nav_points = True
|
self.id_anchor_exist_in_nav_points = True
|
||||||
self.href2subchapter_ids[node.href].append(node.id)
|
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
|
||||||
self.adjacency_list[node] = None
|
self.adjacency_list[nav_point] = None
|
||||||
self.added_to_toc_hrefs.add(node.href)
|
self.added_to_toc_hrefs.add(nav_point.href)
|
||||||
return node
|
return nav_point
|
||||||
|
|
||||||
elif isinstance(element, tuple):
|
elif isinstance(element, tuple):
|
||||||
first, second = element
|
first, second = element
|
||||||
assert isinstance(first, Section)
|
assert isinstance(first, Section)
|
||||||
node = NavPoint(first)
|
nav_point = NavPoint(first)
|
||||||
if node.id:
|
if nav_point.id:
|
||||||
self.id_anchor_exist_in_nav_points = True
|
self.id_anchor_exist_in_nav_points = True
|
||||||
self.href2subchapter_ids[node.href].append(node.id)
|
self.href2subchapter_ids[nav_point.href].append(nav_point.id)
|
||||||
|
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
for i in second:
|
for i in second:
|
||||||
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
|
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
|
||||||
|
|
||||||
self.adjacency_list[node] = sub_nodes
|
self.adjacency_list[nav_point] = sub_nodes
|
||||||
self.added_to_toc_hrefs.add(node.href)
|
self.added_to_toc_hrefs.add(nav_point.href)
|
||||||
return node
|
return nav_point
|
||||||
|
|
||||||
elif isinstance(element, list) and (lvl == 0):
|
elif isinstance(element, list) and (lvl == 0):
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
@@ -186,10 +199,10 @@ class EpubConverter:
|
|||||||
else:
|
else:
|
||||||
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
|
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
|
||||||
|
|
||||||
def is_toc_valid(self):
|
def is_toc_empty(self):
|
||||||
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
|
||||||
return False
|
|
||||||
return True
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def build_adjacency_list_from_spine(self):
|
def build_adjacency_list_from_spine(self):
|
||||||
manifest_id2href = self.build_manifest_id2href()
|
manifest_id2href = self.build_manifest_id2href()
|
||||||
@@ -197,18 +210,17 @@ class EpubConverter:
|
|||||||
-1: []
|
-1: []
|
||||||
}
|
}
|
||||||
for id_, _ in self.ebooklib_book.spine:
|
for id_, _ in self.ebooklib_book.spine:
|
||||||
node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
|
nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
|
||||||
self.adjacency_list[-1].append(node)
|
self.adjacency_list[-1].append(nav_point)
|
||||||
self.added_to_toc_hrefs.add(node.href)
|
self.added_to_toc_hrefs.add(nav_point.href)
|
||||||
|
|
||||||
def add_not_added_files_to_adjacency_list(self, not_added):
|
def add_not_added_files_to_adjacency_list(self, not_added):
|
||||||
for i, file in enumerate(not_added):
|
for i, file in enumerate(not_added):
|
||||||
node = NavPoint(Section(f'To check #{i}, filename: {file}', file))
|
nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
|
||||||
self.adjacency_list[-1].append(node)
|
self.adjacency_list[-1].append(nav_point)
|
||||||
self.added_to_toc_hrefs.add(file)
|
self.added_to_toc_hrefs.add(file)
|
||||||
|
|
||||||
def process_html_soup_structure_to_line(self):
|
def label_chapters_ids_with_tmp_id(self):
|
||||||
# mark
|
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
ids = self.href2subchapter_ids[href]
|
ids = self.href2subchapter_ids[href]
|
||||||
for i in ids:
|
for i in ids:
|
||||||
@@ -219,6 +231,7 @@ class EpubConverter:
|
|||||||
new_h.attrs['id'] = i
|
new_h.attrs['id'] = i
|
||||||
tag.insert_before(new_h)
|
tag.insert_before(new_h)
|
||||||
|
|
||||||
|
def process_html_soup_structure_to_line(self):
|
||||||
# go to line structure
|
# go to line structure
|
||||||
for href in self.href2soup_html:
|
for href in self.href2soup_html:
|
||||||
soup = self.href2soup_html[href]
|
soup = self.href2soup_html[href]
|
||||||
@@ -236,18 +249,31 @@ class EpubConverter:
|
|||||||
new_anchor_span.string = "\xa0"
|
new_anchor_span.string = "\xa0"
|
||||||
return new_anchor_span
|
return new_anchor_span
|
||||||
|
|
||||||
def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag):
|
def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
|
||||||
dir_name = os.path.dirname(href)
|
"""
|
||||||
|
TOC: a/b/c.xhtml
|
||||||
|
|
||||||
|
b/c.xhtml -> a/b/c.xhtml
|
||||||
|
c.xhtml -> a/b/c.xhtml
|
||||||
|
|
||||||
|
Used to find full path to file that is parsed from tag link
|
||||||
|
|
||||||
|
:param cur_file_path: path to current file with tag link
|
||||||
|
:param href_in_link: filename got from tag link, like file1.xhtml
|
||||||
|
:param internal_link_tag: tag object that is parsed now
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
dir_name = os.path.dirname(cur_file_path)
|
||||||
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
|
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
|
||||||
full_path = [path for path in self.added_to_toc_hrefs if normed_path in path]
|
full_path = [path for path in self.added_to_toc_hrefs if normed_path in path]
|
||||||
if not full_path:
|
if not full_path:
|
||||||
self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. '
|
self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
|
||||||
f'While processing href in {internal_link_tag}.')
|
f'While processing href in {internal_link_tag}.')
|
||||||
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
internal_link_tag.attrs['converter-mark'] = 'bad-link'
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if len(full_path) > 1:
|
if len(full_path) > 1:
|
||||||
self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}'
|
self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
|
||||||
f' while {internal_link_tag} processing. The first one will be chosen.')
|
f' while {internal_link_tag} processing. The first one will be chosen.')
|
||||||
|
|
||||||
return full_path[0]
|
return full_path[0]
|
||||||
@@ -272,7 +298,7 @@ class EpubConverter:
|
|||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
|
||||||
a_tag_href = internal_link_tag.attrs['href']
|
a_tag_href = internal_link_tag.attrs['href']
|
||||||
# find full path
|
# find full path
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
||||||
if not a_tag_href_matched_to_toc:
|
if not a_tag_href_matched_to_toc:
|
||||||
continue
|
continue
|
||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
|
||||||
@@ -291,9 +317,12 @@ class EpubConverter:
|
|||||||
soup = self.href2soup_html[toc_href]
|
soup = self.href2soup_html[toc_href]
|
||||||
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
|
||||||
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
|
||||||
a_tag_href = a_tag_href or toc_href
|
|
||||||
# find full path
|
# find full path
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
|
if a_tag_href:
|
||||||
|
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
|
||||||
|
internal_link_tag)
|
||||||
|
else:
|
||||||
|
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
|
||||||
if not a_tag_href_matched_to_toc:
|
if not a_tag_href_matched_to_toc:
|
||||||
continue
|
continue
|
||||||
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
|
||||||
@@ -326,7 +355,7 @@ class EpubConverter:
|
|||||||
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
|
||||||
f' Old id={a_tag_id}')
|
f' Old id={a_tag_id}')
|
||||||
|
|
||||||
def build_one_chapter(self, node):
|
def build_one_chapter(self, nav_point):
|
||||||
"""
|
"""
|
||||||
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
|
||||||
|
|
||||||
@@ -339,34 +368,34 @@ class EpubConverter:
|
|||||||
and id of the next chapter/subchapter
|
and id of the next chapter/subchapter
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if node.id:
|
if nav_point.id:
|
||||||
soup = self.href2soup_html[node.href]
|
soup = self.href2soup_html[nav_point.href]
|
||||||
chapter_tags = get_tags_between_chapter_marks(first_id=node.id, href=node.href, html_soup=soup)
|
chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
|
||||||
new_tree = BeautifulSoup('', 'html.parser')
|
new_tree = BeautifulSoup('', 'html.parser')
|
||||||
for tag in chapter_tags:
|
for tag in chapter_tags:
|
||||||
new_tree.append(tag)
|
new_tree.append(tag)
|
||||||
self.href_chapter_id2soup_html[(node.href, node.id)] = new_tree
|
self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
|
||||||
|
|
||||||
if self.adjacency_list.get(node):
|
if self.adjacency_list.get(nav_point):
|
||||||
for sub_node in self.adjacency_list[node]:
|
for sub_node in self.adjacency_list[nav_point]:
|
||||||
self.build_one_chapter(sub_node)
|
self.build_one_chapter(sub_node)
|
||||||
|
|
||||||
def define_chapters_content(self):
|
def define_chapters_content(self):
|
||||||
nav_points = self.adjacency_list[-1]
|
top_level_nav_points = self.adjacency_list[-1]
|
||||||
if self.id_anchor_exist_in_nav_points:
|
if self.id_anchor_exist_in_nav_points:
|
||||||
for point in nav_points:
|
for point in top_level_nav_points:
|
||||||
self.build_one_chapter(point)
|
self.build_one_chapter(point)
|
||||||
|
|
||||||
def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem:
|
def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
|
||||||
title = node.title
|
title = nav_point.title
|
||||||
if node.id:
|
if nav_point.id:
|
||||||
content: BeautifulSoup = self.href_chapter_id2soup_html[(node.href, node.id)]
|
content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
|
||||||
else:
|
else:
|
||||||
content: BeautifulSoup = self.href2soup_html[node.href]
|
content: BeautifulSoup = self.href2soup_html[nav_point.href]
|
||||||
|
|
||||||
self.old_image_path2_aws_path = update_src_links_in_images(content,
|
self.old_image_path2_aws_path = update_src_links_in_images(content,
|
||||||
self.href2img_bytes,
|
self.href2img_bytes,
|
||||||
path_to_html=node.href,
|
path_to_html=nav_point.href,
|
||||||
access=self.access,
|
access=self.access,
|
||||||
path2aws_path=self.old_image_path2_aws_path)
|
path2aws_path=self.old_image_path2_aws_path)
|
||||||
|
|
||||||
@@ -376,8 +405,8 @@ class EpubConverter:
|
|||||||
|
|
||||||
sub_nodes = []
|
sub_nodes = []
|
||||||
# warning! not EpubHtmlItems won't be added to chapter
|
# warning! not EpubHtmlItems won't be added to chapter
|
||||||
if self.adjacency_list.get(node):
|
if self.adjacency_list.get(nav_point):
|
||||||
for sub_node in self.adjacency_list[node]:
|
for sub_node in self.adjacency_list[nav_point]:
|
||||||
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1)
|
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1)
|
||||||
sub_nodes.append(sub_chapter_item)
|
sub_nodes.append(sub_chapter_item)
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from epub_converter import EpubConverter
|
from epub_converter import EpubConverter
|
||||||
from src.book_solver import BookSolver
|
from solver import BookSolver
|
||||||
|
|
||||||
|
|
||||||
class EpubBook(BookSolver):
|
class EpubBook(BookSolver):
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from typing import List
|
|||||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
|
|
||||||
from livecarta_config import LawCartaConfig
|
from livecarta_config import LawCartaConfig
|
||||||
from src.util.helpers import BookLogger, BookStatusWrapper
|
from util.helpers import BookLogger, BookStatusWrapper
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocxPreprocessor:
|
class HTMLDocxPreprocessor:
|
||||||
|
|||||||
139
src/solver.py
Normal file
139
src/solver.py
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
""" This is Main Abstract class for solving a task of a book conversion
|
||||||
|
|
||||||
|
Having an id of coming book, gets book from server, runs conversion.
|
||||||
|
In parallel it updates status of a book conversion on admin panel.
|
||||||
|
Finally sends result to server.
|
||||||
|
Result is a json, JSON schema in book_schema.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
from abc import abstractmethod, ABCMeta
|
||||||
|
|
||||||
|
from livecarta_config import LawCartaConfig
|
||||||
|
from util.helpers import BookLogger, BookStatusWrapper
|
||||||
|
|
||||||
|
|
||||||
|
class BookSolver:
    """Base class that drives the conversion of one book into a JSON file.

    The workflow implemented here is: download the book file through
    ``access``, convert it via ``get_converted_book`` (supplied by a
    subclass), write the resulting dict to a ``.json`` file and send the
    same dict back through ``access``.
    """

    # NOTE(review): ``__metaclass__`` is Python 2 syntax; Python 3 ignores it,
    # so ``@abstractmethod`` below is NOT enforced at instantiation time.
    # Declaring ``class BookSolver(metaclass=ABCMeta)`` would enforce it but
    # could break code that instantiates this class directly -- confirm
    # against the callers before changing.
    __metaclass__ = ABCMeta

    def __init__(self, book_id=0, access=None, main_logger=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """Store the book identity and create the logger and status wrapper.

        :param book_id: id of the book; used in log names and file paths.
        :param access: object used to talk to the server (``get_doc``,
            ``send_book`` and ``url`` are used by this class).
        :param main_logger: passed through to ``BookLogger``.
        :param logging_format: passed through to ``BookLogger``.
        """
        self.book_type = None  # extension-like type; read when building paths and messages
        self.book_id = book_id
        self.access = access
        self.file_path = None  # path to book file, appears after downloading from server
        self.output_path = None  # path to json file
        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)

        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    @staticmethod
    def _project_root():
        """Return the directory one level above the one containing this file."""
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def save_book_file(self, content):
        """
        Save binary content of file to .docx/.epub.

        The file is written to ``<root>/<book_type>/<book_id>/<book_id>.<book_type>``
        and ``self.file_path`` is set to that location on success.

        :param content: binary content of the file.
        :raises Exception: any error raised while writing is logged and re-raised.
        """
        folder_path = os.path.join(self._project_root(), f'{self.book_type}/{self.book_id}')
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
        try:
            with open(file_path, 'wb+') as file:
                file.write(content)
            self.logger_object.log(f'File was saved to folder: {folder_path}.')
        except Exception:
            self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise  # bare raise keeps the original traceback intact

        self.file_path = pathlib.Path(file_path)

    def get_book_file(self):
        """
        Method for getting and saving book from server.

        :raises FileNotFoundError: logged and re-raised when it is raised
            while fetching or saving the file.
        """
        try:
            self.logger_object.log(f'Start receiving file from server. '
                                   f'URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_book_file(content)
        except FileNotFoundError:
            # Report the actual book type instead of a hard-coded "docx".
            self.logger_object.log(f"Can't get {self.book_type} from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def check_output_directory(self):
        """Make sure ``self.output_path`` is a ``Path`` whose file exists.

        When no output path was set, it defaults to
        ``<root>/json/<book_id>.json``. Parent directories and the file
        itself are created if missing.
        """
        if self.output_path is None:
            self.output_path = os.path.join(self._project_root(), f'json/{self.book_id}.json')

        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def write_to_json(self, content: dict):
        """Dump ``content`` as UTF-8 JSON into ``self.output_path``.

        Errors are logged and NOT re-raised (existing best-effort behaviour).

        :param content: JSON-serialisable dict to store.
        """
        self.check_output_directory()
        try:
            with open(self.output_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR)

    def send_json_content_to_server(self, content: dict):
        """Send ``content`` to the server via ``self.access.send_book``.

        :param content: dict to send.
        :raises Exception: any error is logged, the status is set to error,
            and the exception is re-raised.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

    @abstractmethod
    def get_converted_book(self):
        """Convert the book and return the resulting dict.

        This base implementation only logs, sets the "generating" status and
        returns an empty dict; subclasses are expected to override it.
        """
        self.logger_object.log('Beginning of processing json output.')
        self.status_wrapper.set_generating()
        return {}

    def test_conversion(self):
        """Convert a local file (no server round-trip) and write the JSON.

        The input is expected at ``<root>/<book_type>/<book_id>.<book_type>``.
        """
        self.logger_object.log('Beginning of the test.')

        folder_path = os.path.join(self._project_root(), f'{self.book_type}')
        file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
        self.file_path = pathlib.Path(file_path)
        self.logger_object.log(f'Test on {self.book_type}: {self.file_path}')
        content_dict = self.get_converted_book()
        self.write_to_json(content_dict)
        self.logger_object.log('End of the test.')

    def conversion(self):
        """Run the full pipeline: download, convert, write JSON, send to server.

        :raises Exception: any error sets the error status, is logged, and
            is re-raised.
        """
        try:
            self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
            self.get_book_file()
            self.status_wrapper.set_processing()
            content_dict = self.get_converted_book()
            self.write_to_json(content_dict)
            self.send_json_content_to_server(content_dict)
            self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')

        except Exception as exc:
            self.status_wrapper.set_error()
            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
            self.logger_object.log_error_to_main_log(str(exc))
            raise
|
||||||
|
|
||||||
Reference in New Issue
Block a user