Synchronize with dev branch

This commit is contained in:
Kiryl
2021-09-24 12:34:54 +03:00
parent 331b70316b
commit 21ed86920b
6 changed files with 269 additions and 100 deletions

View File

@@ -23,9 +23,9 @@ sizes_px = ['10px', '10px', '11px', '12px', '13px', '14px', '15px', '16px', '17p
'35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px', '35px', '36px', '37px', '38px', '39px', '40px', '41px', '42px', '43px', '44px', '45px', '46px', '47px',
'48px', '49px', '50px', '64px', '72px'] '48px', '49px', '50px', '64px', '72px']
list_types = ['circle', 'disc', 'armenian','decimal', list_types = ['circle', 'disc', 'armenian', 'decimal',
'decimal-leading-zero', 'georgian', 'lower-alpha','lower-latin', 'decimal-leading-zero', 'georgian', 'lower-alpha', 'lower-latin',
'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none' ] 'lower-roman', 'upper-alpha', 'upper-latin', 'upper-roman', 'none']
def convert_font_size(value): def convert_font_size(value):
@@ -132,6 +132,8 @@ LIVECARTA_STYLE_ATTRS_MAPPING = {
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag } LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { (property, value): tag }
<p style="font-weight:600> foo </p> -> <p><strong>foo</strong></p>
""" """
LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = { LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG = {
('font-weight', 'bold'): 'strong', ('font-weight', 'bold'): 'strong',
@@ -231,22 +233,22 @@ class TagStyleConverter:
# if tag had already had inline style, add this to style parsed from css # if tag had already had inline style, add this to style parsed from css
if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style: if self.tag.attrs.get('style') and self.tag.attrs['style'] not in style:
style += self.tag.attrs['style'] style += self.tag.attrs['style']
print(style)
return style return style
def change_attrs_with_corresponding_tags(self): def change_attrs_with_corresponding_tags(self):
# adds <b>, <u>, <sup>, etc # adds <b>, <u>, <sup>, etc
to_remove = check_style_to_be_tag(self.style) to_remove = check_style_to_be_tag(self.style)
new_tags = [] new_tags = []
for i, (p, v) in enumerate(to_remove): for i, (attr, value) in enumerate(to_remove):
s = f'{p}:{v};' s = f'{attr}:{value};'
self.style = self.style.replace(s, '') self.style = self.style.replace(s, '')
self.style = self.style.strip() self.style = self.style.strip()
if i == 0: if i == 0:
self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)] self.tag.name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tags.append(self.tag) new_tags.append(self.tag)
else: else:
name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(p, v)] name = LIVECARTA_STYLE_ATTRS_SHOULD_BE_TAG[(attr, value)]
new_tag = BeautifulSoup(features='lxml').new_tag(name) new_tag = BeautifulSoup(features='lxml').new_tag(name)
new_tags[-1].wrap(new_tag) new_tags[-1].wrap(new_tag)
new_tags.append(new_tag) new_tags.append(new_tag)
@@ -267,34 +269,34 @@ class TagStyleConverter:
return top_tag return top_tag
@staticmethod @staticmethod
def wrap_p_to_save_style_attrs(t): def wrap_span_in_p_to_save_style_attrs(tag):
styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS styles_cant_be_in_p = [attr for attr in LIVECARTA_STYLE_ATTRS
if attr not in ['text-align', 'text-indent']] if attr not in ['text-align', 'text-indent']]
if t.name == 'p' and t.attrs.get('style'): if tag.name == 'p' and tag.attrs.get('style'):
check = [attr in t.attrs.get('style') for attr in styles_cant_be_in_p] styles_to_be_saved = [attr in tag.attrs.get('style') for attr in styles_cant_be_in_p]
if any(check): if any(styles_to_be_saved):
t.name = 'span' tag.name = 'span'
p_tag = BeautifulSoup(features='lxml').new_tag('p') p_tag = BeautifulSoup(features='lxml').new_tag('p')
old_style = t.attrs['style'] span_style = tag.attrs['style']
new_style = '' p_style = ''
possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)') possible_p_attrs_regexp = re.compile(r'(text-align:(\w+);)|(text-indent:(\w+);)')
has_p_style_attrs = re.search(possible_p_attrs_regexp, old_style) has_p_style_attrs = re.search(possible_p_attrs_regexp, span_style)
if has_p_style_attrs: if has_p_style_attrs:
if has_p_style_attrs.group(1): if has_p_style_attrs.group(1):
new_style += has_p_style_attrs.group(1) p_style += has_p_style_attrs.group(1)
old_style = old_style.replace(has_p_style_attrs.group(1), '') span_style = span_style.replace(has_p_style_attrs.group(1), '')
if has_p_style_attrs.group(3): if has_p_style_attrs.group(3):
new_style += has_p_style_attrs.group(3) p_style += has_p_style_attrs.group(3)
old_style = old_style.replace(has_p_style_attrs.group(3), '') span_style = span_style.replace(has_p_style_attrs.group(3), '')
p_tag.attrs['style'] = new_style p_tag.attrs['style'] = p_style
li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)') li_attrs_regexp = re.compile(r'(list-style-type:(\w+);)')
has_li_style_attr = re.search(li_attrs_regexp, old_style) has_li_style_attr = re.search(li_attrs_regexp, span_style)
old_style = old_style if not has_li_style_attr else old_style.replace(has_li_style_attr.group(1), '') span_style = span_style if not has_li_style_attr else span_style.replace(has_li_style_attr.group(1), '')
t.attrs['style'] = old_style tag.attrs['style'] = span_style
t.wrap(p_tag) tag.wrap(p_tag)
@staticmethod @staticmethod
def add_span_to_save_style_attrs_in_li(t): def add_span_to_save_style_attrs_in_li(t):
@@ -354,39 +356,38 @@ class TagStyleConverter:
t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '') t.attrs['style'] = style if not has_li_style_attr else style.replace(has_li_style_attr.group(1), '')
def convert_initial_tag(self): def convert_initial_tag(self):
del self.tag.attrs['livecarta_id']
self.tag = self.change_attrs_with_corresponding_tags() self.tag = self.change_attrs_with_corresponding_tags()
self.wrap_p_to_save_style_attrs(self.tag) self.wrap_span_in_p_to_save_style_attrs(self.tag)
self.add_span_to_save_style_attrs_in_li(self.tag) self.add_span_to_save_style_attrs_in_li(self.tag)
self.add_span_to_save_style_attrs_in_ul_ol(self.tag) self.add_span_to_save_style_attrs_in_ul_ol(self.tag)
self.add_span_to_save_style_attrs(self.tag) self.add_span_to_save_style_attrs(self.tag)
return self.tag return self.tag
def add_inline_style_to_html_soup(soup1, css_text): def add_inline_style_to_html_soup(soup1: BeautifulSoup, css_text: str):
css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '') css_text = css_text.replace('@namespace epub "http://www.idpf.org/2007/ops";', '')
livecarta_tmp_ids = [] livecarta_tmp_ids = []
h_regex = f'(^h[1-9]$)' h_regex = f'(^h[1-9]$)'
could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex) could_have_style_in_livecarta_regexp = re.compile('(^p$)|(^span$)|(^li$)|(^ul$)|(^ol$)|(^td$)|(^th$)|' + h_regex)
elements_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp) tags_with_possible_style_attr = soup1.find_all(could_have_style_in_livecarta_regexp)
for i, x in enumerate(elements_with_possible_style_attr): for i, x in enumerate(tags_with_possible_style_attr):
x.attrs['livecarta_id'] = i x.attrs['livecarta_id'] = i
livecarta_tmp_ids.append(i) livecarta_tmp_ids.append(i)
html_with_inline_style = transform(str(soup1), css_text=css_text, html_with_inline_style: str = transform(str(soup1), css_text=css_text,
remove_classes=False, remove_classes=False,
external_styles=False, external_styles=False,
allow_network=False, allow_network=False,
disable_validation=True) disable_validation=True)
soup2 = BeautifulSoup(html_with_inline_style, features='lxml') soup2 = BeautifulSoup(html_with_inline_style, features='lxml')
for i in livecarta_tmp_ids: for i in livecarta_tmp_ids:
tag = soup1.find(attrs={'livecarta_id': i}) tag = soup1.find(attrs={'livecarta_id': i})
tag_with_style = soup2.find(attrs={'livecarta_id': i}) tag_with_style = soup2.find(attrs={'livecarta_id': i})
del tag.attrs['livecarta_id']
if tag_with_style.attrs.get('style'): if tag_with_style.attrs.get('style'):
style_converter = TagStyleConverter(tag, tag_with_style) style_converter = TagStyleConverter(tag, tag_with_style)
style_converter.convert_initial_tag() style_converter.convert_initial_tag()
else:
del tag.attrs['livecarta_id']
return soup1 return soup1

View File

@@ -8,7 +8,7 @@ from threading import Event
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from html_docx_preprocessor import HTMLDocxPreprocessor from html_docx_preprocessor import HTMLDocxPreprocessor
from libra_html2json_converter import LibraHTML2JSONConverter from libra_html2json_converter import LibraHTML2JSONConverter
from src.book_solver import BookSolver from solver import BookSolver
class DocxBook(BookSolver): class DocxBook(BookSolver):

View File

@@ -28,10 +28,28 @@ class EpubConverter:
self.access = access self.access = access
self.logger: BookLogger = logger self.logger: BookLogger = logger
self.ebooklib_book = epub.read_epub(file) self.ebooklib_book = epub.read_epub(file)
self.href2soup_html: Dict[str, BeautifulSoup] = {} # main container for all epub .xhtml files
self.href2subchapter_ids = defaultdict(list) # enumerate all subchapter id for each file
self.added_to_toc_hrefs = set() # enumerate all file paths that were added to TOC
# toc tree structure stored as adj.list (NavPoint to list of NavPoints)
# key = -1 for top level NavPoints
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {}
# container for all chapters soup objects
# here soup object is only part of the .xhtml file
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {}
self.internal_anchors = set() self.internal_anchors = set()
self.id_anchor_exist_in_nav_points = False # flag to be updated while ebooklib.toc is parsed
self.href2img_bytes = {} # file path to bytes
self.old_image_path2_aws_path = {} # file path from <a> to generated aws path
self.footnotes_contents: List[str] = [] # to be sent on server as is
self.noterefs: List[Tag] = [] # start of the footnote
self.footnotes: List[Tag] = [] # end of the footnote
self.logger.log('Image processing.') self.logger.log('Image processing.')
self.href2img_bytes = {}
self.old_image_path2_aws_path = {}
for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE), for x in chain(self.ebooklib_book.get_items_of_type(ebooklib.ITEM_IMAGE),
self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)): self.ebooklib_book.get_items_of_type(ebooklib.ITEM_COVER)):
file_name = x.file_name file_name = x.file_name
@@ -39,8 +57,7 @@ class EpubConverter:
self.href2img_bytes[file_name] = content self.href2img_bytes[file_name] = content
self.logger.log('HTML files reading.') self.logger.log('HTML files reading.')
self.id_anchor_exist_in_nav_points = False self.href2soup_html = self.build_href2soup_content()
self.href2soup_html: Dict[str, BeautifulSoup] = self.build_href2soup_content()
self.logger.log('CSS files processing.') self.logger.log('CSS files processing.')
self.css_href2content, self.html_href2css_href = self.build_css_content() self.css_href2content, self.html_href2css_href = self.build_css_content()
@@ -48,9 +65,6 @@ class EpubConverter:
self.add_css_styles2soup() self.add_css_styles2soup()
self.logger.log('Footnotes processing.') self.logger.log('Footnotes processing.')
self.footnotes_contents: List[str] = []
self.noterefs = []
self.footnotes: List[Tag] = []
for href in self.href2soup_html: for href in self.href2soup_html:
content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href], content, noterefs, footnotes_tags = preprocess_footnotes(self.href2soup_html[href],
self.href2soup_html) self.href2soup_html)
@@ -65,19 +79,18 @@ class EpubConverter:
self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.') self.logger.log(f'Added {len(self.footnotes_contents)} footnotes.')
self.logger.log('TOC processing.') self.logger.log('TOC processing.')
self.href2subchapter_ids = defaultdict(list)
self.added_to_toc_hrefs = set()
self.adjacency_list: Dict[Union[NavPoint, -1], Union[list, None]] = {} # nav_point2nav_points
self.build_adjacency_list_from_toc(self.ebooklib_book.toc) self.build_adjacency_list_from_toc(self.ebooklib_book.toc)
# build simple toc from spine if needed # build simple toc from spine if needed
if not self.is_toc_valid(): if self.is_toc_empty():
self.build_adjacency_list_from_spine() self.build_adjacency_list_from_spine()
not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs] not_added = [x for x in self.href2soup_html if x not in self.added_to_toc_hrefs]
self.logger.log(f'Html documents not added to TOC: {not_added}.') self.logger.log(f'Html documents not added to TOC: {not_added}.')
self.add_not_added_files_to_adjacency_list(not_added) self.add_not_added_files_to_adjacency_list(not_added)
self.logger.log(f'Html internal links and structure processing.')
self.label_chapters_ids_with_tmp_id()
self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed self.process_html_soup_structure_to_line() # used only after parsed toc, ids from toc needed
self.process_internal_links() self.process_internal_links()
self.href_chapter_id2soup_html: Dict[tuple, BeautifulSoup] = {} self.logger.log(f'Building chapters content.')
self.define_chapters_content() self.define_chapters_content()
def build_href2soup_content(self) -> Dict[str, BeautifulSoup]: def build_href2soup_content(self) -> Dict[str, BeautifulSoup]:
@@ -129,7 +142,7 @@ class EpubConverter:
for href in self.href2soup_html: for href in self.href2soup_html:
if self.html_href2css_href.get(href): if self.html_href2css_href.get(href):
css: str = self.css_href2content[self.html_href2css_href[href]] css: str = self.css_href2content[self.html_href2css_href[href]]
content = self.href2soup_html[href] content: BeautifulSoup = self.href2soup_html[href]
content = add_inline_style_to_html_soup(content, css) content = add_inline_style_to_html_soup(content, css)
self.href2soup_html[href] = content self.href2soup_html[href] = content
@@ -142,7 +155,7 @@ class EpubConverter:
def build_adjacency_list_from_toc(self, element, lvl=0): def build_adjacency_list_from_toc(self, element, lvl=0):
""" """
self.adjacency_list builds based on TOC nested structure self.adjacency_list builds based on TOC nested structure, got from self.ebooklib_book.toc
key = -1 if root, value = None if leaf key = -1 if root, value = None if leaf
@@ -152,29 +165,29 @@ class EpubConverter:
if isinstance(element, Link): if isinstance(element, Link):
# todo: check if link exists # todo: check if link exists
node = NavPoint(element) nav_point = NavPoint(element)
if node.id: if nav_point.id:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[node.href].append(node.id) self.href2subchapter_ids[nav_point.href].append(nav_point.id)
self.adjacency_list[node] = None self.adjacency_list[nav_point] = None
self.added_to_toc_hrefs.add(node.href) self.added_to_toc_hrefs.add(nav_point.href)
return node return nav_point
elif isinstance(element, tuple): elif isinstance(element, tuple):
first, second = element first, second = element
assert isinstance(first, Section) assert isinstance(first, Section)
node = NavPoint(first) nav_point = NavPoint(first)
if node.id: if nav_point.id:
self.id_anchor_exist_in_nav_points = True self.id_anchor_exist_in_nav_points = True
self.href2subchapter_ids[node.href].append(node.id) self.href2subchapter_ids[nav_point.href].append(nav_point.id)
sub_nodes = [] sub_nodes = []
for i in second: for i in second:
sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1)) sub_nodes.append(self.build_adjacency_list_from_toc(i, lvl + 1))
self.adjacency_list[node] = sub_nodes self.adjacency_list[nav_point] = sub_nodes
self.added_to_toc_hrefs.add(node.href) self.added_to_toc_hrefs.add(nav_point.href)
return node return nav_point
elif isinstance(element, list) and (lvl == 0): elif isinstance(element, list) and (lvl == 0):
sub_nodes = [] sub_nodes = []
@@ -186,10 +199,10 @@ class EpubConverter:
else: else:
assert 0, f'Error. Element is not tuple/Link instance: {type(element)}' assert 0, f'Error. Element is not tuple/Link instance: {type(element)}'
def is_toc_valid(self): def is_toc_empty(self):
if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None): if (self.ebooklib_book.toc is None) or (self.adjacency_list.get(-1) is None):
return False return True
return True return False
def build_adjacency_list_from_spine(self): def build_adjacency_list_from_spine(self):
manifest_id2href = self.build_manifest_id2href() manifest_id2href = self.build_manifest_id2href()
@@ -197,18 +210,17 @@ class EpubConverter:
-1: [] -1: []
} }
for id_, _ in self.ebooklib_book.spine: for id_, _ in self.ebooklib_book.spine:
node = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_])) nav_point = NavPoint(Section(manifest_id2href[id_], manifest_id2href[id_]))
self.adjacency_list[-1].append(node) self.adjacency_list[-1].append(nav_point)
self.added_to_toc_hrefs.add(node.href) self.added_to_toc_hrefs.add(nav_point.href)
def add_not_added_files_to_adjacency_list(self, not_added): def add_not_added_files_to_adjacency_list(self, not_added):
for i, file in enumerate(not_added): for i, file in enumerate(not_added):
node = NavPoint(Section(f'To check #{i}, filename: {file}', file)) nav_point = NavPoint(Section(f'To check #{i}, filename: {file}', file))
self.adjacency_list[-1].append(node) self.adjacency_list[-1].append(nav_point)
self.added_to_toc_hrefs.add(file) self.added_to_toc_hrefs.add(file)
def process_html_soup_structure_to_line(self): def label_chapters_ids_with_tmp_id(self):
# mark
for href in self.href2soup_html: for href in self.href2soup_html:
ids = self.href2subchapter_ids[href] ids = self.href2subchapter_ids[href]
for i in ids: for i in ids:
@@ -219,6 +231,7 @@ class EpubConverter:
new_h.attrs['id'] = i new_h.attrs['id'] = i
tag.insert_before(new_h) tag.insert_before(new_h)
def process_html_soup_structure_to_line(self):
# go to line structure # go to line structure
for href in self.href2soup_html: for href in self.href2soup_html:
soup = self.href2soup_html[href] soup = self.href2soup_html[href]
@@ -236,18 +249,31 @@ class EpubConverter:
new_anchor_span.string = "\xa0" new_anchor_span.string = "\xa0"
return new_anchor_span return new_anchor_span
def match_href_to_path_from_toc(self, href, href_in_link, internal_link_tag): def _match_href_to_path_from_toc(self, cur_file_path, href_in_link, internal_link_tag):
dir_name = os.path.dirname(href) """
TOC: a/b/c.xhtml
b/c.xhtml -> a/b/c.xhtml
c.xhtml -> a/b/c.xhtml
Used to find full path to file that is parsed from tag link
:param cur_file_path: path to current file with tag link
:param href_in_link: filename got from tag link, like file1.xhtml
:param internal_link_tag: tag object that is parsed now
:return:
"""
dir_name = os.path.dirname(cur_file_path)
normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/') normed_path = os.path.normpath(os.path.join(dir_name, href_in_link)).replace('\\', '/')
full_path = [path for path in self.added_to_toc_hrefs if normed_path in path] full_path = [path for path in self.added_to_toc_hrefs if normed_path in path]
if not full_path: if not full_path:
self.logger.log(f'Error in {href} file. No {normed_path} file found in added to TOC documents. ' self.logger.log(f'Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. '
f'While processing href in {internal_link_tag}.') f'While processing href in {internal_link_tag}.')
internal_link_tag.attrs['converter-mark'] = 'bad-link' internal_link_tag.attrs['converter-mark'] = 'bad-link'
return None return None
if len(full_path) > 1: if len(full_path) > 1:
self.logger.log(f'Warning in {href}. Multiple paths found {full_path} for file {href_in_link}' self.logger.log(f'Warning in {cur_file_path}. Multiple paths found {full_path} for file {href_in_link}'
f' while {internal_link_tag} processing. The first one will be chosen.') f' while {internal_link_tag} processing. The first one will be chosen.')
return full_path[0] return full_path[0]
@@ -272,7 +298,7 @@ class EpubConverter:
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg1}):
a_tag_href = internal_link_tag.attrs['href'] a_tag_href = internal_link_tag.attrs['href']
# find full path # find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag)
if not a_tag_href_matched_to_toc: if not a_tag_href_matched_to_toc:
continue continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, '') new_id = self._create_unique_id(a_tag_href_matched_to_toc, '')
@@ -291,9 +317,12 @@ class EpubConverter:
soup = self.href2soup_html[toc_href] soup = self.href2soup_html[toc_href]
for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}): for internal_link_tag in soup.find_all('a', {'href': internal_link_reg2}):
a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#') a_tag_href, a_tag_id = internal_link_tag.attrs['href'].split('#')
a_tag_href = a_tag_href or toc_href
# find full path # find full path
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(toc_href, a_tag_href, internal_link_tag) if a_tag_href:
a_tag_href_matched_to_toc = self._match_href_to_path_from_toc(toc_href, a_tag_href,
internal_link_tag)
else:
a_tag_href_matched_to_toc = os.path.normpath(toc_href).replace('\\', '/')
if not a_tag_href_matched_to_toc: if not a_tag_href_matched_to_toc:
continue continue
new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id) new_id = self._create_unique_id(a_tag_href_matched_to_toc, a_tag_id)
@@ -326,7 +355,7 @@ class EpubConverter:
f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.' f' Should be anchor with new id={new_id} in {a_tag_href_matched_to_toc} file.'
f' Old id={a_tag_id}') f' Old id={a_tag_id}')
def build_one_chapter(self, node): def build_one_chapter(self, nav_point):
""" """
Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object) Updates self.href_chapter_id2soup_html (mapping from (href,id) to chapter content/html soup object)
@@ -339,34 +368,34 @@ class EpubConverter:
and id of the next chapter/subchapter and id of the next chapter/subchapter
""" """
if node.id: if nav_point.id:
soup = self.href2soup_html[node.href] soup = self.href2soup_html[nav_point.href]
chapter_tags = get_tags_between_chapter_marks(first_id=node.id, href=node.href, html_soup=soup) chapter_tags = get_tags_between_chapter_marks(first_id=nav_point.id, href=nav_point.href, html_soup=soup)
new_tree = BeautifulSoup('', 'html.parser') new_tree = BeautifulSoup('', 'html.parser')
for tag in chapter_tags: for tag in chapter_tags:
new_tree.append(tag) new_tree.append(tag)
self.href_chapter_id2soup_html[(node.href, node.id)] = new_tree self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)] = new_tree
if self.adjacency_list.get(node): if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[node]: for sub_node in self.adjacency_list[nav_point]:
self.build_one_chapter(sub_node) self.build_one_chapter(sub_node)
def define_chapters_content(self): def define_chapters_content(self):
nav_points = self.adjacency_list[-1] top_level_nav_points = self.adjacency_list[-1]
if self.id_anchor_exist_in_nav_points: if self.id_anchor_exist_in_nav_points:
for point in nav_points: for point in top_level_nav_points:
self.build_one_chapter(point) self.build_one_chapter(point)
def node2livecarta_chapter_item(self, node: NavPoint, lvl=1) -> ChapterItem: def node2livecarta_chapter_item(self, nav_point: NavPoint, lvl=1) -> ChapterItem:
title = node.title title = nav_point.title
if node.id: if nav_point.id:
content: BeautifulSoup = self.href_chapter_id2soup_html[(node.href, node.id)] content: BeautifulSoup = self.href_chapter_id2soup_html[(nav_point.href, nav_point.id)]
else: else:
content: BeautifulSoup = self.href2soup_html[node.href] content: BeautifulSoup = self.href2soup_html[nav_point.href]
self.old_image_path2_aws_path = update_src_links_in_images(content, self.old_image_path2_aws_path = update_src_links_in_images(content,
self.href2img_bytes, self.href2img_bytes,
path_to_html=node.href, path_to_html=nav_point.href,
access=self.access, access=self.access,
path2aws_path=self.old_image_path2_aws_path) path2aws_path=self.old_image_path2_aws_path)
@@ -376,8 +405,8 @@ class EpubConverter:
sub_nodes = [] sub_nodes = []
# warning! not EpubHtmlItems won't be added to chapter # warning! not EpubHtmlItems won't be added to chapter
if self.adjacency_list.get(node): if self.adjacency_list.get(nav_point):
for sub_node in self.adjacency_list[node]: for sub_node in self.adjacency_list[nav_point]:
sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1) sub_chapter_item = self.node2livecarta_chapter_item(sub_node, lvl + 1)
sub_nodes.append(sub_chapter_item) sub_nodes.append(sub_chapter_item)

View File

@@ -1,5 +1,5 @@
from epub_converter import EpubConverter from epub_converter import EpubConverter
from src.book_solver import BookSolver from solver import BookSolver
class EpubBook(BookSolver): class EpubBook(BookSolver):

View File

@@ -8,7 +8,7 @@ from typing import List
from bs4 import BeautifulSoup, NavigableString, Tag from bs4 import BeautifulSoup, NavigableString, Tag
from livecarta_config import LawCartaConfig from livecarta_config import LawCartaConfig
from src.util.helpers import BookLogger, BookStatusWrapper from util.helpers import BookLogger, BookStatusWrapper
class HTMLDocxPreprocessor: class HTMLDocxPreprocessor:

139
src/solver.py Normal file
View File

@@ -0,0 +1,139 @@
""" This is Main Abstract class for solving a task of a book conversion
Given the id of an incoming book, it gets the book from the server and runs the conversion.
In parallel, it updates the status of the book conversion on the admin panel.
Finally, it sends the result to the server.
The result is a JSON document; its schema is in book_schema.json.
"""
import codecs
import json
import logging
import os
import pathlib
from abc import abstractmethod, ABCMeta
from livecarta_config import LawCartaConfig
from util.helpers import BookLogger, BookStatusWrapper
class BookSolver(metaclass=ABCMeta):
    """Abstract base class that drives the conversion of one book to JSON.

    The class downloads the book through ``access``, stores it on disk,
    asks the subclass for the converted content (``get_converted_book``),
    writes that content to a ``.json`` file and sends it back through
    ``access``. Progress is reported through ``status_wrapper`` and
    ``logger_object``.

    ``metaclass=ABCMeta`` is used instead of the Python 2 style
    ``__metaclass__`` attribute, which Python 3 silently ignores; without it
    ``@abstractmethod`` on ``get_converted_book`` would not be enforced.
    """

    def __init__(self, book_id=0, access=None, main_logger=None,
                 logging_format='%(asctime)s - %(levelname)s - %(message)s'):
        """Set up per-book state, the book logger and the status wrapper.

        :param book_id: id of the book; used in file names and log names.
        :param access: object used to talk to the server (this class calls
            ``url``, ``get_doc`` and ``send_book`` on it).
        :param main_logger: passed through to ``BookLogger``.
        :param logging_format: passed through to ``BookLogger``.
        """
        self.book_type = None
        self.book_id = book_id
        self.access = access
        self.file_path = None  # path to book file, appears after downloading from server
        self.output_path = None  # path to json file
        self.logger_object = BookLogger(name=f'{__name__}_{self.book_id}',
                                        logging_format=logging_format,
                                        book_id=book_id,
                                        main_logger=main_logger)
        self.status_wrapper = BookStatusWrapper(access, self.logger_object, book_id)

        # Sanity check of the configuration the converters rely on.
        assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
            "Length of headers doesn't match allowed levels."

    def save_book_file(self, content):
        """
        Save binary content of file to .docx/.epub.

        The file goes to ``<parent of this module's directory>/<book_type>/<book_id>/``
        and ``self.file_path`` is set to the saved file on success.

        :param content: binary content of the file.
        :raises Exception: any error raised while writing is logged and re-raised.
        """
        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        folder_path = os.path.join(folder_path, f'{self.book_type}/{self.book_id}')
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
        try:
            with open(file_path, 'wb') as file:
                file.write(content)
            self.logger_object.log(f'File was saved to folder: {folder_path}.')
        except Exception:
            self.logger_object.log(f"Error in writing {self.book_type} file.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise  # bare raise keeps the original traceback intact

        self.file_path = pathlib.Path(file_path)

    def get_book_file(self):
        """
        Method for getting and saving book from server.

        :raises FileNotFoundError: logged to the book and main logs, then re-raised.
            Any other exception propagates unchanged.
        """
        try:
            self.logger_object.log(f'Start receiving file from server. URL: {self.access.url}/doc-convert/{self.book_id}/file')
            content = self.access.get_doc(self.book_id)
            self.logger_object.log('File was received from server.')
            self.save_book_file(content)
        except FileNotFoundError:
            self.logger_object.log("Can't get docx from server.", logging.ERROR)
            self.logger_object.log_error_to_main_log()
            raise

    def check_output_directory(self):
        """Make sure ``self.output_path`` is a ``pathlib.Path`` to an existing file.

        When no output path was set, it defaults to
        ``<parent of this module's directory>/json/<book_id>.json``.
        Missing parent directories and the file itself are created.
        """
        if self.output_path is None:
            folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            self.output_path = os.path.join(folder_path, f'json/{self.book_id}.json')

        # Normalise to Path so a caller-supplied string path works below too.
        self.output_path = pathlib.Path(self.output_path)
        self.logger_object.log(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        self.output_path.touch(exist_ok=True)

    def write_to_json(self, content: dict):
        """Dump ``content`` as UTF-8 JSON to ``self.output_path``.

        Writing is best effort: a failure is logged at ERROR level and is
        NOT re-raised, so the caller can still send the content to the server.

        :param content: converted book, must be JSON serializable.
        """
        self.check_output_directory()
        try:
            with open(self.output_path, 'w', encoding='utf-8') as f:
                json.dump(content, f, ensure_ascii=False)
            self.logger_object.log(f'Data has been saved to .json file: {self.output_path}')
        except Exception as exc:
            self.logger_object.log('Error has occurred while writing json file.' + str(exc), logging.ERROR)

    def send_json_content_to_server(self, content: dict):
        """Send the converted book to the server through ``self.access``.

        :param content: converted book.
        :raises Exception: any error is logged, the status is set to error
            and the exception is re-raised.
        """
        try:
            self.access.send_book(self.book_id, content)
            self.logger_object.log('JSON data has been sent to server.')
        except Exception:
            self.logger_object.log('Error has occurred while sending json content.', logging.ERROR)
            self.logger_object.log_error_to_main_log()
            self.status_wrapper.set_error()
            raise

    @abstractmethod
    def get_converted_book(self):
        """Convert the book at ``self.file_path`` and return the result.

        Subclasses must override this method. The base implementation only
        logs the start of JSON generation, calls
        ``status_wrapper.set_generating()`` and returns an empty dict; it can
        be reused through ``super().get_converted_book()``.
        """
        self.logger_object.log('Beginning of processing json output.')
        self.status_wrapper.set_generating()
        return {}

    def test_conversion(self):
        """Convert a local file without any server communication.

        The book is expected at
        ``<parent of this module's directory>/<book_type>/<book_id>.<book_type>``;
        the result is only written to the json file.
        """
        self.logger_object.log('Beginning of the test.')

        folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        folder_path = os.path.join(folder_path, f'{self.book_type}')
        file_path = os.path.join(folder_path, f'{self.book_id}.{self.book_type}')
        self.file_path = pathlib.Path(file_path)
        self.logger_object.log(f'Test on {self.book_type}: {self.file_path}')

        content_dict = self.get_converted_book()
        self.write_to_json(content_dict)
        self.logger_object.log('End of the test.')

    def conversion(self):
        """Run the full pipeline: download, convert, save json, send to server.

        :raises Exception: on any failure the status is set to error, the
            error is logged to the book and main logs and re-raised.
        """
        try:
            self.logger_object.log(f'Beginning of conversion from .{self.book_type} to .json.')
            self.get_book_file()
            self.status_wrapper.set_processing()
            content_dict = self.get_converted_book()
            self.write_to_json(content_dict)
            self.send_json_content_to_server(content_dict)
            self.logger_object.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
        except Exception as exc:
            self.status_wrapper.set_error()
            self.logger_object.log('Error has occurred while conversion.', logging.ERROR)
            self.logger_object.log_error_to_main_log(str(exc))
            raise