forked from LiveCarta/BookConverter
add color style processing
delete the TOC at the end of the whole HTML parsing; fix searching for <a> in footnotes
This commit is contained in:
@@ -32,7 +32,7 @@ class Book:
|
|||||||
main_logger=main_logger)
|
main_logger=main_logger)
|
||||||
self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)
|
self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)
|
||||||
|
|
||||||
assert BookConfig.SUPPORTED_LEVELS == len(BookConfig.SUPPORTED_HEADERS), \
|
assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
|
||||||
"Length of headers doesn't match allowed levels."
|
"Length of headers doesn't match allowed levels."
|
||||||
|
|
||||||
def save_docx(self, content):
|
def save_docx(self, content):
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import re
|
|||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
from config import BookConfig, BookLogger, BookApiWrapper
|
from config import LawCartaConfig, BookLogger, BookApiWrapper
|
||||||
|
|
||||||
|
|
||||||
class HTMLPreprocessor:
|
class HTMLPreprocessor:
|
||||||
@@ -49,8 +49,8 @@ class HTMLPreprocessor:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def convert_pt_to_px(cls, value):
|
def convert_pt_to_px(cls, value):
|
||||||
value = int(value)
|
value = int(value)
|
||||||
if value == BookConfig.WORD_DEFAULT_FONT_SIZE:
|
if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
|
||||||
return BookConfig.LAWCARTA_DEFAULT_FONT_SIZE
|
return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
|
||||||
else:
|
else:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ class HTMLPreprocessor:
|
|||||||
size = size.group(1)
|
size = size.group(1)
|
||||||
new_size = cls.convert_pt_to_px(size)
|
new_size = cls.convert_pt_to_px(size)
|
||||||
|
|
||||||
if new_size == BookConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
return re.sub(size + "pt", str(new_size) + "px", style)
|
return re.sub(size + "pt", str(new_size) + "px", style)
|
||||||
@@ -83,41 +83,39 @@ class HTMLPreprocessor:
|
|||||||
for font in fonts:
|
for font in fonts:
|
||||||
face = font.get("face")
|
face = font.get("face")
|
||||||
style = font.get("style")
|
style = font.get("style")
|
||||||
|
color = font.get("color")
|
||||||
|
|
||||||
font.attrs = {}
|
font.attrs = {}
|
||||||
font.name = "span"
|
font.name = "span"
|
||||||
if style:
|
if style:
|
||||||
style = self.convert_font_pt_to_px(style)
|
style = self.convert_font_pt_to_px(style)
|
||||||
if style != "":
|
if style != "":
|
||||||
|
if color and color != '#000000':
|
||||||
|
style += f'; color: {color};'
|
||||||
font.attrs["style"] = style
|
font.attrs["style"] = style
|
||||||
|
elif color and color != '#000000':
|
||||||
|
font.attrs["style"] = f'color: {color};'
|
||||||
|
|
||||||
if face is not None:
|
if face is not None:
|
||||||
face = re.sub(r",[\w,\- ]*$", "", face)
|
face = re.sub(r",[\w,\- ]*$", "", face)
|
||||||
if face != BookConfig.DEFAULT_FONT_NAME and BookConfig.font_correspondence_table.get(face):
|
if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
|
||||||
font.attrs["face"] = BookConfig.font_correspondence_table[face]
|
font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
|
||||||
else:
|
else:
|
||||||
font.attrs["face"] = BookConfig.DEFAULT_FONT_NAME
|
font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
|
||||||
|
|
||||||
if len(font.attrs) == 0:
|
if len(font.attrs) == 0:
|
||||||
font.unwrap()
|
font.unwrap()
|
||||||
|
|
||||||
assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags
|
assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags
|
||||||
|
|
||||||
def _remove_table_of_contents(self):
|
def delete_content_before_toc(self):
|
||||||
"""
|
# replace toc with empty <TOC> tag
|
||||||
Function to remove table of content from file.
|
|
||||||
"""
|
|
||||||
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
|
|
||||||
for table in tables:
|
|
||||||
table.decompose()
|
|
||||||
|
|
||||||
def _change_table_of_contents(self):
|
|
||||||
self._change_table_of_contents()
|
|
||||||
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
|
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
|
||||||
for table in tables:
|
for table in tables:
|
||||||
table.wrap(self.html_soup.new_tag("TOC"))
|
table.wrap(self.html_soup.new_tag("TOC"))
|
||||||
table.decompose()
|
table.decompose()
|
||||||
|
|
||||||
def delete_content_before_toc(self):
|
# remove all tag upper the <TOC>
|
||||||
toc_tag = self.html_soup.new_tag('TOC')
|
toc_tag = self.html_soup.new_tag('TOC')
|
||||||
if toc_tag in self.content:
|
if toc_tag in self.content:
|
||||||
ind = self.content.index(toc_tag) + 1
|
ind = self.content.index(toc_tag) + 1
|
||||||
@@ -131,14 +129,12 @@ class HTMLPreprocessor:
|
|||||||
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
|
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
|
||||||
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
|
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
|
||||||
|
|
||||||
self._clean_tag('font', 'color', re.compile(r'^#[0-9a-fA-F]{6}$'))
|
|
||||||
self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
|
self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
|
||||||
|
|
||||||
self._clean_tag("a", "name", "_GoBack")
|
self._clean_tag("a", "name", "_GoBack")
|
||||||
self._clean_underline_links()
|
self._clean_underline_links()
|
||||||
|
|
||||||
self._font_to_span()
|
self._font_to_span()
|
||||||
# self._remove_table_of_contents()
|
|
||||||
|
|
||||||
def _process_paragraph(self):
|
def _process_paragraph(self):
|
||||||
"""
|
"""
|
||||||
@@ -178,7 +174,7 @@ class HTMLPreprocessor:
|
|||||||
p.attrs = {}
|
p.attrs = {}
|
||||||
style = ''
|
style = ''
|
||||||
|
|
||||||
if align is not None and align != BookConfig.DEFAULT_ALIGN_STYLE:
|
if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
|
||||||
style += f'text-align: {align};'
|
style += f'text-align: {align};'
|
||||||
|
|
||||||
if indent is not None:
|
if indent is not None:
|
||||||
@@ -280,10 +276,6 @@ class HTMLPreprocessor:
|
|||||||
tag.string = tag.text.replace('\u200c', '')
|
tag.string = tag.text.replace('\u200c', '')
|
||||||
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
|
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
|
||||||
|
|
||||||
# %E2%80%8C
|
|
||||||
for tag in a_tags_with_href:
|
|
||||||
print(tag)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _clean_footnote_content(content):
|
def _clean_footnote_content(content):
|
||||||
content = content.strip()
|
content = content.strip()
|
||||||
@@ -303,7 +295,8 @@ class HTMLPreprocessor:
|
|||||||
footnotes = []
|
footnotes = []
|
||||||
|
|
||||||
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
|
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
|
||||||
true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'})
|
true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0]
|
||||||
|
|
||||||
if true_a_tag.attrs.get('href') is None:
|
if true_a_tag.attrs.get('href') is None:
|
||||||
cont_tag.a.decompose()
|
cont_tag.a.decompose()
|
||||||
continue
|
continue
|
||||||
@@ -439,7 +432,7 @@ class HTMLPreprocessor:
|
|||||||
"""
|
"""
|
||||||
Function to convert all lower level headings to p tags
|
Function to convert all lower level headings to p tags
|
||||||
"""
|
"""
|
||||||
pattern = f'^h[{BookConfig.SUPPORTED_LEVELS + 1}-9]$'
|
pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
|
||||||
header_tags = self.body_tag.find_all(re.compile(pattern))
|
header_tags = self.body_tag.find_all(re.compile(pattern))
|
||||||
for tag in header_tags:
|
for tag in header_tags:
|
||||||
tag.name = 'p'
|
tag.name = 'p'
|
||||||
@@ -527,8 +520,8 @@ class HTMLPreprocessor:
|
|||||||
if title == "":
|
if title == "":
|
||||||
tag.unwrap()
|
tag.unwrap()
|
||||||
else:
|
else:
|
||||||
assert tag.name in BookConfig.SUPPORTED_HEADERS, \
|
assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
|
||||||
f'Preprocessing went wrong, there is still h{BookConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
|
||||||
# if tag.name in ["h4", "h5", "h6"]:
|
# if tag.name in ["h4", "h5", "h6"]:
|
||||||
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import codecs
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from config import BookConfig
|
from src.config import LawCartaConfig
|
||||||
|
|
||||||
|
|
||||||
class JSONConverter:
|
class JSONConverter:
|
||||||
@@ -34,7 +34,7 @@ class JSONConverter:
|
|||||||
|
|
||||||
:param ind: Index of header in content list.
|
:param ind: Index of header in content list.
|
||||||
"""
|
"""
|
||||||
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||||
title = self.content[ind].text
|
title = self.content[ind].text
|
||||||
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
|
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
|
||||||
result = {
|
result = {
|
||||||
@@ -47,7 +47,7 @@ class JSONConverter:
|
|||||||
|
|
||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
# 1. next tag is a header
|
# 1. next tag is a header
|
||||||
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||||
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
outline = int(re.sub(r"^h", "", self.content[ind].name))
|
||||||
# - recursion step until h_i > h_initial
|
# - recursion step until h_i > h_initial
|
||||||
if outline > curr_outline:
|
if outline > curr_outline:
|
||||||
@@ -100,13 +100,13 @@ class JSONConverter:
|
|||||||
while ind < len(self.content):
|
while ind < len(self.content):
|
||||||
res = {}
|
res = {}
|
||||||
|
|
||||||
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS:
|
if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
|
||||||
res, ind = self.header_to_json(ind)
|
res, ind = self.header_to_json(ind)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
chapter_title = f'Untitled chapter {ch_num}'
|
chapter_title = f'Untitled chapter {ch_num}'
|
||||||
chapter = []
|
chapter = []
|
||||||
while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS:
|
while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
|
||||||
if not self._is_empty_p_tag(self.content[ind]):
|
if not self._is_empty_p_tag(self.content[ind]):
|
||||||
chapter.append(self.format_html(str(self.content[ind])))
|
chapter.append(self.format_html(str(self.content[ind])))
|
||||||
ind += 1
|
ind += 1
|
||||||
|
|||||||
Reference in New Issue
Block a user