forked from LiveCarta/BookConverter
Formatting: documentation + optimization
This commit is contained in:
@@ -13,8 +13,8 @@ class BookSolver:
|
|||||||
"""
|
"""
|
||||||
This is Main Abstract class for solving a task of a book conversion
|
This is Main Abstract class for solving a task of a book conversion
|
||||||
Having an id of coming book, gets book from server, runs conversion.
|
Having an id of coming book, gets book from server, runs conversion.
|
||||||
In parallel it updates status of a book conversion on admin panel.
|
In parallel, it updates status of a book conversion on admin panel.
|
||||||
Finally sends result to server.
|
Finally, sends result to server.
|
||||||
Result is a json, JSON schema in book_schema.json
|
Result is a json, JSON schema in book_schema.json
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -35,12 +35,12 @@ class BookSolver:
|
|||||||
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
|
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
|
||||||
"Length of headers doesn't match allowed levels."
|
"Length of headers doesn't match allowed levels."
|
||||||
|
|
||||||
def save_book_file(self, content: str):
|
def save_book_file(self, content: bytes):
|
||||||
"""
|
"""
|
||||||
Function saves binary content of file to .docx/.epub
|
Function saves binary content of file to .docx/.epub
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
content: str
|
content: bytes str
|
||||||
binary content of the file
|
binary content of the file
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -116,8 +116,8 @@ class HTMLDocxPreprocessor:
|
|||||||
|
|
||||||
if face is not None:
|
if face is not None:
|
||||||
face = re.sub(r",[\w,\- ]*$", "", face)
|
face = re.sub(r",[\w,\- ]*$", "", face)
|
||||||
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face):
|
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(face):
|
||||||
font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face]
|
font.attrs["face"] = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE[face]
|
||||||
else:
|
else:
|
||||||
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
|
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
|
||||||
|
|
||||||
@@ -137,11 +137,11 @@ class HTMLDocxPreprocessor:
|
|||||||
def clean_trash(self):
|
def clean_trash(self):
|
||||||
"""Function to remove all styles and tags we don't need."""
|
"""Function to remove all styles and tags we don't need."""
|
||||||
self._clean_tag('span', 'style', re.compile(
|
self._clean_tag('span', 'style', re.compile(
|
||||||
r'^background: #[0-9a-fA-F]{6}$'))
|
r'^background: #[\da-fA-F]{6}$'))
|
||||||
# todo: check for another languages
|
# todo: check for another languages
|
||||||
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
|
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
|
||||||
self._clean_tag('span', 'style', re.compile(
|
self._clean_tag('span', 'style', re.compile(
|
||||||
'^letter-spacing: -?[\d\.]+pt$'))
|
'^letter-spacing: -?[\d.]+pt$'))
|
||||||
|
|
||||||
self._clean_tag('font', 'face', re.compile(
|
self._clean_tag('font', 'face', re.compile(
|
||||||
r'^Times New Roman[\w, ]+$'))
|
r'^Times New Roman[\w, ]+$'))
|
||||||
@@ -179,13 +179,13 @@ class HTMLDocxPreprocessor:
|
|||||||
style = p.get('style')
|
style = p.get('style')
|
||||||
|
|
||||||
if style:
|
if style:
|
||||||
indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
|
indent = re.search(r'text-indent: ([\d.]{1,4})in', style)
|
||||||
margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
|
margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style)
|
||||||
margin_right = re.search(
|
margin_right = re.search(
|
||||||
r'margin-right: ([\d\.]{1,4})in', style)
|
r'margin-right: ([\d.]{1,4})in', style)
|
||||||
margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
|
margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style)
|
||||||
margin_bottom = re.search(
|
margin_bottom = re.search(
|
||||||
r'margin-bottom: ([\d\.]{1,4})in', style)
|
r'margin-bottom: ([\d.]{1,4})in', style)
|
||||||
else:
|
else:
|
||||||
indent = None
|
indent = None
|
||||||
margin_left = None
|
margin_left = None
|
||||||
@@ -517,7 +517,7 @@ class HTMLDocxPreprocessor:
|
|||||||
Function for gathering info about top-level chapters.
|
Function for gathering info about top-level chapters.
|
||||||
|
|
||||||
Assume:
|
Assume:
|
||||||
- Headers with smallest outline(or digit in <h>) are top level chapters.
|
- Headers with the smallest outline(or digit in <h>) are top level chapters.
|
||||||
[ It is consistent with a recursive algorithm
|
[ It is consistent with a recursive algorithm
|
||||||
for saving content to a resulted json structure,
|
for saving content to a resulted json structure,
|
||||||
which happens in header_to_json()]
|
which happens in header_to_json()]
|
||||||
@@ -560,7 +560,7 @@ class HTMLDocxPreprocessor:
|
|||||||
|
|
||||||
Assume header(s) to be introduction if:
|
Assume header(s) to be introduction if:
|
||||||
1. one header not numbered, before 1 numbered header
|
1. one header not numbered, before 1 numbered header
|
||||||
2. it is first header from the top level list and it equals to 'introductio
|
2. it is first header from the top level list, and it equals to 'introduction'
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
None
|
None
|
||||||
@@ -665,10 +665,6 @@ class HTMLDocxPreprocessor:
|
|||||||
Function
|
Function
|
||||||
- process tags <li>.
|
- process tags <li>.
|
||||||
- unwrap <p> tags.
|
- unwrap <p> tags.
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
body_tag: Tag, soup object
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
None
|
None
|
||||||
|
|||||||
@@ -75,8 +75,8 @@ def _preprocess_table(body_tag: BeautifulSoup):
|
|||||||
"""Function to preprocess tables and tags(td|th|tr): style"""
|
"""Function to preprocess tables and tags(td|th|tr): style"""
|
||||||
tables = body_tag.find_all("table")
|
tables = body_tag.find_all("table")
|
||||||
for table in tables:
|
for table in tables:
|
||||||
ts = table.find_all(re.compile("td|th|tr"))
|
t_tags = table.find_all(re.compile("td|th|tr"))
|
||||||
for t_tag in ts:
|
for t_tag in t_tags:
|
||||||
style = t_tag.get('style')
|
style = t_tag.get('style')
|
||||||
width = ''
|
width = ''
|
||||||
if style:
|
if style:
|
||||||
@@ -113,7 +113,6 @@ def _process_lists(body_tag: BeautifulSoup):
|
|||||||
None
|
None
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
li_tags = body_tag.find_all("li")
|
li_tags = body_tag.find_all("li")
|
||||||
for li_tag in li_tags:
|
for li_tag in li_tags:
|
||||||
if li_tag.p:
|
if li_tag.p:
|
||||||
@@ -268,7 +267,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
|
|||||||
return footnotes, new_noterefs_tags, new_footnotes_tags
|
return footnotes, new_noterefs_tags, new_footnotes_tags
|
||||||
|
|
||||||
|
|
||||||
def unwrap_structural_tags(body_tag: BeautifulSoup):
|
def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
|
||||||
"""
|
"""
|
||||||
Main function that works with structure of html. Make changes inplace.
|
Main function that works with structure of html. Make changes inplace.
|
||||||
Parameters
|
Parameters
|
||||||
@@ -288,10 +287,10 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
None
|
body_tag: Tag, BeautifulSoup
|
||||||
|
adjusted body_tag
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _preserve_class_in_aside_tag(tag_):
|
def _preserve_class_in_aside_tag(tag_):
|
||||||
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
|
||||||
# this is for Wiley books with boxes
|
# this is for Wiley books with boxes
|
||||||
@@ -311,7 +310,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
None
|
bool
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# this is for Wiley books with boxes
|
# this is for Wiley books with boxes
|
||||||
@@ -454,21 +453,19 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
|
|||||||
tag = body_tag.new_tag('p')
|
tag = body_tag.new_tag('p')
|
||||||
tag.append(str(node))
|
tag.append(str(node))
|
||||||
node.replace_with(tag)
|
node.replace_with(tag)
|
||||||
|
|
||||||
return body_tag
|
return body_tag
|
||||||
|
|
||||||
|
|
||||||
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
|
||||||
"""After processing on a first_id that corresponds to current chapter,
|
"""After processing on a first_id that corresponds to current chapter,
|
||||||
from initial html_soup all tags from current chapter are extracted
|
from initial html_soup all tags from current chapter are extracted
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
first_id:
|
first_id:
|
||||||
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
Id that point where a chapter starts. A Tag with class: 'converter-chapter-mark'
|
||||||
href:
|
href:
|
||||||
Name of current chapter's file
|
Name of current chapter's file
|
||||||
html_soup: Tag, soup object
|
html_soup: Tag
|
||||||
Soup object of current file
|
Soup object of current file
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@@ -530,19 +527,17 @@ def _clean_wiley_block(block):
|
|||||||
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
|
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_block_tags(chapter_tag):
|
def _preprocess_block_tags(chapter_tag: Tag):
|
||||||
"""Function preprocessing <block> tags"""
|
"""Function preprocessing <block> tags"""
|
||||||
for block in chapter_tag.find_all("blockquote"):
|
for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
|
||||||
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
|
_clean_wiley_block(block)
|
||||||
_clean_wiley_block(block)
|
color = '#DDDDDD' if block.attrs.get(
|
||||||
|
'class') == 'feature1' else None
|
||||||
color = '#DDDDDD' if block.attrs.get(
|
color = '#EEEEEE' if block.attrs.get(
|
||||||
'class') == 'feature1' else None
|
'class') == 'feature2' else color
|
||||||
color = '#EEEEEE' if block.attrs.get(
|
_wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
|
||||||
'class') == 'feature2' else color
|
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
|
||||||
_wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
|
block.unwrap()
|
||||||
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
|
|
||||||
block.unwrap()
|
|
||||||
|
|
||||||
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
|
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
|
||||||
_clean_wiley_block(future_block)
|
_clean_wiley_block(future_block)
|
||||||
@@ -647,8 +642,7 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup):
|
|||||||
code.name = "span"
|
code.name = "span"
|
||||||
if code.parent.name == "pre":
|
if code.parent.name == "pre":
|
||||||
continue
|
continue
|
||||||
|
# if tags aren't in pre and don't have style
|
||||||
# if tags aren't in pre
|
|
||||||
if not code.attrs.get('style'):
|
if not code.attrs.get('style'):
|
||||||
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user