Formatting: documentation + optimization

This commit is contained in:
Kiryl
2022-06-06 16:37:42 +03:00
parent 002316f086
commit acb2ce48c2
3 changed files with 33 additions and 43 deletions

View File

@@ -13,8 +13,8 @@ class BookSolver:
"""
This is the main abstract class for solving the task of book conversion.
Given the id of an incoming book, it gets the book from the server and runs the conversion.
In parallel it updates status of a book conversion on admin panel.
Finally sends result to server.
In parallel, it updates status of a book conversion on admin panel.
Finally, sends result to server.
Result is a json, JSON schema in book_schema.json
"""
@@ -35,12 +35,12 @@ class BookSolver:
assert LiveCartaConfig.SUPPORTED_LEVELS == len(LiveCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels."
def save_book_file(self, content: str):
def save_book_file(self, content: bytes):
"""
Function saves binary content of file to .docx/.epub
Parameters
----------
content: str
content: bytes
binary content of the file
"""

View File

@@ -116,8 +116,8 @@ class HTMLDocxPreprocessor:
if face is not None:
face = re.sub(r",[\w,\- ]*$", "", face)
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.font_correspondence_table.get(face):
font.attrs["face"] = LiveCartaConfig.font_correspondence_table[face]
if face != LiveCartaConfig.DEFAULT_FONT_NAME and LiveCartaConfig.FONT_CORRESPONDANCE_TABLE.get(face):
font.attrs["face"] = LiveCartaConfig.FONT_CORRESPONDANCE_TABLE[face]
else:
font.attrs["face"] = LiveCartaConfig.DEFAULT_FONT_NAME
@@ -137,11 +137,11 @@ class HTMLDocxPreprocessor:
def clean_trash(self):
"""Function to remove all styles and tags we don't need."""
self._clean_tag('span', 'style', re.compile(
r'^background: #[0-9a-fA-F]{6}$'))
r'^background: #[\da-fA-F]{6}$'))
# todo: check for other languages
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))
self._clean_tag('span', 'style', re.compile(
'^letter-spacing: -?[\d\.]+pt$'))
'^letter-spacing: -?[\d.]+pt$'))
self._clean_tag('font', 'face', re.compile(
r'^Times New Roman[\w, ]+$'))
@@ -179,13 +179,13 @@ class HTMLDocxPreprocessor:
style = p.get('style')
if style:
indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
indent = re.search(r'text-indent: ([\d.]{1,4})in', style)
margin_left = re.search(r'margin-left: ([\d.]{1,4})in', style)
margin_right = re.search(
r'margin-right: ([\d\.]{1,4})in', style)
margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
r'margin-right: ([\d.]{1,4})in', style)
margin_top = re.search(r'margin-top: ([\d.]{1,4})in', style)
margin_bottom = re.search(
r'margin-bottom: ([\d\.]{1,4})in', style)
r'margin-bottom: ([\d.]{1,4})in', style)
else:
indent = None
margin_left = None
@@ -517,7 +517,7 @@ class HTMLDocxPreprocessor:
Function for gathering info about top-level chapters.
Assume:
- Headers with smallest outline(or digit in <h>) are top level chapters.
- Headers with the smallest outline(or digit in <h>) are top level chapters.
[ It is consistent with a recursive algorithm
for saving content to a resulted json structure,
which happens in header_to_json()]
@@ -560,7 +560,7 @@ class HTMLDocxPreprocessor:
Assume header(s) to be introduction if:
1. one header not numbered, before 1 numbered header
2. it is first header from the top level list and it equals to 'introductio
2. it is the first header in the top level list, and it equals 'introduction'
Returns
-------
None
@@ -665,10 +665,6 @@ class HTMLDocxPreprocessor:
Function
- process tags <li>.
- unwrap <p> tags.
Parameters
----------
body_tag: Tag, soup object
Returns
-------
None

View File

@@ -75,8 +75,8 @@ def _preprocess_table(body_tag: BeautifulSoup):
"""Function to preprocess tables and tags(td|th|tr): style"""
tables = body_tag.find_all("table")
for table in tables:
ts = table.find_all(re.compile("td|th|tr"))
for t_tag in ts:
t_tags = table.find_all(re.compile("td|th|tr"))
for t_tag in t_tags:
style = t_tag.get('style')
width = ''
if style:
@@ -113,7 +113,6 @@ def _process_lists(body_tag: BeautifulSoup):
None
"""
li_tags = body_tag.find_all("li")
for li_tag in li_tags:
if li_tag.p:
@@ -268,7 +267,7 @@ def preprocess_footnotes(source_html_tag: Tag, href2soup_html: dict = None, note
return footnotes, new_noterefs_tags, new_footnotes_tags
def unwrap_structural_tags(body_tag: BeautifulSoup):
def unwrap_structural_tags(body_tag: BeautifulSoup) -> BeautifulSoup:
"""
Main function that works with structure of html. Make changes inplace.
Parameters
@@ -288,10 +287,10 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
Returns
-------
None
body_tag: Tag, BeautifulSoup
adjusted body_tag
"""
def _preserve_class_in_aside_tag(tag_):
"""to save css style inherited from class, copy class to aside tag (which is parent to tag_)"""
# this is for Wiley books with boxes
@@ -311,7 +310,7 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
Returns
-------
None
bool
"""
# this is for Wiley books with boxes
@@ -454,21 +453,19 @@ def unwrap_structural_tags(body_tag: BeautifulSoup):
tag = body_tag.new_tag('p')
tag.append(str(node))
node.replace_with(tag)
return body_tag
def get_tags_between_chapter_marks(first_id: str, href: str, html_soup: BeautifulSoup) -> list:
"""After processing on a first_id that corresponds to current chapter,
all tags of the current chapter are extracted from the initial html_soup
Parameters
----------
first_id:
Id that points to where a chapter starts. A Tag with class: 'converter-chapter-mark'
href:
Name of current chapter's file
html_soup: Tag, soup object
html_soup: Tag
Soup object of current file
Returns
@@ -530,19 +527,17 @@ def _clean_wiley_block(block):
h.insert_before(BeautifulSoup(features='lxml').new_tag("br"))
def _preprocess_block_tags(chapter_tag):
def _preprocess_block_tags(chapter_tag: Tag):
"""Function preprocessing <block> tags"""
for block in chapter_tag.find_all("blockquote"):
if block.attrs.get('class') in ['feature1', 'feature2', 'feature3', 'feature4']:
_clean_wiley_block(block)
color = '#DDDDDD' if block.attrs.get(
'class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get(
'class') == 'feature2' else color
_wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
block.unwrap()
for block in chapter_tag.find_all("blockquote", attrs={"class": re.compile("feature[1234]")}):
_clean_wiley_block(block)
color = '#DDDDDD' if block.attrs.get(
'class') == 'feature1' else None
color = '#EEEEEE' if block.attrs.get(
'class') == 'feature2' else color
_wrap_block_tag_with_table(chapter_tag, block, bg_color=color)
block.insert_after(BeautifulSoup(features='lxml').new_tag("br"))
block.unwrap()
for future_block in chapter_tag.find_all("p", attrs={"class": re.compile("feature[1234]")}):
_clean_wiley_block(future_block)
@@ -647,8 +642,7 @@ def _preprocess_code_tags(chapter_tag: BeautifulSoup):
code.name = "span"
if code.parent.name == "pre":
continue
# if tags aren't in pre
# if tags aren't in pre and don't have style
if not code.attrs.get('style'):
code.attrs['style'] = 'font-size: 14px; font-family: courier new,courier,monospace;'