fix quotes processing

This commit is contained in:
Jeniamakarchik
2020-04-10 17:05:05 +03:00
parent 5c56c8c2fa
commit 9dea72ca31

View File

@@ -362,13 +362,31 @@ class Book:
if style: if style:
indent = re.search(r'text-indent: ([\d\.]{1,4})in', style) indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style)
margin_right= re.search(r'margin-right: ([\d\.]{1,4})in', style)
margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style)
margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style)
else: else:
indent = None indent = None
margin_left = None
margin_right = None
margin_top = None
margin_bottom = None
if margin_left and margin_right and margin_top and margin_bottom and \
margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \
margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11':
blockquote = BeautifulSoup(features='lxml').new_tag('blockquote')
blockquote.append(BeautifulSoup(features='lxml').new_tag('p'))
else:
blockquote = None
p.attrs = {} p.attrs = {}
style = '' style = ''
if align is not None and align != self.DEFAULT_ALIGN_STYLE: if align is not None and align != self.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};' style += f'text-align: {align};'
if indent is not None: if indent is not None:
indent = indent.group(1) indent = indent.group(1)
style += f'text-indent: {indent}in;' style += f'text-indent: {indent}in;'
@@ -376,6 +394,11 @@ class Book:
if style: if style:
p.attrs['style'] = style p.attrs['style'] = style
if blockquote:
blockquote.p.attrs = p.attrs
blockquote.p.string = p.text
p.replace_with(blockquote)
def _process_two_columns(self): def _process_two_columns(self):
""" """
Function to process paragraphs which has two columns layout. Function to process paragraphs which has two columns layout.
@@ -387,20 +410,20 @@ class Book:
child["class"] = "columns2" child["class"] = "columns2"
div.unwrap() div.unwrap()
def _process_quotes(self): # def _process_quotes(self):
""" # """
Function to process <dl> tags. All tags will be replaced with <blockquote> tags. # Function to process <dl> tags. All tags will be replaced with <blockquote> tags.
""" # """
dls = self.body_tag.find_all('dl') # dls = self.body_tag.find_all('dl')
#
for dl in dls: # for dl in dls:
pars = dl.find_all('p') # pars = dl.find_all('p')
for p in pars: # for p in pars:
p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) # p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote'))
new_div = BeautifulSoup(features='lxml').new_tag('div') # new_div = BeautifulSoup(features='lxml').new_tag('div')
for p in pars: # for p in pars:
new_div.append(p.parent) # new_div.append(p.parent)
dl.replaceWith(new_div) # dl.replaceWith(new_div)
@staticmethod @staticmethod
def _clean_footnote_content(content): def _clean_footnote_content(content):
@@ -575,7 +598,7 @@ class Book:
self._preprocessing_headings() self._preprocessing_headings()
self._process_paragraph() self._process_paragraph()
self._process_two_columns() self._process_two_columns()
self._process_quotes() # self._process_quotes()
self.log('Footnotes processing.') self.log('Footnotes processing.')
self._process_footnotes() self._process_footnotes()
@@ -766,8 +789,8 @@ class Book:
if __name__ == "__main__": if __name__ == "__main__":
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
file_path = pathlib.Path(os.path.join(folder_path, 'html/11/11.html')) file_path = pathlib.Path(os.path.join(folder_path, 'html/0/quote.html'))
out_path = pathlib.Path(os.path.join(folder_path, 'json/11.json')) out_path = pathlib.Path(os.path.join(folder_path, 'json/quote.json'))
logging_format = '%(asctime)s - %(levelname)s - %(message)s' logging_format = '%(asctime)s - %(levelname)s - %(message)s'