diff --git a/src/book.py b/src/book.py index a003e20..0fd7f4b 100644 --- a/src/book.py +++ b/src/book.py @@ -362,13 +362,31 @@ class Book: if style: indent = re.search(r'text-indent: ([\d\.]{1,4})in', style) + margin_left = re.search(r'margin-left: ([\d\.]{1,4})in', style) + margin_right= re.search(r'margin-right: ([\d\.]{1,4})in', style) + margin_top = re.search(r'margin-top: ([\d\.]{1,4})in', style) + margin_bottom = re.search(r'margin-bottom: ([\d\.]{1,4})in', style) else: indent = None + margin_left = None + margin_right = None + margin_top = None + margin_bottom = None + + if margin_left and margin_right and margin_top and margin_bottom and \ + margin_left.group(1) == '0.6' and margin_right.group(1) == '0.6' and \ + margin_top.group(1) == '0.14' and margin_bottom.group(1) == '0.11': + blockquote = BeautifulSoup(features='lxml').new_tag('blockquote') + blockquote.append(BeautifulSoup(features='lxml').new_tag('p')) + else: + blockquote = None p.attrs = {} style = '' + if align is not None and align != self.DEFAULT_ALIGN_STYLE: style += f'text-align: {align};' + if indent is not None: indent = indent.group(1) style += f'text-indent: {indent}in;' @@ -376,6 +394,11 @@ class Book: if style: p.attrs['style'] = style + if blockquote: + blockquote.p.attrs = p.attrs + blockquote.p.string = p.text + p.replace_with(blockquote) + def _process_two_columns(self): """ Function to process paragraphs which has two columns layout. @@ -387,20 +410,20 @@ class Book: child["class"] = "columns2" div.unwrap() - def _process_quotes(self): - """ - Function to process
<dl> tags. All tags will be replaced with <blockquote>
tags. - """ - dls = self.body_tag.find_all('dl') - - for dl in dls: - pars = dl.find_all('p') - for p in pars: - p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) - new_div = BeautifulSoup(features='lxml').new_tag('div') - for p in pars: - new_div.append(p.parent) - dl.replaceWith(new_div) + # def _process_quotes(self): + # """ + # Function to process
<dl> tags. All tags will be replaced with <blockquote>
tags. + # """ + # dls = self.body_tag.find_all('dl') + # + # for dl in dls: + # pars = dl.find_all('p') + # for p in pars: + # p.wrap(BeautifulSoup(features='lxml').new_tag('blockquote')) + # new_div = BeautifulSoup(features='lxml').new_tag('div') + # for p in pars: + # new_div.append(p.parent) + # dl.replaceWith(new_div) @staticmethod def _clean_footnote_content(content): @@ -575,7 +598,7 @@ class Book: self._preprocessing_headings() self._process_paragraph() self._process_two_columns() - self._process_quotes() + # self._process_quotes() self.log('Footnotes processing.') self._process_footnotes() @@ -766,8 +789,8 @@ class Book: if __name__ == "__main__": folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - file_path = pathlib.Path(os.path.join(folder_path, 'html/11/11.html')) - out_path = pathlib.Path(os.path.join(folder_path, 'json/11.json')) + file_path = pathlib.Path(os.path.join(folder_path, 'html/0/quote.html')) + out_path = pathlib.Path(os.path.join(folder_path, 'json/quote.json')) logging_format = '%(asctime)s - %(levelname)s - %(message)s'