add color style processing

delete the TOC at the end of the whole HTML parsing
fix the search for <a> tags in footnotes
This commit is contained in:
shirshasa
2020-09-23 15:20:53 +03:00
parent f392b6930d
commit bbfd489327
3 changed files with 28 additions and 35 deletions

View File

@@ -32,7 +32,7 @@ class Book:
main_logger=main_logger) main_logger=main_logger)
self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id) self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)
assert BookConfig.SUPPORTED_LEVELS == len(BookConfig.SUPPORTED_HEADERS), \ assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels." "Length of headers doesn't match allowed levels."
def save_docx(self, content): def save_docx(self, content):

View File

@@ -5,7 +5,7 @@ import re
from shutil import copyfile from shutil import copyfile
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
from config import BookConfig, BookLogger, BookApiWrapper from config import LawCartaConfig, BookLogger, BookApiWrapper
class HTMLPreprocessor: class HTMLPreprocessor:
@@ -49,8 +49,8 @@ class HTMLPreprocessor:
@classmethod @classmethod
def convert_pt_to_px(cls, value): def convert_pt_to_px(cls, value):
value = int(value) value = int(value)
if value == BookConfig.WORD_DEFAULT_FONT_SIZE: if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
return BookConfig.LAWCARTA_DEFAULT_FONT_SIZE return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
else: else:
return value return value
@@ -70,7 +70,7 @@ class HTMLPreprocessor:
size = size.group(1) size = size.group(1)
new_size = cls.convert_pt_to_px(size) new_size = cls.convert_pt_to_px(size)
if new_size == BookConfig.LAWCARTA_DEFAULT_FONT_SIZE: if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
return "" return ""
return re.sub(size + "pt", str(new_size) + "px", style) return re.sub(size + "pt", str(new_size) + "px", style)
@@ -83,41 +83,39 @@ class HTMLPreprocessor:
for font in fonts: for font in fonts:
face = font.get("face") face = font.get("face")
style = font.get("style") style = font.get("style")
color = font.get("color")
font.attrs = {} font.attrs = {}
font.name = "span" font.name = "span"
if style: if style:
style = self.convert_font_pt_to_px(style) style = self.convert_font_pt_to_px(style)
if style != "": if style != "":
if color and color != '#000000':
style += f'; color: {color};'
font.attrs["style"] = style font.attrs["style"] = style
elif color and color != '#000000':
font.attrs["style"] = f'color: {color};'
if face is not None: if face is not None:
face = re.sub(r",[\w,\- ]*$", "", face) face = re.sub(r",[\w,\- ]*$", "", face)
if face != BookConfig.DEFAULT_FONT_NAME and BookConfig.font_correspondence_table.get(face): if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face):
font.attrs["face"] = BookConfig.font_correspondence_table[face] font.attrs["face"] = LawCartaConfig.font_correspondence_table[face]
else: else:
font.attrs["face"] = BookConfig.DEFAULT_FONT_NAME font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME
if len(font.attrs) == 0: if len(font.attrs) == 0:
font.unwrap() font.unwrap()
assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags
def _remove_table_of_contents(self): def delete_content_before_toc(self):
""" # replace toc with empty <TOC> tag
Function to remove table of content from file.
"""
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
for table in tables:
table.decompose()
def _change_table_of_contents(self):
self._change_table_of_contents()
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
for table in tables: for table in tables:
table.wrap(self.html_soup.new_tag("TOC")) table.wrap(self.html_soup.new_tag("TOC"))
table.decompose() table.decompose()
def delete_content_before_toc(self): # remove all tag upper the <TOC>
toc_tag = self.html_soup.new_tag('TOC') toc_tag = self.html_soup.new_tag('TOC')
if toc_tag in self.content: if toc_tag in self.content:
ind = self.content.index(toc_tag) + 1 ind = self.content.index(toc_tag) + 1
@@ -131,14 +129,12 @@ class HTMLPreprocessor:
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages
self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$')) self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$'))
self._clean_tag('font', 'color', re.compile(r'^#[0-9a-fA-F]{6}$'))
self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$')) self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
self._clean_tag("a", "name", "_GoBack") self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links() self._clean_underline_links()
self._font_to_span() self._font_to_span()
# self._remove_table_of_contents()
def _process_paragraph(self): def _process_paragraph(self):
""" """
@@ -178,7 +174,7 @@ class HTMLPreprocessor:
p.attrs = {} p.attrs = {}
style = '' style = ''
if align is not None and align != BookConfig.DEFAULT_ALIGN_STYLE: if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};' style += f'text-align: {align};'
if indent is not None: if indent is not None:
@@ -280,10 +276,6 @@ class HTMLPreprocessor:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace('\u200c', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
# %E2%80%8C
for tag in a_tags_with_href:
print(tag)
@staticmethod @staticmethod
def _clean_footnote_content(content): def _clean_footnote_content(content):
content = content.strip() content = content.strip()
@@ -303,7 +295,8 @@ class HTMLPreprocessor:
footnotes = [] footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'}) true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0]
if true_a_tag.attrs.get('href') is None: if true_a_tag.attrs.get('href') is None:
cont_tag.a.decompose() cont_tag.a.decompose()
continue continue
@@ -439,7 +432,7 @@ class HTMLPreprocessor:
""" """
Function to convert all lower level headings to p tags Function to convert all lower level headings to p tags
""" """
pattern = f'^h[{BookConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern)) header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags: for tag in header_tags:
tag.name = 'p' tag.name = 'p'
@@ -527,8 +520,8 @@ class HTMLPreprocessor:
if title == "": if title == "":
tag.unwrap() tag.unwrap()
else: else:
assert tag.name in BookConfig.SUPPORTED_HEADERS, \ assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{BookConfig.SUPPORTED_LEVELS + 1}-h9 headings.' f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
# if tag.name in ["h4", "h5", "h6"]: # if tag.name in ["h4", "h5", "h6"]:
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings # tag.name = "h3" # All the lower level headings will be transformed to h3 headings

View File

@@ -4,7 +4,7 @@ import codecs
import json import json
from copy import copy from copy import copy
from config import BookConfig from src.config import LawCartaConfig
class JSONConverter: class JSONConverter:
@@ -34,7 +34,7 @@ class JSONConverter:
:param ind: Index of header in content list. :param ind: Index of header in content list.
""" """
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
title = self.content[ind].text title = self.content[ind].text
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
result = { result = {
@@ -47,7 +47,7 @@ class JSONConverter:
while ind < len(self.content): while ind < len(self.content):
# 1. next tag is a header # 1. next tag is a header
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
outline = int(re.sub(r"^h", "", self.content[ind].name)) outline = int(re.sub(r"^h", "", self.content[ind].name))
# - recursion step until h_i > h_initial # - recursion step until h_i > h_initial
if outline > curr_outline: if outline > curr_outline:
@@ -100,13 +100,13 @@ class JSONConverter:
while ind < len(self.content): while ind < len(self.content):
res = {} res = {}
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind) res, ind = self.header_to_json(ind)
else: else:
chapter_title = f'Untitled chapter {ch_num}' chapter_title = f'Untitled chapter {ch_num}'
chapter = [] chapter = []
while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS: while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]): if not self._is_empty_p_tag(self.content[ind]):
chapter.append(self.format_html(str(self.content[ind]))) chapter.append(self.format_html(str(self.content[ind])))
ind += 1 ind += 1