diff --git a/src/book.py b/src/book.py index 6587901..38efb1c 100644 --- a/src/book.py +++ b/src/book.py @@ -32,7 +32,7 @@ class Book: main_logger=main_logger) self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id) - assert BookConfig.SUPPORTED_LEVELS == len(BookConfig.SUPPORTED_HEADERS), \ + assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \ "Length of headers doesn't match allowed levels." def save_docx(self, content): diff --git a/src/html_preprocessor.py b/src/html_preprocessor.py index 491537d..3ed60a4 100644 --- a/src/html_preprocessor.py +++ b/src/html_preprocessor.py @@ -5,7 +5,7 @@ import re from shutil import copyfile from bs4 import BeautifulSoup, NavigableString -from config import BookConfig, BookLogger, BookApiWrapper +from config import LawCartaConfig, BookLogger, BookApiWrapper class HTMLPreprocessor: @@ -49,8 +49,8 @@ class HTMLPreprocessor: @classmethod def convert_pt_to_px(cls, value): value = int(value) - if value == BookConfig.WORD_DEFAULT_FONT_SIZE: - return BookConfig.LAWCARTA_DEFAULT_FONT_SIZE + if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE: + return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE else: return value @@ -70,7 +70,7 @@ class HTMLPreprocessor: size = size.group(1) new_size = cls.convert_pt_to_px(size) - if new_size == BookConfig.LAWCARTA_DEFAULT_FONT_SIZE: + if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE: return "" return re.sub(size + "pt", str(new_size) + "px", style) @@ -83,41 +83,39 @@ class HTMLPreprocessor: for font in fonts: face = font.get("face") style = font.get("style") + color = font.get("color") font.attrs = {} font.name = "span" if style: style = self.convert_font_pt_to_px(style) if style != "": + if color and color != '#000000': + style += f'; color: {color};' font.attrs["style"] = style + elif color and color != '#000000': + font.attrs["style"] = f'color: {color};' + if face is not None: face = re.sub(r",[\w,\- ]*$", "", face) - if face != BookConfig.DEFAULT_FONT_NAME and BookConfig.font_correspondence_table.get(face): - font.attrs["face"] = BookConfig.font_correspondence_table[face] + if face != LawCartaConfig.DEFAULT_FONT_NAME and LawCartaConfig.font_correspondence_table.get(face): + font.attrs["face"] = LawCartaConfig.font_correspondence_table[face] else: - font.attrs["face"] = BookConfig.DEFAULT_FONT_NAME + font.attrs["face"] = LawCartaConfig.DEFAULT_FONT_NAME if len(font.attrs) == 0: font.unwrap() assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more tags - def _remove_table_of_contents(self): - """ - Function to remove table of content from file. - """ - tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) - for table in tables: - table.decompose() - - def _change_table_of_contents(self): - self._change_table_of_contents() + def delete_content_before_toc(self): + # replace toc with empty tag tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+')) for table in tables: table.wrap(self.html_soup.new_tag("TOC")) table.decompose() - def delete_content_before_toc(self): + # remove all tag upper the toc_tag = self.html_soup.new_tag('TOC') if toc_tag in self.content: ind = self.content.index(toc_tag) + 1 @@ -131,14 +129,12 @@ class HTMLPreprocessor: self._clean_tag('span', 'lang', re.compile(r'^ru-RU$')) # todo: check for another languages self._clean_tag('span', 'style', re.compile('^letter-spacing: -?[\d\.]+pt$')) - self._clean_tag('font', 'color', re.compile(r'^#[0-9a-fA-F]{6}$')) self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$')) self._clean_tag("a", "name", "_GoBack") self._clean_underline_links() self._font_to_span() - # self._remove_table_of_contents() def _process_paragraph(self): """ @@ -178,7 +174,7 @@ class HTMLPreprocessor: p.attrs = {} style = '' - if align is not None and align != BookConfig.DEFAULT_ALIGN_STYLE: + if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE: style += f'text-align: {align};' if indent is not None: @@ -280,10 +276,6 @@ class HTMLPreprocessor: tag.string = tag.text.replace('\u200c', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') - # %E2%80%8C - for tag in a_tags_with_href: - print(tag) - @staticmethod def _clean_footnote_content(content): content = content.strip() @@ -303,7 +295,8 @@ class HTMLPreprocessor: footnotes = [] for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)): - true_a_tag = cont_tag.find('a', {'class': 'sdfootnotesym-western'}) + true_a_tag = cont_tag.find_all('a', class_=re.compile(r'^sdfootnote.+$'))[0] + if true_a_tag.attrs.get('href') is None: cont_tag.a.decompose() continue @@ -439,7 +432,7 @@ class HTMLPreprocessor: """ Function to convert all lower level headings to p tags """ - pattern = f'^h[{BookConfig.SUPPORTED_LEVELS + 1}-9]$' + pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$' header_tags = self.body_tag.find_all(re.compile(pattern)) for tag in header_tags: tag.name = 'p' @@ -527,8 +520,8 @@ class HTMLPreprocessor: if title == "": tag.unwrap() else: - assert tag.name in BookConfig.SUPPORTED_HEADERS, \ - f'Preprocessing went wrong, there is still h{BookConfig.SUPPORTED_LEVELS + 1}-h9 headings.' + assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \ + f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.' # if tag.name in ["h4", "h5", "h6"]: # tag.name = "h3" # All the lower level headings will be transformed to h3 headings diff --git a/src/json_converter.py b/src/json_converter.py index 0e88ad7..64f7d2d 100644 --- a/src/json_converter.py +++ b/src/json_converter.py @@ -4,7 +4,7 @@ import codecs import json from copy import copy -from config import BookConfig +from src.config import LawCartaConfig class JSONConverter: @@ -34,7 +34,7 @@ class JSONConverter: :param ind: Index of header in content list. """ - if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: title = self.content[ind].text curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag result = { @@ -47,7 +47,7 @@ class JSONConverter: while ind < len(self.content): # 1. next tag is a header - if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: outline = int(re.sub(r"^h", "", self.content[ind].name)) # - recursion step until h_i > h_initial if outline > curr_outline: @@ -100,13 +100,13 @@ class JSONConverter: while ind < len(self.content): res = {} - if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: + if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS: res, ind = self.header_to_json(ind) else: chapter_title = f'Untitled chapter {ch_num}' chapter = [] - while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS: + while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS: if not self._is_empty_p_tag(self.content[ind]): chapter.append(self.format_html(str(self.content[ind]))) ind += 1