# Reconstructed from a collapsed git diff that introduces two new files:
#   src/book.py     - Book converter (.docx/.html -> LiveCarta .json)
#   src/consumer.py - RabbitMQ consumer that drives Book conversions.

# ---------------------------------------------------------------------------
# src/book.py
# ---------------------------------------------------------------------------
import argparse
import codecs
import json
import logging
import os
import pathlib
import re
from shutil import copyfile

from bs4 import BeautifulSoup

# from src.header_detection import HeaderDetector


class Book:
    """Converts a .docx/.html document into a LiveCarta JSON book structure."""

    # Main constant values
    DEFAULT_FONT_NAME = 'Times New Roman'
    DEFAULT_ALIGN_STYLE = 'left'
    WORD_DEFAULT_FONT_SIZE = 11
    LAWCARTA_DEFAULT_FONT_SIZE = 18
    # Ratio used to rescale Word pt font sizes to LawCarta px sizes.
    FONT_CONVERT_RATIO = LAWCARTA_DEFAULT_FONT_SIZE / WORD_DEFAULT_FONT_SIZE
    # Maps a Word font face to the equivalent CSS font-family stack.
    font_correspondence_table = {
        "Arial": "arial,helvetica,sans-serif",
        "Comic Sans MS": "comic sans ms,cursive",
        "Courier New": "courier new,courier,monospace",
        "Georgia": "georgia,serif",
        "Lucida Sans Unicode": "lucida sans unicode,lucida grande,sans-serif",
        "Tahoma": "tahoma,geneva,sans-serif",
        "Times New Roman": "times new roman,times,serif",
        "Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
        "Verdana": "verdana,geneva,sans-serif",
    }
    SUPPORTED_HEADERS = ["h1", "h2", "h3"]

    def __init__(self, file_path, output=None, recreate=False, train_mode=False,
                 convert=False, model_location=None):
        self.file_path = pathlib.Path(file_path)
        self.output_path = output
        self.recreate = recreate
        self.train_mode = train_mode
        self.convert = convert
        self.model_location = model_location

        self.logger = None
        self.html_soup = None       # full BeautifulSoup document
        self.body_tag = None        # <body> tag of html_soup
        self.content = list()       # top-level tags of the processed body
        self.footnotes = list()     # extracted footnote texts (str)
        self.images = list()        # processed <img> tags
        self.content_dict = dict()  # final JSON-serializable structure
        # self.model = HeaderDetector(self.model_location, self.file_path.name)

    def parse_args(self):
        """
        Method for parsing arguments from command line.
        """
        parser = argparse.ArgumentParser(description='Converts .docx/.html documents to .json file with '
                                                     'LiveCarta book structure.')
        parser.add_argument('-f', dest='file_path', type=str, required=True, help='Path to file to be processed.')
        parser.add_argument('-o', dest='output', help='Path to output file.', default="")
        parser.add_argument('--recreate', nargs='?', const=True, default=False,
                            help='If output file exist, will overwrite it.')
        parser.add_argument('--convert', dest='convert', nargs='?', const=True, default=False,
                            help='Conversion from .docx to .html with "libreoffice".')
        parser.add_argument('--train', dest='train_mode', nargs='?', const=True, default=False,
                            help='Train mode, takes labeled file (with highlighted paragraphs) and fitted model')
        parser.add_argument('-m', dest='model_location',
                            help='Path to file with fitted model. If does not exist, will be created')

        args = parser.parse_args()

        folder_path = os.path.dirname(os.path.abspath(__file__))
        # Bug fix: keep file_path a pathlib.Path -- later code relies on
        # `.stem` (configure_file_logger, convert_doc_to_html, ...); the
        # original stored a plain str here.
        self.file_path = pathlib.Path(os.path.join(folder_path, "..", args.file_path))
        self.output_path = args.output
        self.recreate = args.recreate
        self.train_mode = args.train_mode
        self.convert = args.convert
        self.model_location = args.model_location
        # self.model = HeaderDetector(self.model_location, self.file_path.name)

    def configure_file_logger(self, name, attr_name='logger', filename='logs/converter_log.log', filemode='w',
                              logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
        """
        Method for Logger configuration. Logger will write in file.

        :param name: name of the Logger.
        :param attr_name: name of attribute that will be added to self.
        :param filename: name of the log file; overridden per input file when self.file_path is set.
        :param filemode: mode of opening log file.
        :param logging_level: logging level: 10 - debug, 20 - info, 30 - warning, 40 - error, 50 - critical.
        :param logging_format: format of record in log file.
        """
        logger = logging.getLogger(name)

        if self.file_path:
            # One log file per input document.
            filename = f'logs/{self.file_path.stem}_log.log'

        file_handler = logging.FileHandler(filename, mode=filemode)
        file_format = logging.Formatter(fmt=logging_format)
        file_handler.setFormatter(file_format)
        logger.addHandler(file_handler)

        logger.setLevel(logging_level)

        setattr(self, attr_name, logger)

    def convert_doc_to_html(self):
        """
        Method for convert .docx document to .html file (via libreoffice CLI).

        :raises FileNotFoundError: when the input file is missing or the
            conversion produced no output.
        """
        self.log(f'File - {self.file_path}.')
        print(self.file_path)
        self.log('Beginning of conversion from .docx to .html.')
        if not pathlib.Path(self.file_path).is_file():
            self.logger.error('Invalid path to input data.')
            raise FileNotFoundError(f'Invalid path to input data: {self.file_path}')

        # NOTE(review): file_path is interpolated into a shell command;
        # consider subprocess.run([...], shell=False) for untrusted paths.
        command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir html'
        os.system(command)

        self.file_path = pathlib.Path(f'html/{self.file_path.stem}.html')

        if not self.file_path.is_file():
            self.logger.error('Conversion has gone wrong.')
            raise FileNotFoundError(f'Conversion has gone wrong: {self.file_path} not created.')

        self.log("End of conversion from .docx to .html.")
        self.log(f'Input file path after conversion: {self.file_path}.')

    def check_output_directory(self):
        """
        Resolve the output path (defaulting to json/<input stem>.json) and
        create its parent directories.

        :raises FileExistsError: when the output exists and --recreate is off.
        """
        if not self.output_path:
            filename = f'{self.file_path.stem}.json'
            # Bug fix: the computed filename was previously dropped and a
            # literal placeholder written instead.
            self.output_path = f'json/{filename}'

        self.output_path = pathlib.Path(self.output_path)
        self.logger.info(f'Output file path: {self.output_path}')

        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.output_path.touch(exist_ok=self.recreate)
        except FileExistsError as e:
            self.logger.error('Output file already exists! '
                              'Either change the name of output file or use --recreate switch.')
            raise e

    def read_html(self):
        """
        Method for reading .html file into beautiful soup tag.
        """
        try:
            with open(self.file_path, 'r', encoding='utf8') as f:
                html_text = f.read()
        except FileNotFoundError as e:
            self.logger.error('There is no html to process. Conversion went wrong or you specified wrong paths.')
            raise e

        self.html_soup = BeautifulSoup(html_text, features='lxml')
        self.body_tag = self.html_soup.body

    def _clean_tag(self, tag, attr_name, attr_value):
        """
        Function to clean tags by its name and attribute value. Only tags whose
        sole attribute matches are unwrapped (content is kept).

        :param tag: Tag name to clean.
        :param attr_name: Attribute name.
        :param attr_value: Attribute value (str or compiled regex).
        """
        for found in self.body_tag.find_all(tag, {attr_name: attr_value}):
            if len(found.attrs) == 1:
                found.unwrap()

    def _clean_underline_links(self):
        """
        Function cleans meaningless <u> tags wrapping or wrapped by links.
        """
        underlines = self.body_tag.find_all("u")
        for u in underlines:
            if u.find_all('a'):
                u.unwrap()

        links = self.body_tag.find_all('a')
        for link in links:
            u = link.find_all('u')
            if u and len(u) == 1:
                u[0].unwrap()

    @classmethod
    def convert_pt_to_px(cls, style):
        """
        Method converts point in the font-size to pixels.

        :param style: Str with style to process.
        :return: Str with converted style ("" when the size equals the default).
        """
        size = re.search(r"font-size: (\d{1,3})pt", style)

        if size is None:
            return style

        size = size.group(1)
        new_size = round(cls.FONT_CONVERT_RATIO * float(size))

        if new_size == cls.LAWCARTA_DEFAULT_FONT_SIZE:
            return ""

        return re.sub(size + "pt", str(new_size) + "px", style)

    def _font_to_span(self):
        """
        Function to convert <font> tag to <span>. If font style is default,
        then remove this tag.
        """
        fonts = self.body_tag.find_all("font")
        for font in fonts:
            face = font.get("face")
            style = font.get("style")

            font.attrs = {}
            font.name = "span"
            if style:
                style = self.convert_pt_to_px(style)
                if style != "":
                    font.attrs["style"] = style
            if face is not None:
                # Strip any fallback families, keep the primary face only.
                face = re.sub(r",[\w,\- ]*$", "", face)
                if face != self.DEFAULT_FONT_NAME and self.font_correspondence_table.get(face):
                    font.attrs["face"] = self.font_correspondence_table[face]

            if len(font.attrs) == 0:
                font.unwrap()

        assert len(self.body_tag.find_all("font")) == 0  # on this step there should be no more <font> tags

    def _remove_table_of_contents(self):
        """
        Function to remove table of content from file.
        """
        tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
        for table in tables:
            table.decompose()

    def clean_trash(self):
        """
        Function to remove all styles and tags we don't need.
        """
        self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
        self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))  # todo: check for another languages
        self._clean_tag('span', 'style', re.compile(r'^letter-spacing: -?[\d\.]+pt$'))

        self._clean_tag('font', 'color', re.compile(r'^#[0-9a-fA-F]{6}$'))
        self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))

        self._clean_tag("a", "name", "_GoBack")
        self._clean_underline_links()

        self._font_to_span()
        self._remove_table_of_contents()

    def _process_paragraph(self):
        """
        Function to process <p> tags (text-align and text-indent value).
        """
        paragraphs = self.body_tag.find_all('p')

        for p in paragraphs:
            align = p.get('align')
            style = p.get('style')

            if style:
                indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
            else:
                indent = None

            # Drop every attribute, then rebuild only the style we keep.
            p.attrs = {}
            style = ''
            if align is not None and align != self.DEFAULT_ALIGN_STYLE:
                style += f'text-align: {align};'
            if indent is not None:
                indent = indent.group(1)
                style += f'text-indent: {indent}in;'

            if style:
                p.attrs['style'] = style

    def _process_two_columns(self):
        """
        Function to process paragraphs which has two columns layout.
        """
        two_columns = self.body_tag.find_all("div", style="column-count: 2")
        for div in two_columns:
            for child in div.children:
                if child.name == "p":
                    child["class"] = "columns2"
            div.unwrap()

    def _process_quotes(self):
        """
        Function to process <dl> tags. All tags will be replaced with
        <blockquote> elements grouped inside a <div>.
        """
        dls = self.body_tag.find_all("dl")

        for dl in dls:
            pars = dl.find_all("p")
            for p in pars:
                p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
            new_div = BeautifulSoup(features="lxml").new_tag("div")
            for p in pars:
                new_div.append(p.parent)
            dl.replaceWith(new_div)

    @staticmethod
    def _clean_footnote_content(content):
        # Strip whitespace and the leading footnote number.
        content = content.strip()
        content = re.sub(r'^\d+ ?', '', content)
        return content.strip()

    def _process_footnotes(self):
        """
        Function returns list of footnotes and delete them from html_soup.
        Each anchor is replaced with a <sup class="footnote-element"> marker.
        """
        footnote_ancors = self.body_tag.find_all("a", class_="sdfootnoteanc")
        footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$"))
        footnote_amt = len(footnote_ancors)

        # Anchors and content divs must pair up one-to-one.
        assert footnote_amt == len(footnote_content)

        footnotes = []
        for i, (anc_tag, cont_tag) in enumerate(zip(footnote_ancors, footnote_content)):
            # Anchor name must match the back-reference of its content div.
            assert anc_tag['name'] == cont_tag.find('a')['href'][1:]

            new_tag = BeautifulSoup(features="lxml").new_tag('sup')
            new_tag['class'] = 'footnote-element'
            new_tag['data-id'] = i + 1
            new_tag['id'] = f'footnote-{i + 1}'
            new_tag.string = '*'
            anc_tag.replace_with(new_tag)

            footnotes.append(self._clean_footnote_content(cont_tag.text))

        self.footnotes = footnotes

    def _process_images(self):
        """
        Function to process <img> tag. Img should be sent to Amazon S3 and then
        return new tag with valid link. For now images are moved to one folder.
        """
        imgs = self.body_tag.find_all("img")

        if len(imgs):
            new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
            new_path.mkdir(exist_ok=True)

        for img in imgs:
            img_name = img.attrs.get("src")
            img_path = pathlib.Path(f'html/{img_name}')

            # TODO: Implement loading to S3 and then getting link to it.
            new_img_path = new_path / img_name
            copyfile(img_path, new_img_path)
            img.attrs["src"] = str(new_img_path)

        self.images = imgs

    def _process_div(self):
        """
        Function to process <div> tags. All the tags will be deleted from file,
        all content of the tags will stay.
        """
        divs = self.body_tag.find_all("div")

        for div in divs:
            div.unwrap()

    def _process_toc_links(self):
        """
        Function to extract nodes which contains TOC links, remove links from
        file and detect headers.
        """
        toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
        headers = [link.parent for link in toc_links]
        outline_level = "1"  # All the unknown outlines will be predicted as <h1>
        for tag in headers:
            if re.search(r"^h\d$", tag.name):
                tag.a.unwrap()
                # outline_level = tag.name[-1]  # TODO: add prediction of the outline level
                # TODO: escape from recounting paragraphs every time
            elif tag.name == "p":
                if tag in self.body_tag.find_all("p"):
                    new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
                    text = tag.text
                    tag.replaceWith(new_tag)
                    new_tag.string = text
            else:
                # rethink document structure when you have toc_links, other cases?
                self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. '
                                    f'Tag name: {tag.name}')

    @staticmethod
    def clean_header_title(title):
        """
        Function to remove digits and extra spaces from headers.

        :param title: Title to process.
        :return: Cleaned title.
        """
        title = re.sub(r'\s+', ' ', title).strip()
        title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
        title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
        return title.strip()

    def _process_headings(self):
        """
        Function to process tags <h1>-<h6>: unwrap list wrappers, clean titles,
        demote h4-h6 to h3, drop headings with empty titles.
        """
        header_tags = self.body_tag.find_all(re.compile("^h[1-6]$"))
        for tag in header_tags:
            if tag.parent.name == "li":
                tag.parent.unwrap()
            while tag.parent.name == "ol":
                tag.parent.unwrap()

            title = self.clean_header_title(tag.text)
            if title == "":
                tag.unwrap()
            else:
                if tag.name in ["h4", "h5", "h6"]:  # All the lower level headings will be transformed to h3 headings
                    tag.name = "h3"

                new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
                new_tag.string = title
                tag.replace_with(new_tag)

    def write_html_from_list(self, file_name='url_test.html'):
        """Dump the current body to an .html file for manual inspection."""
        with open(file_name, 'w', encoding='utf-8') as f_out:
            f_out.write(self.body_tag.prettify())
        # Bug fix: log the actual file name, not a hard-coded one.
        self.logger.info(f'Check test file - {file_name}.')

    def process_html(self):
        """
        Process html code to satisfy LawCarta formatting.
        """
        self.logger.info('Beginning of processing .html file.')

        self.clean_trash()

        # process main elements of the .html doc
        self._process_paragraph()
        self._process_two_columns()
        self._process_quotes()

        self.logger.info('Footnotes processing.')
        self._process_footnotes()
        self.logger.info(f'{len(self.footnotes)} footnotes have been processed.')

        self.logger.info('Image processing.')
        self._process_images()
        self.logger.info(f'{len(self.images)} images have been processed.')

        self._process_div()

        self.content = self.body_tag.find_all(recursive=False)

        # TODO: plug HeaderDetector train/predict here once the model lands.

        self.write_html_from_list()

        self._process_toc_links()
        self._process_headings()

        self.content = self.body_tag.find_all(recursive=False)
        self.logger.info('End of processing .html file.')

    @staticmethod
    def format_html(html_text):
        """
        Function to remove useless symbols from html code.

        :param html_text: Text to process.
        :return: Cleaned text.
        """
        return re.sub(r'([\n\t])', ' ', html_text)

    # TODO: rethink the function structure without indexes.
    def header_to_json(self, ind):
        """
        Function process header and collects all content for it.

        :param ind: Index of header in content list.
        :return: (dict mapping title -> content list, index after the chapter),
            or '' when content[ind] is not a supported header.
        """
        if self.content[ind].name in self.SUPPORTED_HEADERS:
            title = self.content[ind].text
            curr_outline = int(re.sub(r"^h", "", self.content[ind].name))  # extract outline from tag
            result = {title: []}
            ch_content = []
            ind += 1
            while ind < len(self.content):
                if self.content[ind].name in self.SUPPORTED_HEADERS:
                    outline = int(re.sub(r"^h", "", self.content[ind].name))
                    if outline > curr_outline:
                        # Deeper heading: recurse and nest its chapter.
                        res, ind = self.header_to_json(ind)
                        if ch_content:
                            result[title].append("".join(ch_content))
                            ch_content = []
                        result[title].append(res)
                    else:
                        # Same or higher level heading ends this chapter.
                        break
                else:
                    ch_content.append(self.format_html(str(self.content[ind])))
                    ind += 1
            if ch_content:
                result[title].append("".join(ch_content))
            return result, ind
        return ''

    def convert_to_json(self):
        """
        Function which convert list of html nodes to appropriate json structure.
        """
        json_strc = []
        ind = 0
        ch_num = 0
        while ind < len(self.content):
            if self.content[ind].name in self.SUPPORTED_HEADERS:
                res, ind = self.header_to_json(ind)
            else:
                # Leading content without a header becomes an untitled chapter.
                chapter_title = f'Untitled chapter {ch_num}'
                chapter = []
                # Bug fix: bounds check prevents IndexError when the document
                # ends without a header.
                while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:
                    chapter.append(self.format_html(str(self.content[ind])))
                    ind += 1
                res = {chapter_title: ["".join(chapter)]}
                ch_num += 1
            json_strc.append(res)

        self.content_dict = {
            "content": json_strc,
            "footnotes": self.footnotes
        }

    def write_json(self):
        """Serialize content_dict to the output path as UTF-8 JSON."""
        with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
            json.dump(self.content_dict, f, ensure_ascii=False)

    def log(self, message, logging_level=20):
        self.logger.log(msg=message, level=logging_level)

    def conversion(self, logging_format, filemode='w'):
        """
        Full pipeline: optional .docx -> .html conversion, HTML cleanup and
        processing, JSON structure build and write.
        """
        self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
        self.log('Beginning of conversion from .docx to .json.')
        if self.convert:
            self.convert_doc_to_html()
        self.check_output_directory()
        self.read_html()
        # clean_trash() is invoked inside process_html(); the original called
        # it here a second time as well, which was redundant.
        self.process_html()
        self.convert_to_json()
        self.write_json()
        self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')


if __name__ == '__main__':
    logging_format = '%(asctime)s - %(levelname)s - %(message)s'

    book = Book(file_path="", recreate=True)
    book.parse_args()
    book.conversion(logging_format)

    print('Script has finished.')


# ---------------------------------------------------------------------------
# src/consumer.py
# ---------------------------------------------------------------------------
import json
import os
from threading import Thread, active_count

import pika

from book import Book
# from src.book import Book


class Consumer:
    """Skeleton for a reconnecting RabbitMQ consumer (not yet implemented)."""

    def __init__(self, url):
        self._connection = None
        self._channel = None
        self._closing = False
        self._url = url

    def run(self):
        pass

    def close_connection(self):
        pass


def convert_book(file_path, output=None, recreate=True, train_mode=False, convert=False, model_location=None):
    """Run one Book conversion; file_path is resolved against the project root."""
    logging_format = '%(asctime)s - %(levelname)s - %(message)s'

    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    file_path = os.path.join(folder_path, file_path)
    book = Book(file_path, output, recreate, train_mode, convert, model_location)
    book.conversion(logging_format=logging_format)
    print('Book has been proceeded.')


def callback(ch, method, properties, body):
    """pika message handler: body is a JSON object of convert_book kwargs."""
    print(f'Message: {body}.')
    try:
        data = json.loads(body)
        thread = Thread(target=convert_book, kwargs=data)
        thread.start()
        print(f'Active threads: {active_count()}.')

    except Exception as e:
        # Broad catch on purpose: a malformed message must not kill the
        # consumer loop.
        print(e)

    finally:
        print('Waiting for the message...')


if __name__ == '__main__':

    folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    config_path = os.path.join(folder_path, "test_config/config.json")
    with open(config_path, "r") as f:
        conf_param = json.load(f)

    credentials = pika.PlainCredentials(username=conf_param['username'], password=conf_param['password'])
    parameters = pika.ConnectionParameters(host=conf_param['host'], credentials=credentials)
    connection = pika.BlockingConnection(parameters)
    channel = connection.channel()

    try:
        # passive=True: fail fast if the queue does not already exist.
        channel.queue_declare(queue=conf_param['queue'], passive=True)
    except Exception as e:
        print(e)
        raise

    channel.basic_consume(queue=conf_param['queue'],
                          auto_ack=True,
                          on_message_callback=callback)

    print('Waiting for messages...')
    channel.start_consuming()