add main scripts for converter

This commit is contained in:
Jeniamakarchik
2020-01-16 18:17:39 +03:00
parent a9cea63542
commit 653b4c934b
2 changed files with 677 additions and 0 deletions

596
src/book.py Normal file

@@ -0,0 +1,596 @@
import argparse
import codecs
import json
import logging
import os
import pathlib
import re
from shutil import copyfile
from bs4 import BeautifulSoup
# from src.header_detection import HeaderDetector
class Book:
# Main constant values
DEFAULT_FONT_NAME = 'Times New Roman'
DEFAULT_ALIGN_STYLE = 'left'
WORD_DEFAULT_FONT_SIZE = 11
LAWCARTA_DEFAULT_FONT_SIZE = 18
FONT_CONVERT_RATIO = LAWCARTA_DEFAULT_FONT_SIZE / WORD_DEFAULT_FONT_SIZE
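# Word's default body size is 11pt and the LawCarta default is 18px, so font sizes scale by 18/11 (~1.64).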
font_correspondence_table = {
"Arial": "arial,helvetica,sans-serif",
"Comic Sans MS": "comic sans ms,cursive",
"Courier New": "courier new,courier,monospace",
"Georgia": "georgia,serif",
"Lucida Sans Unicode": "lucida sans unicode,lucida grande,sans-serif",
"Tahoma": "tahoma,geneva,sans-serif",
"Times New Roman": "times new roman,times,serif",
"Trebuchet MS": "trebuchet ms,helvetica,sans-serif",
"Verdana": "verdana,geneva,sans-serif"
}
SUPPORTED_HEADERS = ["h1", "h2", "h3"]
def __init__(self, file_path, output=None, recreate=False, train_mode=False, convert=False, model_location=None):
self.file_path = pathlib.Path(file_path)
self.output_path = output
self.recreate = recreate
self.train_mode = train_mode
self.convert = convert
self.model_location = model_location
self.logger = None
self.html_soup = None
self.body_tag = None
self.content = list()
self.footnotes = list()
self.images = list()
self.content_dict = dict()
# self.model = HeaderDetector(self.model_location, self.file_path.name)
def parse_args(self):
"""
Method for parsing command-line arguments.
"""
parser = argparse.ArgumentParser(description='Converts .docx/.html documents to a .json file with the '
'LiveCarta book structure.')
parser.add_argument('-f', dest='file_path', type=str, required=True, help='Path to file to be processed.')
parser.add_argument('-o', dest='output', help='Path to output file.', default="")
parser.add_argument('--recreate', nargs='?', const=True, default=False,
help='If the output file exists, it will be overwritten.')
parser.add_argument('--convert', dest='convert', nargs='?', const=True, default=False,
help='Conversion from .docx to .html with "libreoffice".')
parser.add_argument('--train', dest='train_mode', nargs='?', const=True, default=False,
help='Train mode: takes a labeled file (with highlighted paragraphs) and a fitted model.')
parser.add_argument('-m', dest='model_location',
help='Path to the file with a fitted model. If it does not exist, it will be created.')
args = parser.parse_args()
folder_path = os.path.dirname(os.path.abspath(__file__))
self.file_path = pathlib.Path(os.path.join(folder_path, "..", args.file_path))  # keep a Path object: .stem is used later
self.output_path = args.output
self.recreate = args.recreate
self.train_mode = args.train_mode
self.convert = args.convert
self.model_location = args.model_location
# self.model = HeaderDetector(self.model_location, self.file_path.name)
def configure_file_logger(self, name, attr_name='logger', filename='logs/converter_log.log', filemode='w',
logging_level=logging.INFO, logging_format='%(asctime)s - %(message)s'):
"""
Method for logger configuration. The logger writes to a file.
:param name: name of the Logger.
:param attr_name: name of attribute that will be added to self.
:param filename: name of the log file.
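If self.file_path is set, this is overridden with logs/<stem>_log.log.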
:param filemode: mode of opening log file.
:param logging_level: logging level: 10 - debug, 20 - info, 30 - warning, 40 - error, 50 - critical.
:param logging_format: format of record in log file.
"""
logger = logging.getLogger(name)
if self.file_path:
filename = f'logs/{self.file_path.stem}_log.log'
file_handler = logging.FileHandler(filename, mode=filemode)
# file_format = logging.Formatter(fmt=logging_format, datefmt=date_format)
file_format = logging.Formatter(fmt=logging_format)
file_handler.setFormatter(file_format)
logger.addHandler(file_handler)
logger.setLevel(logging_level)
setattr(self, attr_name, logger)
def convert_doc_to_html(self):
"""
Method to convert a .docx document to an .html file.
"""
self.log(f'File - {self.file_path}.')
self.log('Beginning of conversion from .docx to .html.')
try:
f = open(self.file_path)
f.close()
except FileNotFoundError as error:
self.logger.error('Invalid path to input data.')
raise error
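# Assumes the libreoffice binary is available on PATH; --outdir puts the converted file into the local html/ directory.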
command = f'libreoffice --headless --convert-to html "{str(self.file_path)}" --outdir html'
os.system(command)
self.file_path = pathlib.Path(f'html/{self.file_path.stem}.html')
try:
f = open(self.file_path)
f.close()
except FileNotFoundError as e:
self.logger.error('Conversion failed: no .html output was produced.')
raise e
self.log("End of conversion from .docx to .html.")
self.log(f'Input file path after conversion: {self.file_path}.')
def check_output_directory(self):
if not self.output_path:
filename = f'{self.file_path.stem}.json'
self.output_path = f'json/{filename}'
self.output_path = pathlib.Path(self.output_path)
self.logger.info(f'Output file path: {self.output_path}')
pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
try:
self.output_path.touch(exist_ok=self.recreate)
except FileExistsError as e:
self.logger.error('Output file already exists! '
'Either change the name of output file or use --recreate switch.')
raise e
def read_html(self):
"""
Method for reading the .html file into a BeautifulSoup tree.
"""
try:
html_text = pathlib.Path(self.file_path).read_text(encoding='utf8')  # read_text closes the handle, unlike a bare open().read()
except FileNotFoundError as e:
self.logger.error('There is no html to process. Conversion went wrong or you specified wrong paths.')
raise e
self.html_soup = BeautifulSoup(html_text, features='lxml')
self.body_tag = self.html_soup.body
# head_tag = self.html_soup.head
# styles = parse_styles(head_tag.style)
# head_tag.decompose()
def _clean_tag(self, tag, attr_name, attr_value):
"""
Function to clean tags by name and attribute value.
:param tag: Tag name to clean.
:param attr_name: Attribute name.
:param attr_value: Attribute value.
"""
tags = self.body_tag.find_all(tag, {attr_name: attr_value})
for tag in tags:
if len(tag.attrs) == 1:
tag.unwrap()
def _clean_underline_links(self):
"""
Function cleans meaningless <u> tags around links.
"""
underlines = self.body_tag.find_all("u")
for u in underlines:
if u.find_all('a'):
u.unwrap()
links = self.body_tag.find_all('a')
for link in links:
u = link.find_all('u')
if u and len(u) == 1:
u[0].unwrap()
@classmethod
def convert_pt_to_px(cls, style):
"""
Method converts the font-size value from points to pixels.
:param style: Str with style to process.
:return: Str with converted style.
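Example: "font-size: 12pt" becomes "font-size: 20px"; a size that converts to the default 18px yields "".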
"""
size = re.search(r"font-size: (\d{1,3})pt", style)
if size is None:
return style
size = size.group(1)
new_size = round(cls.FONT_CONVERT_RATIO * float(size))
if new_size == cls.LAWCARTA_DEFAULT_FONT_SIZE:
return ""
return re.sub(size + "pt", str(new_size) + "px", style)
def _font_to_span(self):
"""
Function to convert <font> tags to <span>. If the font style is the default one, the tag is removed.
"""
fonts = self.body_tag.find_all("font")
for font in fonts:
face = font.get("face")
style = font.get("style")
font.attrs = {}
font.name = "span"
if style:
style = self.convert_pt_to_px(style)
if style != "":
font.attrs["style"] = style
if face is not None:
face = re.sub(r",[\w,\- ]*$", "", face)
if face != self.DEFAULT_FONT_NAME and self.font_correspondence_table.get(face):
font.attrs["face"] = self.font_correspondence_table[face]
if len(font.attrs) == 0:
font.unwrap()
assert len(self.body_tag.find_all("font")) == 0 # on this step there should be no more <font> tags
def _remove_table_of_contents(self):
"""
Function to remove the table of contents from the file.
"""
tables = self.body_tag.find_all("div", id=re.compile(r'^Table of Contents\d+'))
for table in tables:
table.decompose()
def clean_trash(self):
"""
Function to remove all styles and tags we don't need.
"""
self._clean_tag('span', 'style', re.compile(r'^background: #[0-9a-fA-F]{6}$'))
self._clean_tag('span', 'lang', re.compile(r'^ru-RU$'))  # TODO: check for other languages
self._clean_tag('span', 'style', re.compile(r'^letter-spacing: -?[\d\.]+pt$'))
self._clean_tag('font', 'color', re.compile(r'^#[0-9a-fA-F]{6}$'))
self._clean_tag('font', 'face', re.compile(r'^Times New Roman[\w, ]+$'))
self._clean_tag("a", "name", "_GoBack")
self._clean_underline_links()
self._font_to_span()
self._remove_table_of_contents()
def _process_paragraph(self):
"""
Function to process <p> tags (text-align and text-indent values).
"""
paragraphs = self.body_tag.find_all('p')
for p in paragraphs:
align = p.get('align')
style = p.get('style')
if style:
indent = re.search(r'text-indent: ([\d\.]{1,4})in', style)
else:
indent = None
p.attrs = {}
style = ''
if align is not None and align != self.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};'
if indent is not None:
indent = indent.group(1)
style += f'text-indent: {indent}in;'
if style:
p.attrs['style'] = style
def _process_two_columns(self):
"""
Function to process paragraphs that have a two-column layout.
"""
two_columns = self.body_tag.find_all("div", style="column-count: 2")
for div in two_columns:
for child in div.children:
if child.name == "p":
child["class"] = "columns2"
div.unwrap()
def _process_quotes(self):
"""
Function to process <dl> tags. Each <dl> is replaced with a <div> whose paragraphs are wrapped in <blockquote> tags.
"""
dls = self.body_tag.find_all("dl")
for dl in dls:
pars = dl.find_all("p")
for p in pars:
p.wrap(BeautifulSoup(features="lxml").new_tag("blockquote"))
new_div = BeautifulSoup(features="lxml").new_tag("div")
for p in pars:
new_div.append(p.parent)
dl.replaceWith(new_div)
@staticmethod
def _clean_footnote_content(content):
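"""Strip leading footnote numbering and surrounding whitespace, e.g. '12 See note.' -> 'See note.'."""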
content = content.strip()
content = re.sub(r'^\d+ ?', '', content)
return content.strip()
def _process_footnotes(self):
"""
Function builds the list of footnotes and deletes them from html_soup.
"""
footnote_anchors = self.body_tag.find_all("a", class_="sdfootnoteanc")
footnote_content = self.body_tag.find_all("div", id=re.compile(r"^sdfootnote\d+$"))
footnote_amt = len(footnote_anchors)
assert footnote_amt == len(footnote_content)
footnotes = []
for i, (anc_tag, cont_tag) in enumerate(zip(footnote_anchors, footnote_content)):
assert anc_tag['name'] == cont_tag.find('a')['href'][1:]
new_tag = BeautifulSoup(features="lxml").new_tag('sup')
new_tag['class'] = 'footnote-element'
new_tag['data-id'] = i+1
new_tag['id'] = f'footnote-{i+1}'
new_tag.string = '*'
anc_tag.replace_with(new_tag)
content = self._clean_footnote_content(cont_tag.text)
# new_tag = BeautifulSoup(features="lxml").new_tag('div')
# new_tag['class'] = 'footnote-element'
# new_tag['data-id'] = f'"{i}"'
# new_tag['id'] = f'footnote-{i}'
# new_tag.string = content
# footnotes.append(str(new_tag))
footnotes.append(content)
# i += 1
self.footnotes = footnotes
def _process_images(self):
"""
Function to process <img> tags. Images should be uploaded to Amazon S3 and each tag updated with a valid link.
For now, images are moved to a single folder.
"""
imgs = self.body_tag.find_all("img")
if len(imgs):
new_path = pathlib.Path(f'json/img_{self.file_path.stem}/')
new_path.mkdir(exist_ok=True)
for img in imgs:
img_name = img.attrs.get("src")
img_path = pathlib.Path(f'html/{img_name}')
# img_size = os.path.getsize(img_path) # TODO: Implement loading to S3 and then getting link to it.
# print(f'{img_name} successfully loaded. Image size: {img_size}.')
new_img_path = new_path / img_name
copyfile(img_path, new_img_path)
img.attrs["src"] = str(new_img_path)
self.images = imgs
def _process_div(self):
"""
Function to process <div> tags. The tags are removed from the file while their content stays.
"""
divs = self.body_tag.find_all("div")
for div in divs:
div.unwrap()
def _process_toc_links(self):
"""
Function to extract nodes that contain TOC links, remove the links from the file and detect headers.
"""
toc_links = self.body_tag.find_all("a", {'name': re.compile(r'^_Toc\d+')})
headers = [link.parent for link in toc_links]
outline_level = "1" # All the unknown outlines will be predicted as <h1>
for tag in headers:
if re.search(r"^h\d$", tag.name):
tag.a.unwrap()
# outline_level = tag.name[-1] # TODO: add prediction of the outline level
# TODO: escape from recounting paragraphs every time
elif tag.name == "p":
if tag in self.body_tag.find_all("p"):
new_tag = BeautifulSoup(features="lxml").new_tag("h" + outline_level)
text = tag.text
tag.replaceWith(new_tag)
new_tag.string = text
else:
# rethink document structure when you have toc_links, other cases?
self.logger.warning(f'Something went wrong in processing toc_links. Check the structure of the file. '
f'Tag name: {tag.name}')
@staticmethod
def clean_header_title(title):
"""
Function to remove numbering (digits and letters) and extra spaces from header titles.
:param title: Title to process.
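Example: "1.2. Overview" becomes "Overview" and "A. Background" becomes "Background".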
"""
title = re.sub(r'\s+', ' ', title).strip()
title = re.sub(r'^(?:\.?\d+\.? ?)+', '', title)
# title = re.sub(r'^(?:\.?[MDCLXVIclxvi]+\.? ?)+ ', '', title) # delete chapter numbering from the title
title = re.sub(r'^(?:[A-Za-z]\. ?)+', '', title)
return title.strip()
def _process_headings(self):
"""
Function to process heading tags (<h1>-<h6>).
"""
header_tags = self.body_tag.find_all(re.compile("^h[1-6]$"))
for tag in header_tags:
if tag.parent.name == "li":
tag.parent.unwrap()
while tag.parent.name == "ol":
tag.parent.unwrap()
title = tag.text
title = self.clean_header_title(title)
if title == "":
tag.unwrap()
else:
if tag.name in ["h4", "h5", "h6"]: # All the lower level headings will be transformed to h3 headings
tag.name = "h3"
new_tag = BeautifulSoup(features='lxml').new_tag(name=tag.name)
new_tag.string = title
tag.replace_with(new_tag)
def write_html_from_list(self, file_name='url_test.html'):
with open(file_name, 'w', encoding='utf-8') as f_out:
# f_out.write("".join([tag.prettify() for tag in self.content]))
f_out.write(self.body_tag.prettify())
self.logger.info(f'Check test file - {file_name}.')
def process_html(self):
"""
Process html code to satisfy LawCarta formatting.
"""
self.logger.info('Beginning of processing .html file.')
self.clean_trash()
# process main elements of the .html doc
self._process_paragraph()
self._process_two_columns()
self._process_quotes()
self.logger.info('Footnotes processing.')
self._process_footnotes()
self.logger.info(f'{len(self.footnotes)} footnotes have been processed.')
self.logger.info('Image processing.')
self._process_images()
self.logger.info(f'{len(self.images)} images have been processed.')
self._process_div()
self.content = self.body_tag.find_all(recursive=False)
# if self.train_mode:
# self.model.train_model(self.content)
# else:
# self.model.predict_headers(self.content)
self.write_html_from_list()
self._process_toc_links()
self._process_headings()
self.content = self.body_tag.find_all(recursive=False)
self.logger.info('End of processing .html file.')
@staticmethod
def format_html(html_text):
"""
Function to remove useless symbols from html code.
:param html_text: Text to process.
:return: Cleaned text.
"""
new_text = re.sub(r'([\n\t])', ' ', html_text)
return new_text
# TODO: rethink the function structure without indexes.
def header_to_json(self, ind):
"""
Function processes a header and collects all of its content.
:param ind: Index of header in content list.
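:return: Tuple of (dict mapping the title to a list of html strings and nested sub-chapter dicts, index of the next unprocessed node), or '' if content[ind] is not a header.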
"""
if self.content[ind].name in self.SUPPORTED_HEADERS:
title = self.content[ind].text
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
result = {title: []}
ch_content = []
ind += 1
while ind < len(self.content):
if self.content[ind].name in self.SUPPORTED_HEADERS:
outline = int(re.sub(r"^h", "", self.content[ind].name))
if outline > curr_outline:
res, ind = self.header_to_json(ind)
if ch_content:
result[title].append("".join(ch_content))
ch_content = []
result[title].append(res)
else:
# return result, ind
break
else:
res = self.format_html(str(self.content[ind]))
# result[title].append(res)
ch_content.append(res)
ind += 1
if ch_content:
result[title].append("".join(ch_content))
return result, ind
return ''
def convert_to_json(self):
"""
Function that converts the list of html nodes to the target json structure.
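The result is stored in self.content_dict as {"content": [...], "footnotes": [...]}.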
"""
json_strc = []
ind = 0
ch_num = 0
while ind < len(self.content):
if self.content[ind].name in self.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind)
else:
chapter_title = f'Untitled chapter {ch_num}'
chapter = []
while ind < len(self.content) and self.content[ind].name not in self.SUPPORTED_HEADERS:  # bound check avoids IndexError when the document ends without a header
chapter.append(self.format_html(str(self.content[ind])))
ind += 1
res = {chapter_title: ["".join(chapter)]}
ch_num += 1
json_strc.append(res)
self.content_dict = {
"content": json_strc,
"footnotes": self.footnotes
}
def write_json(self):
with codecs.open(self.output_path, 'w', encoding='utf-8') as f:
json.dump(self.content_dict, f, ensure_ascii=False)
def log(self, message, logging_level=20):
self.logger.log(msg=message, level=logging_level)
def conversion(self, logging_format, filemode='w'):
self.configure_file_logger(__name__, logging_format=logging_format, filemode=filemode)
self.log('Beginning of conversion from .docx to .json.')
if self.convert:
self.convert_doc_to_html()
self.check_output_directory()
self.read_html()
self.process_html()
self.convert_to_json()
self.write_json()
self.log(f'End of the conversion to LawCarta format. Check {self.output_path}.')
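# Example invocation (hypothetical input path):
#   python src/book.py -f data/sample.docx --convert --recreate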
if __name__ == '__main__':
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
book = Book(file_path="", recreate=True)
book.parse_args()
book.conversion(logging_format)
print('Script has finished.')

81
src/consumer.py Normal file

@@ -0,0 +1,81 @@
import json
import os
from threading import Thread, active_count
import pika
from book import Book
# from src.book import Book
class Consumer:
def __init__(self, url):
self._connection = None
self._channel = None
self._closing = False
self._url = url
def run(self):
pass
def close_connection(self):
pass
def convert_book(file_path, output=None, recreate=True, train_mode=False, convert=False, model_location=None):
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
file_path = os.path.join(folder_path, file_path)
book = Book(file_path, output, recreate, train_mode, convert, model_location)
book.conversion(logging_format=logging_format)
print('Book has been processed.')
def callback(ch, method, properties, body):
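# The body must be a JSON object whose keys match convert_book's parameters, e.g. {"file_path": "books/sample.docx", "convert": true} (illustrative payload).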
print(f'Message: {body}.')
try:
data = json.loads(body)
thread = Thread(target=convert_book, kwargs=data)
thread.start()
print(f'Active threads: {active_count()}.')
except Exception as e:
print(e)
finally:
# thread.join()
print('Waiting for the message...')
if __name__ == '__main__':
folder_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
config_path = os.path.join(folder_path, "test_config/config.json")
with open(config_path, "r") as f:
conf_param = json.load(f)
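# config.json must provide at least the "username", "password", "host" and "queue" keys used below.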
# credentials = pika.PlainCredentials('admin', 'admin')
# parameters = pika.ConnectionParameters('10.40.10.173', credentials=credentials)
credentials = pika.PlainCredentials(username=conf_param['username'], password=conf_param['password'])
parameters = pika.ConnectionParameters(host=conf_param['host'], credentials=credentials)
connection = pika.BlockingConnection(parameters)
channel = connection.channel()
try:
channel.queue_declare(queue=conf_param['queue'], passive=True)
except Exception as e:
print(e)
raise
channel.basic_consume(queue=conf_param['queue'],
auto_ack=True,
on_message_callback=callback)
print('Waiting for messages...')
channel.start_consuming()