# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import division
from __future__ import print_function
import os
import re
import zipfile
from os import makedirs
from random import shuffle
import numpy as np
import tensorflow as tf
from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.matchlstm_ansptr import MatchLSTMAnswerPointer
from nlp_architect import LIBRARY_OUT
from nlp_architect.utils.generic import license_prompt
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.mrc_utils import (
create_squad_training, max_values_squad, get_data_array_squad)


class MachineComprehensionApi(AbstractApi):
"""
Machine Comprehension API
"""
dir = str(LIBRARY_OUT / 'mrc-pretrained')
data_path = os.path.join(dir, 'mrc_data', 'data')
data_dir = os.path.join(dir, 'mrc_data')
model_dir = os.path.join(dir, 'mrc_trained_model')
model_path = os.path.join(dir, 'mrc_trained_model', 'trained_model')

    def __init__(self, prompt=True):
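        """
        Args:
            prompt (bool): if True, show a license prompt before downloading
                the dataset and the pretrained model.
        """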
self.vocab_dict = None
self.vocab_rev = None
self.model = None
self.dev = None
self.sess = None
self.prompt = prompt
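        # Model hyperparameters; inference_only=True configures the model
        # for answering questions rather than training.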
self.params_dict = {'batch_size': 1,
'hidden_size': 150,
'max_para': 300,
'epoch_no': 15,
'inference_only': True}
self.file_name_dict = {'train_para_ids': 'train.ids.context',
'train_ques_ids': 'train.ids.question',
'train_answer': 'train.span',
'val_para_ids': 'dev.ids.context',
'val_ques_ids': 'dev.ids.question',
'val_ans': 'dev.span',
'vocab_file': 'vocab.dat',
'embedding': 'glove.trimmed.300.npz'}

    def download_model(self):
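        """
        Check that every required data and model file exists under
        ``data_path``; if any is missing, download the preprocessed SQuAD
        data and pretrained model archives and unzip them in place.
        """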
# Validate contents of data_path folder:
data_path = self.data_path
download = False
for file_name in self.file_name_dict.values():
if not os.path.exists(os.path.join(data_path, file_name)):
                download = True
                print("The following required file is missing:", file_name)
        if download:
            if self.prompt:
license_prompt('mrc_data',
'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
'/mrc_data.zip',
self.data_dir)
license_prompt('mrc_model',
'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
'/mrc_model.zip',
self.model_dir)
data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
makedirs(self.data_dir, exist_ok=True)
makedirs(self.model_dir, exist_ok=True)
download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
'/models/mrc/',
'mrc_data.zip', data_zipfile)
download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
'/models/mrc/',
'mrc_model.zip', model_zipfile)
with zipfile.ZipFile(data_zipfile) as data_zip_ref:
data_zip_ref.extractall(self.data_dir)
with zipfile.ZipFile(model_zipfile) as model_zip_ref:
model_zip_ref.extractall(self.model_dir)

    def load_model(self):
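        """
        Download the data and pretrained model if needed, build the
        vocabulary and load the GloVe embeddings, construct the
        MatchLSTM/Answer Pointer model, and restore the TensorFlow session
        from the stored checkpoint (or initialize fresh weights if none is
        found).
        """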
select_device = 'GPU'
restore_model = True
        # Download data and pretrained model if any required file is missing
self.download_model()
data_path = self.data_path
        # Paths for preprocessed files
        path_gen = data_path  # the data lives in mrc_data/data, not mrc_data
train_para_ids = os.path.join(path_gen, self.file_name_dict['train_para_ids'])
train_ques_ids = os.path.join(path_gen, self.file_name_dict['train_ques_ids'])
answer_file = os.path.join(path_gen, self.file_name_dict['train_answer'])
val_paras_ids = os.path.join(path_gen, self.file_name_dict['val_para_ids'])
val_ques_ids = os.path.join(path_gen, self.file_name_dict['val_ques_ids'])
val_ans_file = os.path.join(path_gen, self.file_name_dict['val_ans'])
vocab_file = os.path.join(path_gen, self.file_name_dict['vocab_file'])
        model_path = self.model_path
        # Create the model directory if it doesn't exist
        makedirs(model_path, exist_ok=True)
# Create lists for train and validation sets
data_train = create_squad_training(train_para_ids, train_ques_ids, answer_file)
data_dev = create_squad_training(val_paras_ids, val_ques_ids, val_ans_file)
with open(vocab_file, encoding='UTF-8') as fp:
vocab_list = fp.readlines()
        # Build forward (id -> word) and reverse (word -> id) vocabularies
        self.vocab_dict = {}
        self.vocab_rev = {}
        for idx, word in enumerate(vocab_list):
            self.vocab_dict[idx] = word.strip()
            self.vocab_rev[word.strip()] = idx
self.params_dict['train_set_size'] = len(data_train)
# Combine train and dev data
data_total = data_train + data_dev
# obtain maximum length of question
_, max_question = max_values_squad(data_total)
self.params_dict['max_question'] = max_question
# Load embeddings for vocab
print('Loading Embeddings')
        embedding_npz = np.load(os.path.join(path_gen, self.file_name_dict['embedding']))
        embeddings = embedding_npz['glove']
# Create train and dev sets
print("Creating training and development sets")
self.dev = get_data_array_squad(self.params_dict, data_dev, set_val='val')
# Define Reading Comprehension model
with tf.device('/device:' + select_device + ':0'):
self.model = MatchLSTMAnswerPointer(self.params_dict, embeddings)
# Define Configs for training
run_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Create the TensorFlow session
self.sess = tf.Session(config=run_config)
init = tf.global_variables_initializer()
# Model Saver
# pylint: disable=no-member
model_saver = tf.train.Saver()
model_ckpt = tf.train.get_checkpoint_state(model_path)
idx_path = model_ckpt.model_checkpoint_path + ".index" if model_ckpt else ""
# Initialize with random or pretrained weights
# pylint: disable=no-member
if model_ckpt and restore_model and (tf.gfile.Exists(
model_ckpt.model_checkpoint_path) or tf.gfile.Exists(idx_path)):
model_saver.restore(self.sess, model_ckpt.model_checkpoint_path)
print("Loading from previously stored session")
else:
self.sess.run(init)
shuffle(self.dev)

    @staticmethod
def paragraphs(valid, vocab_tuple, num_examples):
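        """
        Return the first ``num_examples`` paragraphs of ``valid`` as
        detokenized strings: padding ids (0) are dropped, ids are mapped
        back to words, and spaces before punctuation are removed.
        """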
paragraphs = []
vocab_forward = vocab_tuple[0]
for idx in range(num_examples):
test_paragraph = [vocab_forward[ele] for ele in valid[idx][0] if ele != 0]
para_string = " ".join(map(str, test_paragraph))
            # Remove the space inserted before trailing punctuation
            paragraphs.append(re.sub(r'\s([?.!,"](?:\s|$))', r'\1', para_string))
return paragraphs

    @staticmethod
def questions(valid, vocab_tuple, num_examples):
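        """
        Return the first ``num_examples`` questions of ``valid`` as
        detokenized strings, analogous to ``paragraphs``.
        """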
vocab_forward = vocab_tuple[0]
questions = []
for idx in range(num_examples):
test_question = [vocab_forward[ele] for ele in valid[idx][1] if ele != 0]
ques_string = " ".join(map(str, test_question))
            # Remove the space inserted before punctuation
            questions.append(re.sub(r'\s([?.!",])', r'\1', ques_string))
return questions

    def inference(self, doc):
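        """
        Answer ``doc['question']`` against the dev-set paragraph selected
        by ``doc['paragraph']`` using the loaded model.
        """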
        print("Begin Inference Mode")
        question = doc['question']
        paragraph_id = doc['paragraph']
return self.model.inference_mode(self.sess, self.dev, [self.vocab_dict, self.vocab_rev],
dynamic_question_mode=True, num_examples=1, dropout=1.0,
dynamic_usr_question=question,
dynamic_question_index=paragraph_id)

    def get_paragraphs(self):
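        """
        Return a dict with the first 5 detokenized paragraphs and questions
        of the (shuffled) dev set.
        """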
ret = {'paragraphs': self.paragraphs(self.dev, [self.vocab_dict, self.vocab_rev],
num_examples=5),
'questions': self.questions(self.dev, [self.vocab_dict, self.vocab_rev],
num_examples=5)}
return ret
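

# A minimal usage sketch (illustrative only; the question text and the
# paragraph index below are placeholders, and the return value is whatever
# MatchLSTMAnswerPointer.inference_mode produces):
if __name__ == '__main__':
    mrc_api = MachineComprehensionApi(prompt=False)
    mrc_api.load_model()
    # Inspect a few dev-set paragraphs the model can answer against
    print(mrc_api.get_paragraphs()['paragraphs'])
    # Ask a question about dev-set paragraph 0
    mrc_api.inference({'question': 'What is this paragraph about?',
                       'paragraph': 0})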