Source code for nlp_architect.api.intent_extraction_api
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import numpy as np
import pickle
from os import makedirs, path, sys
from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.intent_extraction import MultiTaskIntentModel, Seq2SeqIntentModel
from nlp_architect import LIBRARY_OUT
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.text import SpacyInstance, bio_to_spans
class IntentExtractionApi(AbstractApi):
    """Inference API around the pre-trained intent extraction model.

    On construction the pre-trained model files are downloaded into
    ``model_dir`` if they are not already present. ``load_model`` reads the
    model metadata and weights; ``inference`` tokenizes raw text, runs the
    model and returns the tagged spans in a display-ready dictionary.
    """

    # Local directory and file paths of the pre-trained model artifacts.
    model_dir = str(LIBRARY_OUT / 'intent-pretrained')
    pretrained_model_info = path.join(model_dir, 'model_info.dat')
    pretrained_model = path.join(model_dir, 'model.h5')

    def __init__(self, prompt=True):
        """Download the pre-trained files if needed and set up the tokenizer.

        Args:
            prompt: when ``True``, ask the user for confirmation before
                downloading missing model files.
        """
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    def process_text(self, text):
        """Collapse whitespace runs in ``text`` and tokenize the result.

        Args:
            text: raw input string.

        Returns:
            Whatever ``self.nlp.tokenize`` returns for the normalized string.
        """
        # str.split() with no arguments already drops leading/trailing
        # whitespace, so no separate strip() is needed.
        input_text = ' '.join(text.split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        """Ask the user on stdin to confirm the download.

        Returns:
            ``True`` if the (case-insensitive, stripped) answer is ``yes`` or
            ``y``, ``False`` otherwise.
        """
        response = input('\nTo download \'{}\', please enter YES: '.
                         format('intent_extraction'))
        res = response.lower().strip()
        if res in ('yes', 'y'):
            # Fixed: the message used to announce 'ner' instead of the model
            # that is actually being downloaded.
            print('Downloading {}...'.format('intent_extraction'))
            return True
        print('Download declined. Response received {} != YES|Y. '.format(res))
        return False

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Download the pre-trained intent extraction model files if missing.

        Nothing is done when both the model file and the model info file
        already exist. Otherwise the license notice is printed, the target
        directory is created and both files are downloaded.

        Args:
            prompt: when ``True``, ask for confirmation first; a declined
                prompt terminates the process via ``sys.exit(0)``.
        """
        model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if model_exists and model_info_exists:
            return

        print('The pre-trained models to be downloaded for the intent extraction dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True and not IntentExtractionApi._prompt():
            sys.exit(0)

        base_url = ('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                    '/models/intent/')
        download_unlicensed_file(base_url, 'model_info.dat',
                                 IntentExtractionApi.pretrained_model_info)
        download_unlicensed_file(base_url, 'model.h5',
                                 IntentExtractionApi.pretrained_model)
        print('Done.')

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        """Package tokens and predicted tags into a display dictionary.

        Args:
            text_str: sequence of token strings (joined with single spaces to
                form ``doc_text``).
            predictions: per-token tags, passed to ``bio_to_spans`` together
                with ``text_str``.
            intent_type: value stored under the ``title`` key (may be
                ``None``).

        Returns:
            ``{'doc': {...}, 'type': 'high_level'}`` where the inner dict has
            the keys ``annotation_set``, ``doc_text``, ``spans`` and
            ``title``.
        """
        spans = [
            {'start': start, 'end': end, 'type': tag}
            for start, end, tag in bio_to_spans(text_str, predictions)
        ]
        ret = {
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the output is deterministic (a plain set has arbitrary order).
            'annotation_set': list(dict.fromkeys(span['type'] for span in spans)),
            'doc_text': ' '.join(text_str),
            'spans': spans,
            'title': intent_type,
        }
        return {'doc': ret, 'type': 'high_level'}

    def vectorize(self, doc, vocab, char_vocab=None):
        """Convert a tokenized sentence into model input arrays.

        Args:
            doc: sequence of token strings.
            vocab: mapping from lower-cased word to integer id.
            char_vocab: optional mapping from character to integer id.

        Returns:
            A ``(1, len(doc))`` array of word ids when ``char_vocab`` is
            ``None``; otherwise the list ``[words, sentence_chars]`` where
            ``sentence_chars`` is the output of ``pad_sentences`` (called with
            ``self.model.word_length``) with a leading axis added.
        """
        # Fallback id for words/characters missing from the vocabulary
        # (presumably the model's out-of-vocabulary index -- TODO confirm).
        unknown_id = 1

        lowered = [w.lower() for w in doc]
        words = np.asarray(
            [vocab[w] if w in vocab else unknown_id for w in lowered]
        ).reshape(1, -1)
        if char_vocab is None:
            return words

        # Characters are looked up as-is (not lower-cased), unlike words.
        sentence_chars = [
            [char_vocab[c] if c in char_vocab else unknown_id for c in w]
            for w in doc
        ]
        sentence_chars = np.expand_dims(
            pad_sentences(sentence_chars, self.model.word_length), axis=0)
        return [words, sentence_chars]

    def inference(self, doc):
        """Run the model on raw text and return display-ready results.

        The model is loaded on first use if ``load_model`` has not been
        called yet. Detected intent (multi-task model only) and the per-token
        tags are also printed to stdout.

        Args:
            doc: raw input string.

        Returns:
            The dictionary produced by ``display_results``.
        """
        if self.model is None:
            # Avoid an opaque AttributeError on None when the caller forgot
            # to call load_model() first.
            self.load_model()

        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == 'mtl':
            doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            # .item() extracts the single element explicitly; int() on a 1-D
            # array is deprecated in recent NumPy releases.
            intent = int(intent.argmax(1).item())
            intent_type = self.intent_vocab.get(intent, None)
            print('Detected intent type: {}'.format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print('{}\t{}\t'.format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        """Load model metadata and weights from the pre-trained files.

        Reads the pickled model info, stores the vocabularies on the
        instance (tag and intent vocabularies are inverted to map id to
        label), instantiates ``MultiTaskIntentModel`` when the stored type is
        ``'mtl'`` and ``Seq2SeqIntentModel`` otherwise, and loads the weights
        from ``pretrained_model``.
        """
        # NOTE(security): pickle.load can execute arbitrary code. The file is
        # fetched by _download_pretrained_model; only use trusted copies.
        with open(IntentExtractionApi.pretrained_model_info, 'rb') as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info['type']
        self.word_vocab = model_info['word_vocab']
        self.tags_vocab = {v: k for k, v in model_info['tags_vocab'].items()}
        if self.model_type == 'mtl':
            self.char_vocab = model_info['char_vocab']
            self.intent_vocab = {v: k for k, v in model_info['intent_vocab'].items()}
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model