from ..imports import *
from .. import utils as U
from . import preprocessor as tpp
NBSVM = 'nbsvm'
FASTTEXT = 'fasttext'
LOGREG = 'logreg'
BIGRU = 'bigru'
STANDARD_GRU = 'standard_gru'
BERT = 'bert'
DISTILBERT = tpp.DISTILBERT
HUGGINGFACE_MODELS = [DISTILBERT]
LINREG = 'linreg'
TEXT_CLASSIFIERS = {
FASTTEXT: "a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]",
LOGREG: "logistic regression using a trainable Embedding layer",
NBSVM: "NBSVM model [http://www.aclweb.org/anthology/P12-2018]",
BIGRU: 'Bidirectional GRU with pretrained fasttext word vectors [https://fasttext.cc/docs/en/crawl-vectors.html]',
STANDARD_GRU: 'simple 2-layer GRU with randomly initialized embeddings',
BERT: 'Bidirectional Encoder Representations from Transformers (BERT) from keras_bert [https://arxiv.org/abs/1810.04805]',
DISTILBERT: 'distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]',
}
TEXT_REGRESSION_MODELS = {
FASTTEXT: "a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]",
LINREG: "linear text regression using a trainable Embedding layer",
BIGRU: 'Bidirectional GRU with pretrained English word vectors [https://arxiv.org/abs/1712.09405]',
STANDARD_GRU: 'simple 2-layer GRU with randomly initialized embeddings',
BERT: 'Bidirectional Encoder Representations from Transformers (BERT) - keras_bert implementation [https://arxiv.org/abs/1810.04805]',
DISTILBERT: 'distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]',
}
def print_text_classifiers():
for k,v in TEXT_CLASSIFIERS.items():
print("%s: %s" % (k,v))
def print_text_regression_models():
for k,v in TEXT_REGRESSION_MODELS.items():
print("%s: %s" % (k,v))
def calc_pr(y_i, x, y, b):
    """Compute smoothed per-feature occurrence proportions over documents where (y==y_i)==b."""
    idx = np.argwhere((y==y_i)==b)
    ct = x[idx[:,0]].sum(0)+1    # per-feature counts over the selected documents (+1 smoothing)
    tot = ((y==y_i)==b).sum()+1  # number of selected documents (+1 smoothing)
    return ct/tot

def calc_r(y_i, x, y):
    """Compute Naive Bayes log-count ratios for class y_i [Wang and Manning, 2012]."""
    return np.log(calc_pr(y_i, x, y, True) / calc_pr(y_i, x, y, False))
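# A tiny worked example (sketch only, left as comments so it does not run on import):
# for two one-hot documents with binary labels, the log-count ratio is positive for
# the feature that co-occurs with the positive class and negative otherwise:
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.array([[1, 0], [0, 1]]))  # 2 docs x 2 features
#   y = np.array([1, 0])
#   calc_r(1, X, y).A1                          # -> approx [ 0.693, -0.693]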
def _text_model(name, train_data, preproc=None, multilabel=None, classification=True, metrics=['accuracy'], verbose=1):
"""
```
Build and return a text classification or text regression model.
Args:
        name (string): one of:
                          - 'fasttext' for FastText model
                          - 'nbsvm' for NBSVM model
                          - 'logreg' for logistic regression ('linreg' for text regression)
                          - 'bigru' for Bidirectional GRU with pretrained word vectors
                          - 'standard_gru' for simple 2-layer GRU with randomly initialized embeddings
                          - 'bert' for BERT Text Classification
                          - 'distilbert' for Hugging Face DistilBert model
train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
returned from one of the texts_from_* functions
preproc: a ktrain.text.TextPreprocessor instance.
As of v0.8.0, this is required.
        multilabel (bool): If True, a multilabel model will be returned.
                           If False, a binary/multiclass model will be returned.
                           If None, multilabel status will be inferred from the data.
        classification(bool): If True, a text classification model will be built.
                              Otherwise, a text regression model will be returned.
metrics(list): list of metrics to use
verbose (boolean): verbosity of output
Return:
model (Model): A Keras Model instance
```
"""
# check arguments
if not isinstance(train_data, tuple) and not U.is_huggingface_from_data(train_data):
err ="""
Please pass training data in the form of a tuple of numpy.ndarrays
or data returned from a ktrain texts_from* function.
"""
raise Exception(err)
    if not isinstance(preproc, tpp.TextPreprocessor):
        msg = 'The preproc argument is required. It should be an instance of TextPreprocessor, '
        msg += 'which is the third return value from texts_from_folder, texts_from_csv, etc.'
        raise ValueError(msg)
if name == BIGRU and preproc.ngram_count() != 1:
raise ValueError('Data should be processed with ngram_range=1 for bigru model.')
is_bert = U.bert_data_tuple(train_data)
if (is_bert and name != BERT) or (not is_bert and name == BERT):
raise ValueError("if '%s' is selected model, then preprocess_mode='%s' should be used and vice versa" % (BERT, BERT))
is_huggingface = U.is_huggingface(data=train_data)
if (is_huggingface and name not in HUGGINGFACE_MODELS) or (not is_huggingface and name in HUGGINGFACE_MODELS):
raise ValueError('you are using a Hugging Face transformer model but did not preprocess as such (or vice versa)')
if is_huggingface and preproc.name != name:
raise ValueError('you preprocessed for %s but want to build a %s model' % (preproc.name, name))
if not classification: # regression
if metrics is None or metrics==['accuracy']: metrics=['mae']
num_classes = 1
multilabel = False
loss_func = 'mse'
activation = None
max_features = preproc.max_features
features = None
maxlen = U.shape_from_data(train_data)[1]
U.vprint('maxlen is %s' % (maxlen), verbose=verbose)
else: # classification
if metrics is None: metrics = ['accuracy']
# set number of classes and multilabel flag
num_classes = U.nclasses_from_data(train_data)
# determine multilabel
if multilabel is None:
multilabel = U.is_multilabel(train_data)
if multilabel and name in [NBSVM, LOGREG]:
            warnings.warn('switching to fasttext model, as the data suggests '
                          'multilabel classification.')
name = FASTTEXT
U.vprint("Is Multi-Label? %s" % (multilabel), verbose=verbose)
# set loss and activations
loss_func = 'categorical_crossentropy'
activation = 'softmax'
if multilabel:
loss_func = 'binary_crossentropy'
activation = 'sigmoid'
# determine number of classes, maxlen, and max_features
max_features = preproc.max_features if preproc is not None else None
features = set()
if not is_bert and not is_huggingface:
U.vprint('compiling word ID features...', verbose=verbose)
x_train = train_data[0]
y_train = train_data[1]
        if isinstance(y_train[0], int): raise ValueError('train labels should be one-hot encoded, not sparse integer labels')
for x in x_train:
features.update(x)
if max_features is None:
max_features = max(features)+1
U.vprint('max_features is %s' % (max_features), verbose=verbose)
maxlen = U.shape_from_data(train_data)[1]
U.vprint('maxlen is %s' % (maxlen), verbose=verbose)
# return appropriate model
if name in [LOGREG, LINREG]:
model = _build_logreg(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose)
elif name==FASTTEXT:
model = _build_fasttext(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose)
elif name==STANDARD_GRU:
model = _build_standard_gru(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose)
elif name==NBSVM:
model = _build_nbsvm(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose,
train_data=train_data)
elif name==BIGRU:
(tokenizer, tok_dct) = preproc.get_preprocessor()
model = _build_bigru(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose,
tokenizer=tokenizer,
preproc=preproc)
elif name == BERT:
model = _build_bert(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose,
preproc=preproc)
elif name in HUGGINGFACE_MODELS:
model = _build_transformer(num_classes,
maxlen,
max_features,
features,
loss_func=loss_func,
activation=activation, metrics=metrics, verbose=verbose,
preproc=preproc)
else:
        raise ValueError('invalid model name: %s' % (name))
U.vprint('done.', verbose=verbose)
return model
def _build_logreg(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1):
    # embedding `r` acts as a padding mask: index 0 (padding) maps to 0, all other tokens map to 1
    embedding_matrix = np.ones((max_features, 1))
    embedding_matrix[0] = 0

    # set up the model: a trainable embedding stores one coefficient per token per class;
    # its dot product with the mask sums the coefficients of the non-padding tokens,
    # i.e., logistic (or linear) regression over a bag of words
    inp = keras.layers.Input(shape=(maxlen,))
    r = keras.layers.Embedding(max_features, 1, input_length=maxlen,
                               weights=[embedding_matrix], trainable=False)(inp)
    x = keras.layers.Embedding(max_features, num_classes, input_length=maxlen,
                               embeddings_initializer='glorot_normal')(inp)
    x = keras.layers.dot([x,r], axes=1)
    x = keras.layers.Flatten()(x)
    if activation: x = keras.layers.Activation(activation)(x)
model = keras.Model(inputs=inp, outputs=x)
model.compile(loss=loss_func,
optimizer=U.DEFAULT_OPT,
metrics=metrics)
return model
def _build_bert(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1,
preproc=None):
if preproc is None: raise ValueError('preproc is missing')
lang = preproc.lang
if lang is None: raise ValueError('lang is missing')
config_path = os.path.join(tpp.get_bert_path(lang=lang), 'bert_config.json')
checkpoint_path = os.path.join(tpp.get_bert_path(lang=lang), 'bert_model.ckpt')
import keras_bert
model = keras_bert.load_trained_model_from_checkpoint(
config_path,
checkpoint_path,
training=True,
trainable=True,
seq_len=maxlen)
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=num_classes, activation=activation)(dense)
model = keras.Model(inputs, outputs)
model.compile(loss=loss_func,
optimizer=U.DEFAULT_OPT,
metrics=metrics)
return model
def _build_transformer(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1,
preproc=None):
if not isinstance(preproc, tpp.TransformersPreprocessor):
raise ValueError('preproc must be instance of %s' % (str(tpp.TransformersPreprocessor)))
    if loss_func == 'mse':
        if preproc.get_classes():
            raise Exception('This is supposed to be a regression problem, but preproc.get_classes() is not empty. ' +\
                            'Something went wrong. Please open a GitHub issue.')
    else:
        if not preproc.get_classes():
            raise Exception('This is supposed to be a classification problem, but preproc.get_classes() is empty. ' +\
                            'Something went wrong. Please open a GitHub issue.')
        if len(preproc.get_classes()) != num_classes:
            raise Exception('Number of labels from preproc.get_classes() is not equal to num_classes. ' +\
                            'Something went wrong. Please open a GitHub issue.')
return preproc.get_regression_model(metrics=metrics) if loss_func == 'mse' else preproc.get_classifier(metrics=metrics)
def _build_nbsvm(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1, train_data=None):
if train_data is None: raise ValueError('train_data is required')
x_train = train_data[0]
y_train = train_data[1]
Y = np.array([np.argmax(row) for row in y_train])
num_columns = max(features) + 1
num_rows = len(x_train)
    # set up document-term matrix (assembled in COO form below, then converted to CSR)
U.vprint('building document-term matrix... this may take a few moments...',
verbose=verbose)
r_ids = []
c_ids = []
data = []
    trigger = 10000  # progress-reporting interval
    for row_id, row in enumerate(x_train):
        if row_id % trigger == 0:
            trigger_end = min(row_id+trigger, num_rows)
            U.vprint('rows: %s-%s' % (row_id+1, trigger_end),
                    verbose=verbose)
        tmp_c_ids = [column_id for column_id in row if column_id > 0]
        num = len(tmp_c_ids)
        c_ids.extend(tmp_c_ids)
        r_ids.extend([row_id] * num)
        data.extend([1] * num)
X = csr_matrix( (data,(r_ids,c_ids)), shape=(num_rows, num_columns) )
# compute Naive Bayes log-count ratios
U.vprint('computing log-count ratios...', verbose=verbose)
nbratios = np.stack([calc_r(i, X, Y).A1 for i in range(num_classes)])
nbratios = nbratios.T
    embedding_matrix = np.zeros((num_columns, num_classes))
    embedding_matrix[1:, :] = nbratios[1:, :]  # row 0 corresponds to padding and remains zero
    # set up the model: a frozen embedding holds the NB log-count ratios, while a
    # trainable one-dimensional embedding learns a per-token weight (the discriminative
    # part of NBSVM); their dot product over the sequence yields the per-class scores
    inp = keras.layers.Input(shape=(maxlen,))
    r = keras.layers.Embedding(num_columns, num_classes, input_length=maxlen,
                               weights=[embedding_matrix], trainable=False)(inp)
    x = keras.layers.Embedding(num_columns, 1, input_length=maxlen,
                               embeddings_initializer='glorot_normal')(inp)
    x = keras.layers.dot([r,x], axes=1)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Activation(activation)(x)
model = keras.Model(inputs=inp, outputs=x)
model.compile(loss=loss_func,
optimizer=U.DEFAULT_OPT,
metrics=metrics)
return model
def _build_fasttext(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1):
model = keras.models.Sequential()
model.add(keras.layers.Embedding(max_features, 64, input_length=maxlen))
model.add(keras.layers.SpatialDropout1D(0.25))
model.add(keras.layers.GlobalMaxPool1D())
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_normal'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(num_classes, activation=activation))
model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
return model
def _build_standard_gru(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1):
model = keras.models.Sequential()
model.add(keras.layers.Embedding(max_features, 256, input_length = maxlen))
model.add(keras.layers.GRU(256, dropout=0.9, return_sequences=True))
model.add(keras.layers.GRU(256, dropout=0.9))
model.add(keras.layers.Dense(num_classes, activation=activation))
model.compile(loss=loss_func, optimizer=U.DEFAULT_OPT, metrics=metrics)
return model
def _build_bigru(num_classes,
maxlen, max_features, features,
loss_func='categorical_crossentropy',
activation = 'softmax', metrics=['accuracy'], verbose=1,
tokenizer=None, preproc=None):
if tokenizer is None: raise ValueError('bigru requires valid Tokenizer object')
if preproc is None: raise ValueError('bigru requires valid preproc')
if not hasattr(preproc, 'lang') or preproc.lang is None:
lang = 'en'
else:
lang = preproc.lang
wv_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.%s.300.vec.gz" % (lang.split('-')[0])
    U.vprint('word vectors will be loaded from: %s' % (wv_url), verbose=verbose)
# setup pre-trained word embeddings
embed_size = 300
U.vprint('processing pretrained word vectors...', verbose=verbose)
embeddings_index = tpp.load_wv(wv_path_or_url=wv_url, verbose=verbose)
word_index = tokenizer.word_index
    nb_words = max_features
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
if i >= max_features: continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None: embedding_matrix[i] = embedding_vector
# define model
inp = keras.layers.Input(shape=(maxlen, ))
x = keras.layers.Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = keras.layers.SpatialDropout1D(0.2)(x)
x = keras.layers.Bidirectional(keras.layers.GRU(80, return_sequences=True))(x)
avg_pool = keras.layers.GlobalAveragePooling1D()(x)
max_pool = keras.layers.GlobalMaxPool1D()(x)
conc = keras.layers.concatenate([avg_pool, max_pool])
outp = keras.layers.Dense(num_classes, activation=activation)(conc)
model = keras.Model(inputs=inp, outputs=outp)
model.compile(loss=loss_func,
optimizer=U.DEFAULT_OPT,
metrics=metrics)
return model
def text_classifier(name, train_data, preproc=None, multilabel=None, metrics=['accuracy'], verbose=1):
"""
```
Build and return a text classification model.
Args:
        name (string): one of:
                          - 'fasttext' for FastText model
                          - 'nbsvm' for NBSVM model
                          - 'logreg' for logistic regression using embedding layers
                          - 'bigru' for Bidirectional GRU with pretrained word vectors
                          - 'standard_gru' for simple 2-layer GRU with randomly initialized embeddings
                          - 'bert' for BERT Text Classification
                          - 'distilbert' for Hugging Face DistilBert model
train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train) or ktrain.Dataset instance
returned from one of the texts_from_* functions
preproc: a ktrain.text.TextPreprocessor instance.
As of v0.8.0, this is required.
        multilabel (bool): If True, a multilabel model will be returned.
                           If False, a binary/multiclass model will be returned.
                           If None, multilabel status will be inferred from the data.
metrics(list): metrics to use
verbose (boolean): verbosity of output
Return:
model (Model): A Keras Model instance
```
"""
if name not in TEXT_CLASSIFIERS:
raise ValueError('invalid name for text classification: %s' % (name))
if preproc is not None and not preproc.get_classes():
raise ValueError('preproc.get_classes() is empty, but required for text classification')
return _text_model(name, train_data, preproc=preproc,
multilabel=multilabel, classification=True, metrics=metrics, verbose=verbose)
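# A minimal usage sketch for text_classifier (kept as comments so nothing runs on
# import). It assumes the standard ktrain workflow; 'data/aclImdb' and the class
# names are placeholder values:
#
#   import ktrain
#   from ktrain import text
#   (trn, val, preproc) = text.texts_from_folder('data/aclImdb',
#                                                maxlen=400,
#                                                preprocess_mode='standard',
#                                                classes=['pos', 'neg'])
#   model = text.text_classifier('nbsvm', train_data=trn, preproc=preproc)
#   learner = ktrain.get_learner(model, train_data=trn, val_data=val)
#   learner.fit_onecycle(5e-3, 3)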
def text_regression_model(name, train_data, preproc=None, metrics=['mae'], verbose=1):
"""
```
Build and return a text regression model.
Args:
        name (string): one of:
                          - 'fasttext' for FastText model
                          - 'linreg' for linear regression using embedding layers
                          - 'bigru' for Bidirectional GRU with pretrained word vectors
                          - 'standard_gru' for simple 2-layer GRU with randomly initialized embeddings
                          - 'bert' for BERT Text Regression
                          - 'distilbert' for Hugging Face DistilBert model
train_data (tuple): a tuple of numpy.ndarrays: (x_train, y_train)
preproc: a ktrain.text.TextPreprocessor instance.
As of v0.8.0, this is required.
metrics(list): metrics to use
verbose (boolean): verbosity of output
Return:
model (Model): A Keras Model instance
```
"""
    if name not in TEXT_REGRESSION_MODELS:
        raise ValueError('invalid name for text regression: %s' % (name))
if preproc is not None and preproc.get_classes():
raise ValueError('preproc.get_classes() is supposed to be empty for text regression tasks')
return _text_model(name, train_data, preproc=preproc,
multilabel=False, classification=False, metrics=metrics, verbose=verbose)
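# A minimal usage sketch for text_regression_model (comments only; x_train/y_train
# and x_val/y_val are placeholder arrays of texts and numeric targets):
#
#   from ktrain import text
#   (trn, val, preproc) = text.texts_from_array(x_train, y_train,
#                                               x_test=x_val, y_test=y_val,
#                                               preprocess_mode='standard')
#   model = text.text_regression_model('fasttext', train_data=trn, preproc=preproc)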