Module ktrain.tabular.models

Expand source code
from ..imports import *
from .. import utils as U
from ..models import bn_drop_lin

# Identifier of the only tabular architecture registered in this module.
MLP = 'mlp'

# Registry mapping each model identifier to the human-readable description
# that the print_tabular_* helpers display.
TABULAR_MODELS = {
    MLP: "a configurable multilayer perceptron with categorical variable embeddings [https://arxiv.org/abs/1604.06737]",
}

def print_tabular_classifiers():
    """
    ```
    Print one "<name>: <description>" line for every entry in TABULAR_MODELS.
    ```
    """
    for model_name in TABULAR_MODELS:
        description = TABULAR_MODELS[model_name]
        print("%s: %s" % (model_name, description))

def print_tabular_regression_models():
    """
    ```
    Print one "<name>: <description>" line for every entry in TABULAR_MODELS.
    The same registry is used for classification and regression.
    ```
    """
    for model_name in TABULAR_MODELS:
        description = TABULAR_MODELS[model_name]
        print("%s: %s" % (model_name, description))



def _tabular_model(name, train_data, multilabel=None, is_regression=False, metrics=['accuracy'], 
                   hidden_layers=[1000, 500], hidden_dropouts=[0., 0.5], bn=False, verbose=1):
    """
    ```
    Build and return a classification or regression model for tabular data

    Args:
        name (string): currently accepts 'mlp' for multilayer perceptron.
                       NOTE: the value is not inspected here; the same MLP is always built.
        train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
                            Ignored (forced to False) when is_regression=True.
        is_regression(bool): If True, will build a regression model, else classification model.
        metrics(list): list of metrics to use. For regression, None or ['accuracy'] is replaced by ['mae'].
        hidden_layers(list): number of units in each hidden layer of NN
        hidden_dropouts(list): Dropout values after each hidden layer of NN.
                               Must have the same length as hidden_layers. The list is not modified.
        bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
        verbose (boolean): verbosity of output
    Return:
        model (Model): A compiled Keras Model instance
    Raises:
        Exception: if train_data is not recognized by U.is_tabular_from_data
        ValueError: if len(hidden_layers) != len(hidden_dropouts), or if train_data
                    has neither categorical nor continuous columns
    ```
    """

    # check arguments
    if not U.is_tabular_from_data(train_data):
        err ="""
            Please pass training data in the form of data returned from a ktrain tabular_from* function.
            """
        raise Exception(err)
    if len(hidden_layers) != len(hidden_dropouts): raise ValueError('len(hidden_layers) must equal len(hidden_dropouts)')

    # Reformat dropouts for ease of construction.
    # hidden_dropouts[i] is the dropout *after* hidden layer i, whereas each bn_drop_lin call
    # takes the dropout to use for the block it builds (presumably applied ahead of its dense
    # layer, as the helper's name suggests -- confirm in ..models). So the values are shifted
    # by one: the first hidden block gets 0, each later block gets the previous layer's value,
    # and the last value goes to the output block.
    # Work on a copy so that neither the caller's list nor the shared default argument is
    # mutated, and so that any number of hidden layers (not just two) is handled.
    dropouts = list(hidden_dropouts)
    output_dropout = dropouts[-1] if dropouts else 0.
    layer_dropouts = [0.] + dropouts[:-1]

    # set model configuration values
    if is_regression: # regression
        # 'accuracy' is meaningless for regression, so the classification default is swapped out
        if metrics is None or metrics==['accuracy']: metrics=['mae']
        num_classes = 1
        multilabel = False
        loss_func = 'mse'
        activation = 'linear'
    else:             # classification
        if metrics is None: metrics = ['accuracy']
        # set number of classes and multilabel flag
        num_classes = U.nclasses_from_data(train_data)

        # determine multilabel
        if multilabel is None:
            multilabel = U.is_multilabel(train_data)
        U.vprint("Is Multi-Label? %s" % (multilabel), verbose=verbose)

        # set loss and activations
        if multilabel:
            loss_func = 'binary_crossentropy'
            activation = 'sigmoid'
        else:
            loss_func = 'categorical_crossentropy'
            activation = 'softmax'

    # construct model
    ilayers = []
    n_cat = len(train_data.cat_columns)
    n_cont = len(train_data.cont_columns)
    if n_cat == 0 and n_cont == 0: raise ValueError('There are zero continuous and categorical variables.')

    # categorical inputs and embeddings: one scalar input and one embedding per categorical column
    if n_cat > 0:
        emblayers = []
        # Embedding vocabulary size per column: category codes are shifted up by one
        # (pandas uses code -1 for missing values), then +1 turns the largest index into a size.
        num_uniques = [max(c.cat.codes.values+1)+1 for _, c in train_data.df[train_data.cat_columns].items()]
        for n_unique in num_uniques:
            inp = keras.layers.Input(shape=(1,))
            ilayers.append(inp)
            # embedding width heuristic: half the vocabulary size (plus one), capped at 50
            emb_size = min(50, (n_unique//2)+1)
            emb = keras.layers.Embedding(n_unique, emb_size, input_length=1)(inp)
            emblayers.append(emb)
        x = keras.layers.concatenate(emblayers) if len(emblayers) > 1 else emblayers[0]
        x = keras.layers.Flatten()(x)

    # continuous inputs: a single input carrying all continuous columns
    if n_cont > 0:
        x_cont = keras.layers.Input(shape=(n_cont,))
        ilayers.append(x_cont)
        x = keras.layers.concatenate([x, x_cont]) if n_cat > 0 else x_cont

    # hidden layers
    output = x
    for n_out, p in zip(hidden_layers, layer_dropouts):
        output = bn_drop_lin(output, n_out, bn=bn, p=p, actn='relu')

    # output layer
    output = bn_drop_lin(output, num_classes , bn=bn, p=output_dropout, actn=activation)

    # construct and compile model
    model = keras.Model(inputs=ilayers, outputs=output)
    model.compile(optimizer=U.DEFAULT_OPT, loss=loss_func, metrics=metrics)
    U.vprint('done.', verbose=verbose)
    return model



def tabular_classifier(name, train_data, multilabel=None, metrics=['accuracy'], 
                       hidden_layers=[1000, 500], hidden_dropouts=[0., 0.5], bn=False, verbose=1):
    """
    ```
    Build and return a classification model for tabular data

    Args:
        name (string): currently accepts 'mlp' for multilayer perceptron
        train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
        metrics(list): list of metrics to use
        hidden_layers(list): number of units in each hidden layer of NN
        hidden_dropouts(list): Dropout values after each hidden layer of NN.
                               Must have the same length as hidden_layers.
        bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    """

    # Hand copies of the list arguments to the builder: the defaults in the signature are
    # shared across calls, so any in-place modification downstream would otherwise leak
    # into later calls (and into lists owned by the caller).
    return _tabular_model(name, train_data, multilabel=multilabel, metrics=metrics,
                          hidden_layers=list(hidden_layers), hidden_dropouts=list(hidden_dropouts), bn=bn,
                          verbose=verbose, is_regression=False)


def tabular_regression_model(name, train_data,  metrics=['mae'], 
                             hidden_layers=[1000, 500], hidden_dropouts=[0., 0.5], bn=False, verbose=1):
    """
    ```
    Build and return a regression model for tabular data

    Args:
        name (string): currently accepts 'mlp' for multilayer perceptron
        train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
        metrics(list): list of metrics to use
        hidden_layers(list): number of units in each hidden layer of NN
        hidden_dropouts(list): Dropout values after each hidden layer of NN.
                               Must have the same length as hidden_layers.
        bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    """

    # Hand copies of the list arguments to the builder: the defaults in the signature are
    # shared across calls, so any in-place modification downstream would otherwise leak
    # into later calls (and into lists owned by the caller).
    return _tabular_model(name, train_data, multilabel=None, metrics=metrics, 
                          hidden_layers=list(hidden_layers), hidden_dropouts=list(hidden_dropouts), bn=bn,
                          verbose=verbose, is_regression=True)

Functions

def print_tabular_classifiers()
Expand source code
def print_tabular_classifiers():
    """
    ```
    Print one "<name>: <description>" line for every entry in TABULAR_MODELS.
    ```
    """
    for model_name in TABULAR_MODELS:
        description = TABULAR_MODELS[model_name]
        print("%s: %s" % (model_name, description))
def print_tabular_regression_models()
Expand source code
def print_tabular_regression_models():
    """
    ```
    Print one "<name>: <description>" line for every entry in TABULAR_MODELS.
    The same registry is used for classification and regression.
    ```
    """
    for model_name in TABULAR_MODELS:
        description = TABULAR_MODELS[model_name]
        print("%s: %s" % (model_name, description))
def tabular_classifier(name, train_data, multilabel=None, metrics=['accuracy'], hidden_layers=[1000, 500], hidden_dropouts=[0.0, 0.5], bn=False, verbose=1)
Build and return a classification model for tabular data

Args:
    name (string): currently accepts 'mlp' for multilayer perceptron
    train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
    multilabel (bool):  If True, multilabel model will be returned.
                        If false, binary/multiclass model will be returned.
                        If None, multilabel will be inferred from data.
    metrics(list): list of metrics to use
    hidden_layers(list): number of units in each hidden layer of NN
    hidden_dropouts(list): Dropout values after each hidden layer of NN
    bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
    verbose (boolean): verbosity of output
Return:
    model (Model): A Keras Model instance
Expand source code
def tabular_classifier(name, train_data, multilabel=None, metrics=['accuracy'], 
                       hidden_layers=[1000, 500], hidden_dropouts=[0., 0.5], bn=False, verbose=1):
    """
    ```
    Build and return a classification model for tabular data

    Args:
        name (string): currently accepts 'mlp' for multilayer perceptron
        train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
        multilabel (bool):  If True, multilabel model will be returned.
                            If false, binary/multiclass model will be returned.
                            If None, multilabel will be inferred from data.
        metrics(list): list of metrics to use
        hidden_layers(list): number of units in each hidden layer of NN
        hidden_dropouts(list): Dropout values after each hidden layer of NN.
                               Must have the same length as hidden_layers.
        bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    """

    # Hand copies of the list arguments to the builder: the defaults in the signature are
    # shared across calls, so any in-place modification downstream would otherwise leak
    # into later calls (and into lists owned by the caller).
    return _tabular_model(name, train_data, multilabel=multilabel, metrics=metrics,
                          hidden_layers=list(hidden_layers), hidden_dropouts=list(hidden_dropouts), bn=bn,
                          verbose=verbose, is_regression=False)
def tabular_regression_model(name, train_data, metrics=['mae'], hidden_layers=[1000, 500], hidden_dropouts=[0.0, 0.5], bn=False, verbose=1)
Build and return a regression model for tabular data

Args:
    name (string): currently accepts 'mlp' for multilayer perceptron
    train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
    metrics(list): list of metrics to use
    hidden_layers(list): number of units in each hidden layer of NN
    hidden_dropouts(list): Dropout values after each hidden layer of NN
    bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
    verbose (boolean): verbosity of output
Return:
    model (Model): A Keras Model instance
Expand source code
def tabular_regression_model(name, train_data,  metrics=['mae'], 
                             hidden_layers=[1000, 500], hidden_dropouts=[0., 0.5], bn=False, verbose=1):
    """
    ```
    Build and return a regression model for tabular data

    Args:
        name (string): currently accepts 'mlp' for multilayer perceptron
        train_data (TabularDataset): TabularDataset instance returned from one of the tabular_from_* functions
        metrics(list): list of metrics to use
        hidden_layers(list): number of units in each hidden layer of NN
        hidden_dropouts(list): Dropout values after each hidden layer of NN.
                               Must have the same length as hidden_layers.
        bn(bool): If True, BatchNormalization will be used before each fully-connected layer in NN
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    ```
    """

    # Hand copies of the list arguments to the builder: the defaults in the signature are
    # shared across calls, so any in-place modification downstream would otherwise leak
    # into later calls (and into lists owned by the caller).
    return _tabular_model(name, train_data, multilabel=None, metrics=metrics, 
                          hidden_layers=list(hidden_layers), hidden_dropouts=list(hidden_dropouts), bn=bn,
                          verbose=verbose, is_regression=True)