Package ktrain

Expand source code
from .version import __version__
from . import imports as I
from .core import ArrayLearner, GenLearner, get_predictor, load_predictor, release_gpu_memory
from .vision.learner import ImageClassLearner
from .text.learner import BERTTextClassLearner, TransformerTextClassLearner
from .text.ner.learner import NERLearner
from .graph.learner import NodeClassLearner, LinkPredLearner
from .data import Dataset, TFDataset, SequenceDataset

from . import utils as U

__all__ = ['get_learner', 'get_predictor', 'load_predictor', 'release_gpu_memory',
           'Dataset', 'TFDataset', 'SequenceDataset']





# keys
# currently_unsupported: unsupported or disabled features (e.g., xai graph neural networks have not been implemented)
# dep_fix:  a fix to address a problem in a dependency

Sub-modules

ktrain.core
ktrain.data
ktrain.graph
ktrain.imports
ktrain.lroptimize
ktrain.models
ktrain.predictor
ktrain.preprocessor
ktrain.tabular
ktrain.text
ktrain.utils
ktrain.version
ktrain.vision

Functions

def get_learner(model, train_data=None, val_data=None, batch_size=32, eval_batch_size=32, workers=1, use_multiprocessing=False)
Returns a Learner instance that can be used to tune and train Keras models.  

model (Model):        A compiled instance of keras.engine.training.Model
train_data (tuple or generator): Either a:
                               1) tuple of (x_train, y_train), where x_train and
                                  y_train are numpy.ndarrays, or
                               2) Iterator
val_data (tuple or generator): Either a:
                               1) tuple of (x_test, y_test), where x_test and
                                  y_test are numpy.ndarrays, or
                               2) Iterator
                               Note: Should be the same type as train_data.
batch_size (int):       Batch size to use in training. default: 32
eval_batch_size (int):  Batch size used by learner.predict.
                        Only applies to validation data during training if
                        val_data is an instance of utils.Sequence.
                        default: 32
workers (int):          Number of CPU processes used to load data.
                        This is ignored unless train_data/val_data is an instance of
                        tf.keras.preprocessing.image.DirectoryIterator or tf.keras.preprocessing.image.DataFrameIterator.
use_multiprocessing (bool): Whether or not to use multiprocessing for workers.
                        This is ignored unless train_data/val_data is an instance of
                        tf.keras.preprocessing.image.DirectoryIterator or tf.keras.preprocessing.image.DataFrameIterator.
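
For reference, a minimal usage sketch; the model, data shapes, and learning rate below are illustrative placeholders, and fit_onecycle is one of several training methods exposed by the returned Learner:

```python
import numpy as np
import tensorflow as tf
import ktrain

# placeholder data and model for illustration
x_train = np.random.rand(1000, 20).astype('float32')
y_train = tf.keras.utils.to_categorical(np.random.randint(0, 2, 1000))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# wrap the compiled model and NumPy arrays in a Learner
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=32)
learner.fit_onecycle(1e-3, 1)  # one epoch with the 1cycle learning-rate policy
```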
Expand source code
def get_learner(model, train_data=None, val_data=None, 
                batch_size=U.DEFAULT_BS, eval_batch_size=U.DEFAULT_BS,
                workers=1, use_multiprocessing=False):
    """
    ```
    Returns a Learner instance that can be used to tune and train Keras models.  

    model (Model):        A compiled instance of keras.engine.training.Model
    train_data (tuple or generator): Either a:
                                   1) tuple of (x_train, y_train), where x_train and
                                      y_train are numpy.ndarrays, or
                                   2) Iterator
    val_data (tuple or generator): Either a:
                                   1) tuple of (x_test, y_test), where x_test and
                                      y_test are numpy.ndarrays, or
                                   2) Iterator
                                   Note: Should be the same type as train_data.
    batch_size (int):       Batch size to use in training. default: 32
    eval_batch_size (int):  Batch size used by learner.predict.
                            Only applies to validation data during training if
                            val_data is an instance of utils.Sequence.
                            default: 32
    workers (int):          Number of CPU processes used to load data.
                            This is ignored unless train_data/val_data is an instance of
                            tf.keras.preprocessing.image.DirectoryIterator or tf.keras.preprocessing.image.DataFrameIterator.
    use_multiprocessing (bool): Whether or not to use multiprocessing for workers.
                            This is ignored unless train_data/val_data is an instance of
                            tf.keras.preprocessing.image.DirectoryIterator or tf.keras.preprocessing.image.DataFrameIterator.
    ```
    """

    # check arguments
    if not isinstance(model, I.Model):
        raise ValueError('model must be an instance of Model')
    U.data_arg_check(train_data=train_data, val_data=val_data)
    if not isinstance(workers, int) or workers < 1:
        workers = 1
    # check for NumpyArrayIterator 
    if train_data and not U.ondisk(train_data):
        if workers > 1 and not use_multiprocessing:
            use_multiprocessing = True
            wrn_msg = 'Changed use_multiprocessing to True because NumpyArrayIterator with workers>1'
            wrn_msg +=' is slow when use_multiprocessing=False.'
            wrn_msg += ' If you experience issues with this, please set workers=1 and use_multiprocessing=False.'
            I.warnings.warn(wrn_msg)

    # verify BERT
    is_bert = U.bert_data_tuple(train_data)
    if is_bert:
        maxlen = U.shape_from_data(train_data)[1]
        msg = """For a GPU with 12GB of RAM, the following maxima apply:
        sequence len=64, max_batch_size=64
        sequence len=128, max_batch_size=32
        sequence len=256, max_batch_size=16
        sequence len=320, max_batch_size=14
        sequence len=384, max_batch_size=12
        sequence len=512, max_batch_size=6
        
        You've exceeded these limits.
        If using a GPU with <=12GB of memory, you may run out of memory during training.
        If necessary, adjust sequence length or batch size based on above."""
        # map sequence length to the largest safe batch size from the table above
        if maxlen <= 64:
            max_bs = 64
        elif maxlen <= 128:
            max_bs = 32
        elif maxlen <= 256:
            max_bs = 16
        elif maxlen <= 320:
            max_bs = 14
        elif maxlen <= 384:
            max_bs = 12
        else:
            max_bs = 6
        if batch_size > max_bs: I.warnings.warn(msg)


    # return the appropriate trainer
    if U.is_iter(train_data):
        if U.is_ner(model=model, data=train_data):
            learner = NERLearner
        elif U.is_imageclass_from_data(train_data):
            learner = ImageClassLearner
        elif U.is_nodeclass(data=train_data):
            learner = NodeClassLearner
        elif U.is_linkpred(data=train_data):  # link-prediction data
            learner = LinkPredLearner
        elif U.is_huggingface(data=train_data):
            learner = TransformerTextClassLearner
        else:
            learner = GenLearner
    else:
        if is_bert: 
            learner = BERTTextClassLearner
        else: # vanilla text classifiers use standard ArrayLearners
            learner = ArrayLearner
    return learner(model, train_data=train_data, val_data=val_data, 
                   batch_size=batch_size, eval_batch_size=eval_batch_size, 
                   workers=workers, use_multiprocessing=use_multiprocessing)
def get_predictor(model, preproc, batch_size=32)
Returns a Predictor instance that can be used to make predictions on
unlabeled examples.  Can be saved to disk and reloaded as part of a 
larger application.

Args
    model (Model):          A compiled instance of keras.engine.training.Model
    preproc (Preprocessor): An instance of TextPreprocessor, ImagePreprocessor,
                            or NERPreprocessor.
                            These instances are returned from the data-loading
                            functions in the ktrain vision and text modules:

                            ktrain.vision.images_from_folder
                            ktrain.vision.images_from_csv
                            ktrain.vision.images_from_array
                            ktrain.text.texts_from_folder
                            ktrain.text.texts_from_csv
                            ktrain.text.ner.entities_from_csv
    batch_size (int):       Batch size to use. default: 32
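
For reference, a minimal usage sketch; it assumes a trained learner and a preproc instance already exist (e.g., from ktrain.text.texts_from_folder and get_learner above), and the save path is a placeholder:

```python
import ktrain

# `learner` and `preproc` are assumed to exist from earlier training steps
predictor = ktrain.get_predictor(learner.model, preproc, batch_size=32)
print(predictor.predict(['This movie was great!']))  # predict on raw, unpreprocessed input
predictor.save('/tmp/my_predictor')                  # placeholder path; reload with load_predictor
```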
Expand source code
def get_predictor(model, preproc, batch_size=U.DEFAULT_BS):
    """
    ```
    Returns a Predictor instance that can be used to make predictions on
    unlabeled examples.  Can be saved to disk and reloaded as part of a 
    larger application.

    Args
        model (Model):          A compiled instance of keras.engine.training.Model
        preproc (Preprocessor): An instance of TextPreprocessor, ImagePreprocessor,
                                or NERPreprocessor.
                                These instances are returned from the data-loading
                                functions in the ktrain vision and text modules:

                                ktrain.vision.images_from_folder
                                ktrain.vision.images_from_csv
                                ktrain.vision.images_from_array
                                ktrain.text.texts_from_folder
                                ktrain.text.texts_from_csv
                                ktrain.text.ner.entities_from_csv
        batch_size (int):       Batch size to use. default: 32
    ```
    """

    # check arguments
    if not isinstance(model, Model):
        raise ValueError('model must be an instance of Model')
    if not isinstance(preproc, (ImagePreprocessor,TextPreprocessor, NERPreprocessor, NodePreprocessor, LinkPreprocessor, TabularPreprocessor)):
        raise ValueError('preproc must be instance of ktrain.preprocessor.Preprocessor')
    if isinstance(preproc, ImagePreprocessor):
        return ImagePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TextPreprocessor):
        return TextPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NERPreprocessor):
        return NERPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NodePreprocessor):
        return NodePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, LinkPreprocessor):
        return LinkPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TabularPreprocessor):
        return TabularPredictor(model, preproc, batch_size=batch_size)

    else:
        raise Exception('preproc of type %s not currently supported' % (type(preproc)))
def load_predictor(fpath, batch_size=32, custom_objects=None)
Loads a previously saved Predictor instance
Args
  fpath(str): predictor path name (value supplied to predictor.save)
              From v0.16.x, this is always the path to a folder.
              Pre-v0.16.x, this is the base name used to save model and .preproc instance.
  batch_size(int): batch size to use for predictions. default:32
  custom_objects(dict): custom objects required to load model.
                        This is useful if you compiled the model with a custom loss function, for example.
                        For models included with ktrain as is, this is populated automatically
                        and can be disregarded.  
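
For reference, a minimal usage sketch; the path is a placeholder matching the get_predictor example above:

```python
import ktrain

predictor = ktrain.load_predictor('/tmp/my_predictor')  # folder saved via predictor.save
print(predictor.predict(['Another example to classify.']))
```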
Expand source code
def load_predictor(fpath, batch_size=U.DEFAULT_BS, custom_objects=None):
    """
    ```
    Loads a previously saved Predictor instance
    Args
      fpath(str): predictor path name (value supplied to predictor.save)
                  From v0.16.x, this is always the path to a folder.
                  Pre-v0.16.x, this is the base name used to save model and .preproc instance.
      batch_size(int): batch size to use for predictions. default:32
      custom_objects(dict): custom objects required to load model.
                            This is useful if you compiled the model with a custom loss function, for example.
                            For models included with ktrain as is, this is populated automatically
                            and can be disregarded.  
    ```
    """

    # load the preprocessor
    preproc = None
    try:
        preproc_name = os.path.join(fpath, U.PREPROC_NAME)
        with open(preproc_name, 'rb') as f:
            preproc = pickle.load(f)
    except Exception:
        try:
            preproc_name = fpath + '.preproc'
            with open(preproc_name, 'rb') as f:
                preproc = pickle.load(f)
        except Exception:
            raise Exception('Failed to load the .preproc file from either the post-v0.16.x location (%s) or the pre-v0.16.x location (%s)' % (os.path.join(fpath, U.PREPROC_NAME), fpath + '.preproc'))

    # load the model
    model = _load_model(fpath, preproc=preproc, custom_objects=custom_objects)


    # preprocessing functions in ImageDataGenerators are not picklable,
    # so we must reconstruct them here
    if hasattr(preproc, 'datagen') and hasattr(preproc.datagen, 'ktrain_preproc'):
        preproc_name = preproc.datagen.ktrain_preproc
        if preproc_name == 'resnet50':
            preproc.datagen.preprocessing_function = pre_resnet50
        elif preproc_name == 'mobilenet':
            preproc.datagen.preprocessing_function = pre_mobilenet
        elif preproc_name == 'inception':
            preproc.datagen.preprocessing_function = pre_inception
        else:
            raise Exception('Unknown preprocessing_function name: %s' % (preproc_name))
    
    # return the appropriate predictor
    if not isinstance(model, Model):
        raise ValueError('model must be an instance of Model')
    if not isinstance(preproc, (ImagePreprocessor, TextPreprocessor, NERPreprocessor, NodePreprocessor, LinkPreprocessor, TabularPreprocessor)):
        raise ValueError('preproc must be instance of ktrain.preprocessor.Preprocessor')
    if isinstance(preproc, ImagePreprocessor):
        return ImagePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TextPreprocessor):
        return TextPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NERPreprocessor):
        return NERPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NodePreprocessor):
        return NodePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, LinkPreprocessor):
        return LinkPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TabularPreprocessor):
        return TabularPredictor(model, preproc, batch_size=batch_size)
    else:
        raise Exception('preprocessor not currently supported')
def release_gpu_memory(device=0)
Release GPU memory allocated by TensorFlow
Source: 
https://stackoverflow.com/questions/51005147/keras-release-memory-after-finish-training-process
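
A minimal usage sketch; note that this requires the numba package and closes the in-process CUDA context, so models should be rebuilt afterwards rather than reused:

```python
import ktrain

ktrain.release_gpu_memory(device=0)  # clears the Keras session and closes CUDA device 0
```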
Expand source code
def release_gpu_memory(device=0):
    """
    ```
    Release GPU memory allocated by TensorFlow
    Source: 
    https://stackoverflow.com/questions/51005147/keras-release-memory-after-finish-training-process
    ```
    """
    from numba import cuda
    K.clear_session()
    cuda.select_device(device)
    cuda.close()
    return

Classes

class Dataset
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training, so
fit() will train using a tf.data.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, train=True)

See ktrain.text.preprocess.TransformerDataset as an example.
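
A minimal sketch of a custom Dataset with the optional to_tfdataset hook; the arrays and batch size are illustrative placeholders (per the note above, Sequence-style __getitem__/__len__ would also be needed in practice; see SequenceDataset below):

```python
import numpy as np
import tensorflow as tf
from ktrain.data import Dataset

class MyDataset(Dataset):
    """Illustrative in-memory dataset; x and y are NumPy arrays."""
    def __init__(self, x, y, batch_size=32):
        self.x, self.y, self.batch_size = x, y, batch_size

    def nsamples(self):           # required by Learner instances
        return len(self.x)

    def get_y(self):              # required by Learner instances
        return self.y

    def to_tfdataset(self, train=True):
        # invoked by Learner instances just prior to training
        ds = tf.data.Dataset.from_tensor_slices((self.x, self.y))
        if train:
            ds = ds.shuffle(len(self.x))
        return ds.batch(self.batch_size)
```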
Expand source code
class Dataset:
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        raise NotImplementedError

    # required: used by ktrain.core.Learner instances
    def get_y(self):
        raise NotImplementedError

    # optional: to modify dataset between epochs (e.g., shuffle)
    def on_epoch_end(self):
        pass

    # optional
    def ondisk(self):
        """
        ```
        Is data being read from disk (e.g., with DirectoryIterators)?
        ```
        """
        return False

    # optional: used only if invoking *_classifier functions
    def xshape(self):
        """
        ```
        shape of X
        Examples:
            for images: input_shape
            for text: (n_example, sequence_length)
        ```
        """
        raise NotImplementedError
    
    # optional: used only if invoking *_classifier functions
    def nclasses(self):
        """
        ```
        Number of classes
        For classification problems: this is the number of labels
        Not used for regression problems
        ```
        """
        raise NotImplementedError

Subclasses

  • SequenceDataset
  • TFDataset

Methods

def get_y(self)
Expand source code
def get_y(self):
    raise NotImplementedError
def nclasses(self)
Number of classes
For classification problems: this is the number of labels
Not used for regression problems
Expand source code
def nclasses(self):
    """
    ```
    Number of classes
    For classification problems: this is the number of labels
    Not used for regression problems
    ```
    """
    raise NotImplementedError
def nsamples(self)
Expand source code
def nsamples(self):
    raise NotImplementedError
def on_epoch_end(self)
Expand source code
def on_epoch_end(self):
    pass
def ondisk(self)
Is data being read from disk (e.g., with DirectoryIterators)?
Expand source code
def ondisk(self):
    """
    ```
    Is data being read from disk (e.g., with DirectoryIterators)?
    ```
    """
    return False
def xshape(self)
shape of X
Examples:
    for images: input_shape
    for text: (n_example, sequence_length)
Expand source code
def xshape(self):
    """
    ```
    shape of X
    Examples:
        for images: input_shape
        for text: (n_example, sequence_length)
    ```
    """
    raise NotImplementedError
class SequenceDataset (batch_size=32)
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training, so
fit() will train using a tf.data.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, train=True)

See ktrain.text.preprocess.TransformerDataset as an example.
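
A minimal sketch of a SequenceDataset subclass serving batches from in-memory arrays; the data is an illustrative placeholder:

```python
import math
import numpy as np
from ktrain.data import SequenceDataset

class ArraySequence(SequenceDataset):
    """Illustrative batch-serving dataset over NumPy arrays x and y."""
    def __init__(self, x, y, batch_size=32):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y

    def __len__(self):            # number of batches per epoch
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):   # return one batch of (x, y)
        b = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[b], self.y[b]

    def nsamples(self):
        return len(self.x)

    def get_y(self):
        return self.y
```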
Expand source code
class SequenceDataset(Dataset, Sequence):
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """
    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    # required by keras.utils.Sequence instances
    def __len__(self):
        raise NotImplementedError

    # required by keras.utils.Sequence instances
    def __getitem__(self, idx):
        raise NotImplementedError

Ancestors

  • Dataset
  • tensorflow.python.keras.utils.data_utils.Sequence

class TFDataset (tfdataset, n, y)
Wrapper for tf.data.Datasets
Args:
  tfdataset (tf.data.Dataset): a tf.data.Dataset instance
  n (int): number of examples in the dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
  y (np.ndarray): y values for each example - should be in the format expected by your model (e.g., 1-hot-encoded)
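
For reference, a minimal sketch; the arrays are illustrative placeholders, and the tf.data.Dataset must already be batched because TFDataset reads the batch size from the first batch it yields:

```python
import numpy as np
import tensorflow as tf
import ktrain

x = np.random.rand(1000, 20).astype('float32')
y = tf.keras.utils.to_categorical(np.random.randint(0, 2, 1000))

# a fully configured (shuffled, batched) tf.data.Dataset
tfds = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(1000).batch(32)
train_data = ktrain.TFDataset(tfds, n=len(x), y=y)  # pass to ktrain.get_learner
```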
Expand source code
class TFDataset(Dataset):
    """
    ```
    Wrapper for tf.data.Datasets
    ```
    """
    def __init__(self, tfdataset, n, y):
        """
        ```
        Args:
          tfdataset (tf.data.Dataset): a tf.data.Dataset instance
          n (int): number of examples in the dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
          y (np.ndarray): y values for each example - should be in the format expected by your model (e.g., 1-hot-encoded)
        ```
        """
        if not isinstance(tfdataset, tf.data.Dataset):
            raise ValueError('tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately')
        self.tfdataset = tfdataset
        self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0] # extract batch_size from tfdataset
        self.n = n
        self.y = y

    @property
    def batch_size(self):
        return self.bs

    @batch_size.setter
    def batch_size(self, value):
        if value != self.bs:
            warnings.warn('batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used')


    def nsamples(self):
        return self.n

    def get_y(self):
        return self.y

    def to_tfdataset(self, train=True):
        return self.tfdataset

Ancestors

  • Dataset

Instance variables

var batch_size
Expand source code
@property
def batch_size(self):
    return self.bs

Methods

def get_y(self)
Expand source code
def get_y(self):
    return self.y
def nsamples(self)
Expand source code
def nsamples(self):
    return self.n
def to_tfdataset(self, train=True)
Expand source code
def to_tfdataset(self, train=True):
    return self.tfdataset
