Package ktrain
Expand source code
from .version import __version__
from . import imports as I
from .core import ArrayLearner, GenLearner, get_predictor, load_predictor, release_gpu_memory
from .vision.learner import ImageClassLearner
from .text.learner import BERTTextClassLearner, TransformerTextClassLearner
from .text.ner.learner import NERLearner
from .graph.learner import NodeClassLearner, LinkPredLearner
from .data import Dataset, TFDataset, SequenceDataset
from . import utils as U
__all__ = ['get_learner', 'get_predictor', 'load_predictor', 'release_gpu_memory',
'Dataset', 'TFDataset', 'SequenceDataset']
def get_learner(model, train_data=None, val_data=None,
                batch_size=U.DEFAULT_BS, eval_batch_size=U.DEFAULT_BS,
                workers=1, use_multiprocessing=False):
    """
    ```
    Returns a Learner instance that can be used to tune and train Keras models.

    model (Model): A compiled instance of keras.engine.training.Model
    train_data (tuple or generator): Either a:
        1) tuple of (x_train, y_train), where x_train and
           y_train are numpy.ndarrays or
        2) Iterator
    val_data (tuple or generator): Either a:
        1) tuple of (x_test, y_test), where x_test and
           y_test are numpy.ndarrays or
        2) Iterator
        Note: Should be same type as train_data.
    batch_size (int): Batch size to use in training. default:32
    eval_batch_size (int): batch size used by learner.predict;
        only applies to validation data during training if
        val_data is an instance of utils.Sequence.
        default:32
    workers (int): number of CPU processes used to load data.
        This is ignored unless train_data/val_data is an instance of
        tf.keras.preprocessing.image.DirectoryIterator or
        tf.keras.preprocessing.image.DataFrameIterator.
    use_multiprocessing (bool): whether or not to use multiprocessing for workers.
        This is ignored unless train_data/val_data is an instance of
        tf.keras.preprocessing.image.DirectoryIterator or
        tf.keras.preprocessing.image.DataFrameIterator.
    ```
    """
    # check arguments
    if not isinstance(model, I.Model):
        raise ValueError('model must be an instance of Model')
    U.data_arg_check(train_data=train_data, val_data=val_data)
    if not isinstance(workers, int) or workers < 1:
        workers = 1
    # check for NumpyArrayIterator
    if train_data and not U.ondisk(train_data):
        if workers > 1 and not use_multiprocessing:
            use_multiprocessing = True
            wrn_msg = 'Changed use_multiprocessing to True because NumpyArrayIterator with workers>1'
            wrn_msg += ' is slow when use_multiprocessing=False.'
            wrn_msg += ' If you experience issues with this, please set workers=1 and use_multiprocessing=False.'
            I.warnings.warn(wrn_msg)
    # verify BERT
    is_bert = U.bert_data_tuple(train_data)
    if is_bert:
        maxlen = U.shape_from_data(train_data)[1]
        msg = """For a GPU with 12GB of RAM, the following maxima apply:
        sequence len=64, max_batch_size=64
        sequence len=128, max_batch_size=32
        sequence len=256, max_batch_size=16
        sequence len=320, max_batch_size=14
        sequence len=384, max_batch_size=12
        sequence len=512, max_batch_size=6
        You've exceeded these limits.
        If using a GPU with <=12GB of memory, you may run out of memory during training.
        If necessary, adjust sequence length or batch size based on above."""
        wrn = False
        if maxlen > 64 and batch_size > 64:
            wrn = True
        elif maxlen > 128 and batch_size > 32:
            wrn = True
        elif maxlen > 256 and batch_size > 16:
            wrn = True
        elif maxlen > 320 and batch_size > 14:
            wrn = True
        elif maxlen > 384 and batch_size > 12:
            wrn = True
        elif maxlen > 512 and batch_size > 6:
            wrn = True
        if wrn:
            I.warnings.warn(msg)
    # return the appropriate trainer
    if U.is_iter(train_data):
        if U.is_ner(model=model, data=train_data):
            learner = NERLearner
        elif U.is_imageclass_from_data(train_data):
            learner = ImageClassLearner
        elif U.is_nodeclass(data=train_data):
            learner = NodeClassLearner
        elif U.is_linkpred(data=train_data):  # original repeated U.is_nodeclass here, making this branch unreachable; is_linkpred is the presumed intent
            learner = LinkPredLearner
        elif U.is_huggingface(data=train_data):
            learner = TransformerTextClassLearner
        else:
            learner = GenLearner
    else:
        if is_bert:
            learner = BERTTextClassLearner
        else:  # vanilla text classifiers use standard ArrayLearners
            learner = ArrayLearner
    return learner(model, train_data=train_data, val_data=val_data,
                   batch_size=batch_size, eval_batch_size=eval_batch_size,
                   workers=workers, use_multiprocessing=use_multiprocessing)
# keys
# currently_unsupported: unsupported or disabled features (e.g., xai graph neural networks have not been implemented)
# dep_fix: a fix to address a problem in a dependency
Sub-modules
ktrain.core
ktrain.data
ktrain.graph
ktrain.imports
ktrain.lroptimize
ktrain.models
ktrain.predictor
ktrain.preprocessor
ktrain.tabular
ktrain.text
ktrain.utils
ktrain.version
ktrain.vision
Functions
def get_learner(model, train_data=None, val_data=None, batch_size=32, eval_batch_size=32, workers=1, use_multiprocessing=False)
-
Returns a Learner instance that can be used to tune and train Keras models.

model (Model): A compiled instance of keras.engine.training.Model
train_data (tuple or generator): Either a 1) tuple of (x_train, y_train), where x_train and y_train are numpy.ndarrays, or 2) Iterator
val_data (tuple or generator): Either a 1) tuple of (x_test, y_test), where x_test and y_test are numpy.ndarrays, or 2) Iterator. Note: Should be same type as train_data.
batch_size (int): Batch size to use in training. default:32
eval_batch_size (int): batch size used by learner.predict; only applies to validation data during training if val_data is an instance of utils.Sequence. default:32
workers (int): number of CPU processes used to load data. Ignored unless train_data/val_data is an instance of tf.keras.preprocessing.image.DirectoryIterator or tf.keras.preprocessing.image.DataFrameIterator.
use_multiprocessing (bool): whether or not to use multiprocessing for workers. Ignored unless train_data/val_data is an instance of tf.keras.preprocessing.image.DirectoryIterator or tf.keras.preprocessing.image.DataFrameIterator.
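For example, a minimal usage sketch with toy data (the two-layer model and random arrays below are illustrative only, not part of ktrain):

```
import numpy as np
import tensorflow as tf
import ktrain

# toy data and model purely for illustration
x_train = np.random.rand(1000, 20).astype('float32')
y_train = tf.keras.utils.to_categorical(np.random.randint(0, 2, 1000))
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# tuple data yields an ArrayLearner under the hood
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=32)
learner.fit_onecycle(1e-3, 1)  # train for one epoch with the 1cycle policy
```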
def get_predictor(model, preproc, batch_size=32)
-
Returns a Predictor instance that can be used to make predictions on unlabeled examples. Can be saved to disk and reloaded as part of a larger application.

Args:
    model (Model): A compiled instance of keras.engine.training.Model
    preproc (Preprocessor): An instance of TextPreprocessor, ImagePreprocessor, or NERPreprocessor.
        These instances are returned from the data loading functions in the ktrain vision and text modules:
        ktrain.vision.images_from_folder
        ktrain.vision.images_from_csv
        ktrain.vision.images_from_array
        ktrain.text.texts_from_folder
        ktrain.text.texts_from_csv
        ktrain.text.ner.entities_from_csv
    batch_size (int): batch size to use. default:32
Expand source code
def get_predictor(model, preproc, batch_size=U.DEFAULT_BS):
    """
    ```
    Returns a Predictor instance that can be used to make predictions on
    unlabeled examples. Can be saved to disk and reloaded as part of a
    larger application.

    Args:
        model (Model): A compiled instance of keras.engine.training.Model
        preproc (Preprocessor): An instance of TextPreprocessor, ImagePreprocessor,
                                or NERPreprocessor. These instances are returned from
                                the data loading functions in the ktrain vision and
                                text modules:
                                ktrain.vision.images_from_folder
                                ktrain.vision.images_from_csv
                                ktrain.vision.images_from_array
                                ktrain.text.texts_from_folder
                                ktrain.text.texts_from_csv
                                ktrain.text.ner.entities_from_csv
        batch_size (int): batch size to use. default:32
    ```
    """
    # check arguments
    if not isinstance(model, Model):
        raise ValueError('model must be an instance of Model')
    if not isinstance(preproc, (ImagePreprocessor, TextPreprocessor, NERPreprocessor,
                                NodePreprocessor, LinkPreprocessor, TabularPreprocessor)):
        raise ValueError('preproc must be instance of ktrain.preprocessor.Preprocessor')

    # dispatch on preprocessor type
    if isinstance(preproc, ImagePreprocessor):
        return ImagePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TextPreprocessor):
        return TextPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NERPreprocessor):
        return NERPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NodePreprocessor):
        return NodePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, LinkPreprocessor):
        return LinkPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TabularPreprocessor):
        return TabularPredictor(model, preproc, batch_size=batch_size)
    else:
        raise Exception('preproc of type %s not currently supported' % (type(preproc)))
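A brief usage sketch (assumes `learner` and `preproc` were produced by an earlier ktrain workflow such as texts_from_folder; the save path is hypothetical):

```
# (trn, val, preproc) = ktrain.text.texts_from_folder('data/aclImdb', maxlen=500)
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.predict(['This movie was great!'])
predictor.save('/tmp/my_predictor')  # persist for later reloading
```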
def load_predictor(fpath, batch_size=32, custom_objects=None)
-
Loads a previously saved Predictor instance.

Args:
    fpath (str): predictor path name (value supplied to predictor.save).
        From v0.16.x, this is always the path to a folder.
        Pre-v0.16.x, this is the base name used to save the model and .preproc instance.
    batch_size (int): batch size to use for predictions. default:32
    custom_objects (dict): custom objects required to load model.
        This is useful if you compiled the model with a custom loss function, for example.
        For models included with ktrain as-is, this is populated automatically and can be disregarded.
Expand source code
def load_predictor(fpath, batch_size=U.DEFAULT_BS, custom_objects=None):
    """
    ```
    Loads a previously saved Predictor instance

    Args:
        fpath (str): predictor path name (value supplied to predictor.save)
                     From v0.16.x, this is always the path to a folder.
                     Pre-v0.16.x, this is the base name used to save model and .preproc instance.
        batch_size (int): batch size to use for predictions. default:32
        custom_objects (dict): custom objects required to load model.
                               This is useful if you compiled the model with a custom loss
                               function, for example. For models included with ktrain as-is,
                               this is populated automatically and can be disregarded.
    ```
    """
    # load the preprocessor
    preproc = None
    try:
        preproc_name = os.path.join(fpath, U.PREPROC_NAME)
        with open(preproc_name, 'rb') as f:
            preproc = pickle.load(f)
    except:
        try:
            preproc_name = fpath + '.preproc'
            with open(preproc_name, 'rb') as f:
                preproc = pickle.load(f)
        except:
            raise Exception('Failed to load .preproc file in either the post v0.16.x location (%s) '
                            'or pre v0.16.x location (%s)' % (os.path.join(fpath, U.PREPROC_NAME), fpath + '.preproc'))

    # load the model
    model = _load_model(fpath, preproc=preproc, custom_objects=custom_objects)

    # preprocessing functions in ImageDataGenerators are not picklable,
    # so we must reconstruct them
    if hasattr(preproc, 'datagen') and hasattr(preproc.datagen, 'ktrain_preproc'):
        preproc_name = preproc.datagen.ktrain_preproc
        if preproc_name == 'resnet50':
            preproc.datagen.preprocessing_function = pre_resnet50
        elif preproc_name == 'mobilenet':
            preproc.datagen.preprocessing_function = pre_mobilenet
        elif preproc_name == 'inception':
            preproc.datagen.preprocessing_function = pre_inception
        else:
            raise Exception('Unknown preprocessing_function name: %s' % (preproc_name))

    # return the appropriate predictor
    if not isinstance(model, Model):
        raise ValueError('model must be an instance of Model')
    if not isinstance(preproc, (ImagePreprocessor, TextPreprocessor, NERPreprocessor,
                                NodePreprocessor, LinkPreprocessor, TabularPreprocessor)):
        raise ValueError('preproc must be instance of ktrain.preprocessor.Preprocessor')
    if isinstance(preproc, ImagePreprocessor):
        return ImagePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TextPreprocessor):
        return TextPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NERPreprocessor):
        return NERPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, NodePreprocessor):
        return NodePredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, LinkPreprocessor):
        return LinkPredictor(model, preproc, batch_size=batch_size)
    elif isinstance(preproc, TabularPreprocessor):
        return TabularPredictor(model, preproc, batch_size=batch_size)
    else:
        raise Exception('preprocessor not currently supported')
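A brief reload sketch (the folder path is hypothetical and should match what was passed to predictor.save):

```
import ktrain

predictor = ktrain.load_predictor('/tmp/my_predictor')
predictor.predict(['An example document to classify.'])
```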
def release_gpu_memory(device=0)
-
Release GPU memory allocated by TensorFlow.
Source: https://stackoverflow.com/questions/51005147/keras-release-memory-after-finish-training-process
Expand source code
def release_gpu_memory(device=0):
    """
    ```
    Release GPU memory allocated by TensorFlow
    Source: https://stackoverflow.com/questions/51005147/keras-release-memory-after-finish-training-process
    ```
    """
    from numba import cuda
    K.clear_session()
    cuda.select_device(device)
    cuda.close()
    return
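Because this closes the CUDA context via numba, it is typically called between experiments in a long-running process or notebook; a minimal sketch (assumes numba is installed):

```
import ktrain

# ... train a model, save whatever you need to keep ...
ktrain.release_gpu_memory()  # frees GPU memory held by TensorFlow
# a fresh model/session can now be created without stale allocations
```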
Classes
class Dataset
-
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a method to_tfdataset that converts the data to a tf.Dataset, then it will be invoked by Learner instances just prior to training, so fit() will train using a tf.Dataset representation of your data. Sequence methods such as __getitem__ and __len__ must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, train=True)

See ktrain.text.preprocess.TransformerDataset as an example.
Expand source code
class Dataset:
    """
    ```
    Base class for custom datasets in ktrain.

    If subclass of Dataset implements a method to_tfdataset
    that converts the data to a tf.Dataset, then this will be
    invoked by Learner instances just prior to training so
    fit() will train using a tf.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__ must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        raise NotImplementedError

    # required: used by ktrain.core.Learner instances
    def get_y(self):
        raise NotImplementedError

    # optional: to modify dataset between epochs (e.g., shuffle)
    def on_epoch_end(self):
        pass

    # optional
    def ondisk(self):
        """
        ```
        Is data being read from disk like with DirectoryIterators?
        ```
        """
        return False

    # optional: used only if invoking *_classifier functions
    def xshape(self):
        """
        ```
        shape of X
        Examples:
            for images: input_shape
            for text: (n_example, sequence_length)
        ```
        """
        raise NotImplementedError

    # optional: used only if invoking *_classifier functions
    def nclasses(self):
        """
        ```
        Number of classes
        For classification problems: this is the number of labels
        Not used for regression problems
        ```
        """
        raise NotImplementedError
Subclasses
- SequenceDataset
- TFDataset
Methods
def get_y(self)
-
Expand source code
def get_y(self):
    raise NotImplementedError
def nclasses(self)
-
Number of classes
For classification problems: this is the number of labels
Not used for regression problems
Expand source code
def nclasses(self):
    """
    ```
    Number of classes
    For classification problems: this is the number of labels
    Not used for regression problems
    ```
    """
    raise NotImplementedError
def nsamples(self)
-
Expand source code
def nsamples(self):
    raise NotImplementedError
def on_epoch_end(self)
-
Expand source code
def on_epoch_end(self):
    pass
def ondisk(self)
-
Is data being read from disk like with DirectoryIterators?
Expand source code
def ondisk(self):
    """
    ```
    Is data being read from disk like with DirectoryIterators?
    ```
    """
    return False
def xshape(self)
-
shape of X
Examples: for images, input_shape; for text, (n_example, sequence_length)
Expand source code
def xshape(self):
    """
    ```
    shape of X
    Examples:
        for images: input_shape
        for text: (n_example, sequence_length)
    ```
    """
    raise NotImplementedError
class SequenceDataset (batch_size=32)
-
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a method to_tfdataset that converts the data to a tf.Dataset, then it will be invoked by Learner instances just prior to training, so fit() will train using a tf.Dataset representation of your data. Sequence methods such as __getitem__ and __len__ must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, training=True)

See ktrain.text.preprocess.TransformerDataset as an example.
Expand source code
class SequenceDataset(Dataset, Sequence):
    """
    ```
    Base class for custom datasets in ktrain.

    If subclass of Dataset implements a method to_tfdataset
    that converts the data to a tf.Dataset, then this will be
    invoked by Learner instances just prior to training so
    fit() will train using a tf.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__ must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, training=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """
    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    # required by keras.utils.Sequence instances
    def __len__(self):
        raise NotImplementedError

    # required by keras.utils.Sequence instances
    def __getitem__(self, idx):
        raise NotImplementedError
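As an illustration, a minimal hedged sketch of a custom subclass that serves in-memory numpy arrays in batches (the class name and data are hypothetical; ktrain's own MultiArrayDataset plays a similar role):

```
import math
import numpy as np
import ktrain

class MyArrayDataset(ktrain.SequenceDataset):
    """Hypothetical example: serves (x, y) numpy arrays in batches."""
    def __init__(self, x, y, batch_size=32):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y

    def __len__(self):
        # number of batches per epoch, as required by keras.utils.Sequence
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        b = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[b], self.y[b]

    def nsamples(self):
        return len(self.x)

    def get_y(self):
        return self.y
```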
Ancestors
- Dataset
- tensorflow.python.keras.utils.data_utils.Sequence
Subclasses
- MultiArrayDataset
- LinkSequenceWrapper
- NodeSequenceWrapper
- TabularDataset
- NERSequence
- TransformerDataset
class TFDataset (tfdataset, n, y)
-
Wrapper for tf.data.Datasets
Args:
    tfdataset (tf.data.Dataset): a tf.Dataset instance
    n (int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
    y (np.ndarray): y values for each example - should be in the format expected by your model (e.g., 1-hot-encoded)
Expand source code
class TFDataset(Dataset):
    """
    ```
    Wrapper for tf.data.Datasets
    ```
    """
    def __init__(self, tfdataset, n, y):
        """
        ```
        Args:
            tfdataset (tf.data.Dataset): a tf.Dataset instance
            n (int): number of examples in dataset (cardinality, which can't
                     reliably be extracted from tf.data.Datasets)
            y (np.ndarray): y values for each example - should be in the format
                            expected by your model (e.g., 1-hot-encoded)
        ```
        """
        if not isinstance(tfdataset, tf.data.Dataset):
            raise ValueError('tfdataset must be a fully-configured tf.data.Dataset '
                             'with batch_size, etc. set appropriately')
        self.tfdataset = tfdataset
        # extract batch_size from the first batch of the tf.data.Dataset
        self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0]
        self.n = n
        self.y = y

    @property
    def batch_size(self):
        return self.bs

    @batch_size.setter
    def batch_size(self, value):
        if value != self.bs:
            warnings.warn('batch_size parameter is ignored, as pre-configured '
                          'batch_size of tf.data.Dataset is used')

    def nsamples(self):
        return self.n

    def get_y(self):
        return self.y

    def to_tfdataset(self, train=True):
        return self.tfdataset
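A short hedged sketch of wrapping a batched tf.data.Dataset (the arrays below are toy data for illustration):

```
import numpy as np
import tensorflow as tf
import ktrain

x = np.random.rand(100, 20).astype('float32')
y = tf.keras.utils.to_categorical(np.random.randint(0, 2, 100))

# the tf.data.Dataset must already be batched before wrapping
ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(32)
trn = ktrain.TFDataset(ds, n=len(x), y=y)  # n and y supplied explicitly
# trn can now be passed as train_data to ktrain.get_learner
```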
Ancestors
- Dataset
Instance variables
var batch_size
-
Expand source code
@property
def batch_size(self):
    return self.bs
Methods
def get_y(self)
-
Expand source code
def get_y(self):
    return self.y
def nsamples(self)
-
Expand source code
def nsamples(self):
    return self.n
def to_tfdataset(self, train=True)
-
Expand source code
def to_tfdataset(self, train=True):
    return self.tfdataset