import math
import numpy as np
import torch
from torchvision import datasets
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
def add_label_noise(y_trn, num_cls, noise_ratio=0.8):
    """
    Overwrites a random fraction of the supplied labels with class ids
    drawn uniformly at random. This functionality is taken from CORDS
    and applied here.

    Parameters
    ----------
    y_trn : numpy ndarray or list
        The labels to add noise to. An ndarray is modified in place; any
        other sequence is first converted to a new ndarray.
    num_cls : int
        The number of classes. Replacement labels are drawn from
        ``0 .. num_cls - 1``.
    noise_ratio : float, optional
        The fraction (between 0 and 1) of labels to overwrite.
        The default is 0.8.

    Returns
    -------
    y_trn : numpy ndarray
        The now-noisy labels
    """
    # np.asarray hands back an ndarray input unchanged (so the in-place update
    # is kept) and converts lists, which cannot be assigned through an index array.
    y_trn = np.asarray(y_trn)
    noise_size = int(len(y_trn) * noise_ratio)

    # Distinct positions whose labels get overwritten
    noise_indices = np.random.choice(np.arange(len(y_trn)), size=noise_size, replace=False)

    # Replacement labels are sampled over all classes, so a "noisy" label
    # may happen to coincide with the label it replaces.
    y_trn[noise_indices] = np.random.choice(np.arange(num_cls), size=noise_size, replace=True)
    return y_trn
def get_imbalanced_idx(y_trn, num_cls, class_ratio=0.6):
    """
    Returns a list of indices of the supplied dataset that
    constitute a class-imbalanced subset of the supplied
    dataset. This functionality is taken from CORDS and
    applied here.

    Parameters
    ----------
    y_trn : numpy ndarray
        The label set to choose imbalance.
    num_cls : int
        The number of classes possible in the list. Labels are
        matched against ``0 .. num_cls - 1``.
    class_ratio : float, optional
        The fraction of classes to affect; ``int(class_ratio * num_cls)``
        classes are shrunk. The default is 0.6.

    Returns
    -------
    subset_idxs : list
        The list of indices of the supplied dataset that
        constitute a class-imbalanced subset, grouped by class
        in increasing class order
    """
    labels = np.asarray(y_trn)

    # Locate the samples of every class once, instead of rebuilding the
    # label mask for each use.
    idxs_per_class = [np.where(labels == cls)[0] for cls in range(num_cls)]

    # Every affected class is cut down to 10% of the size of the smallest class
    min_samples = int(min(len(cls_idxs) for cls_idxs in idxs_per_class) * 0.1)

    # Generate affected classes based on the specified class ratio
    selected_classes = set(
        np.random.choice(np.arange(num_cls), size=int(class_ratio * num_cls), replace=False).tolist())

    # For each class, either add the full class to the subset (if not selected)
    # or add only min_samples randomly drawn samples from that class
    subset_idxs = []
    for cls, cls_idxs in enumerate(idxs_per_class):
        if cls in selected_classes:
            cls_idxs = np.random.choice(cls_idxs, size=min_samples, replace=False)
        subset_idxs.extend(cls_idxs)
    return subset_idxs
def make_data_redundant(X, Y, amtRed=2):
    """
    Modifies the input dataset in such a way that, within every class,
    only about ``1 / amtRed`` of the points are original and the rest
    are repeated (redundant) copies of those points.

    Parameters
    ----------
    X : numpy ndarray
        The feature set to be made redundant. It is modified in place.
    Y : numpy ndarray
        The label set corresponding to the X.
    amtRed : float, optional
        Factor that determines redundancy; must be positive.
        The default is 2.

    Returns
    -------
    X : numpy ndarray
        Modified feature set (the same object that was passed in).

    Raises
    ------
    ValueError
        If amtRed is not positive.
    """
    if amtRed <= 0:
        raise ValueError("amtRed must be positive, got {!r}".format(amtRed))

    # A plain list compared with a scalar yields a single bool rather than a
    # mask, so make sure the labels are an array before matching classes.
    Y = np.asarray(Y)
    repeats = math.ceil(amtRed)

    classes, no_elements = np.unique(Y, return_counts=True)
    for cls, count in zip(classes, no_elements):
        # Number of original points kept for this class
        retain = math.ceil(count / amtRed)
        idxs = np.where(Y == cls)[0]

        # Cycle through the retained points until every slot of the class is
        # filled; retain * repeats is always at least count.
        idxs_rep = np.tile(idxs[:retain], repeats)[:count]
        X[idxs] = X[idxs_rep]
    return X
def get_dataset(name, path, tr_load_args=None, te_load_args=None):
    """
    Loads the dataset with the given name by dispatching to its loader.

    Parameters
    ----------
    name: str
        Name of the dataset to be loaded. Supports MNIST, KMNIST,
        FASHION_MNIST, CIFAR10, CIFAR100, SVHN and STL10
    path: str
        Path to save the downloaded dataset
    tr_load_args: dict
        String dictionary for train distribution shift loading
    te_load_args: dict
        String dictionary for test distribution shift loading

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels

    Raises
    ----------
    ValueError
        If name is not one of the supported dataset names
    """
    if name == 'MNIST':
        return get_MNIST(path, tr_load_args, te_load_args)
    if name == 'KMNIST':
        return get_KMNIST(path, tr_load_args, te_load_args)
    if name == 'FASHION_MNIST':
        return get_FASHION_MNIST(path, tr_load_args, te_load_args)
    if name == 'CIFAR10':
        return get_CIFAR10(path, tr_load_args, te_load_args)
    if name == 'CIFAR100':
        return get_CIFAR100(path, tr_load_args, te_load_args)
    if name == 'SVHN':
        return get_SVHN(path, tr_load_args, te_load_args)
    if name == 'STL10':
        return get_STL10(path, tr_load_args, te_load_args)

    # Fail loudly here rather than returning None, which would only surface
    # later as an unpacking error at the call site.
    raise ValueError(
        "Unsupported dataset name {!r}; expected one of MNIST, KMNIST, FASHION_MNIST, "
        "CIFAR10, CIFAR100, SVHN, STL10".format(name))
def get_SVHN(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the SVHN dataset and returns shuffled train and test splits,
    optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under ``path + '/SVHN'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Fixed seed on NumPy's global RNG so the shift and shuffle are repeatable
    np.random.seed(42)
    num_cls = 10

    # Build both splits with downloading enabled
    root = path + '/SVHN'
    train_set = datasets.SVHN(root, split="train", download=True)
    test_set = datasets.SVHN(root, split="test", download=True)

    # Raw features, and labels copied into fresh arrays for subset selection
    X_tr, Y_tr = train_set.data, np.array(train_set.labels)
    X_te, Y_te = test_set.data, np.array(test_set.labels)

    # Start from the full index lists; a class imbalance narrows them down
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # Apply at most one shift per split: class imbalance wins over label noise
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Keep only the selected samples
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Draw the train order first, then the test order, and shuffle both splits
    n_tr, n_te = len(Y_tr), len(Y_te)
    tr_order = np.random.choice(np.arange(n_tr), size=n_tr, replace=False)
    te_order = np.random.choice(np.arange(n_te), size=n_te, replace=False)
    X_tr, Y_tr = X_tr[tr_order], Y_tr[tr_order]
    X_te, Y_te = X_te[te_order], Y_te[te_order]

    # Labels are handed back as tensors, features stay as arrays
    return X_tr, torch.from_numpy(Y_tr), X_te, torch.from_numpy(Y_te)
def get_MNIST(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the MNIST dataset and returns shuffled train and test splits,
    optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under ``path + '/MNIST'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Fixed seed on NumPy's global RNG so the shift and shuffle are repeatable
    np.random.seed(42)
    num_cls = 10

    # Build both splits with downloading enabled
    root = path + '/MNIST'
    train_set = datasets.MNIST(root, train=True, download=True)
    test_set = datasets.MNIST(root, train=False, download=True)

    # Raw features as arrays, and labels copied into fresh arrays
    X_tr, Y_tr = train_set.data.numpy(), np.array(train_set.targets.numpy())
    X_te, Y_te = test_set.data.numpy(), np.array(test_set.targets.numpy())

    # Start from the full index lists; a class imbalance narrows them down
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # Apply at most one shift per split: class imbalance wins over label noise
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Keep only the selected samples
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Draw the train order first, then the test order, and shuffle both splits
    n_tr, n_te = len(Y_tr), len(Y_te)
    tr_order = np.random.choice(np.arange(n_tr), size=n_tr, replace=False)
    te_order = np.random.choice(np.arange(n_te), size=n_te, replace=False)
    X_tr, Y_tr = X_tr[tr_order], Y_tr[tr_order]
    X_te, Y_te = X_te[te_order], Y_te[te_order]

    # Labels are handed back as tensors, features stay as arrays
    return X_tr, torch.from_numpy(Y_tr), X_te, torch.from_numpy(Y_te)
def get_KMNIST(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the KMNIST dataset and returns shuffled train and test splits,
    optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under ``path + '/KMNIST'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Fixed seed on NumPy's global RNG so the shift and shuffle are repeatable
    np.random.seed(42)
    num_cls = 10

    # Build both splits with downloading enabled
    root = path + '/KMNIST'
    train_set = datasets.KMNIST(root, train=True, download=True)
    test_set = datasets.KMNIST(root, train=False, download=True)

    # Raw features as arrays, and labels copied into fresh arrays
    X_tr, Y_tr = train_set.data.numpy(), np.array(train_set.targets.numpy())
    X_te, Y_te = test_set.data.numpy(), np.array(test_set.targets.numpy())

    # Start from the full index lists; a class imbalance narrows them down
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # Apply at most one shift per split: class imbalance wins over label noise
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Keep only the selected samples
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Draw the train order first, then the test order, and shuffle both splits
    n_tr, n_te = len(Y_tr), len(Y_te)
    tr_order = np.random.choice(np.arange(n_tr), size=n_tr, replace=False)
    te_order = np.random.choice(np.arange(n_te), size=n_te, replace=False)
    X_tr, Y_tr = X_tr[tr_order], Y_tr[tr_order]
    X_te, Y_te = X_te[te_order], Y_te[te_order]

    # Labels are handed back as tensors, features stay as arrays
    return X_tr, torch.from_numpy(Y_tr), X_te, torch.from_numpy(Y_te)
def get_FASHION_MNIST(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the FASHION_MNIST dataset and returns shuffled train and test
    splits, optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under
        ``path + '/FASHION_MNIST'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Fixed seed on NumPy's global RNG so the shift and shuffle are repeatable
    np.random.seed(42)
    num_cls = 10

    # Build both splits with downloading enabled
    root = path + '/FASHION_MNIST'
    train_set = datasets.FashionMNIST(root, train=True, download=True)
    test_set = datasets.FashionMNIST(root, train=False, download=True)

    # Raw features as arrays, and labels copied into fresh arrays
    X_tr, Y_tr = train_set.data.numpy(), np.array(train_set.targets.numpy())
    X_te, Y_te = test_set.data.numpy(), np.array(test_set.targets.numpy())

    # Start from the full index lists; a class imbalance narrows them down
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # Apply at most one shift per split: class imbalance wins over label noise
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Keep only the selected samples
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Draw the train order first, then the test order, and shuffle both splits
    n_tr, n_te = len(Y_tr), len(Y_te)
    tr_order = np.random.choice(np.arange(n_tr), size=n_tr, replace=False)
    te_order = np.random.choice(np.arange(n_te), size=n_te, replace=False)
    X_tr, Y_tr = X_tr[tr_order], Y_tr[tr_order]
    X_te, Y_te = X_te[te_order], Y_te[te_order]

    # Labels are handed back as tensors, features stay as arrays
    return X_tr, torch.from_numpy(Y_tr), X_te, torch.from_numpy(Y_te)
def get_CIFAR10(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the CIFAR10 dataset and returns shuffled train and test splits,
    optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under ``path + '/CIFAR10'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Fixed seed on NumPy's global RNG so the shift and shuffle are repeatable
    np.random.seed(42)
    num_cls = 10

    # Build both splits with downloading enabled
    root = path + '/CIFAR10'
    train_set = datasets.CIFAR10(root, train=True, download=True)
    test_set = datasets.CIFAR10(root, train=False, download=True)

    # Raw features, and labels converted to arrays for subset selection
    X_tr, Y_tr = train_set.data, np.array(train_set.targets)
    X_te, Y_te = test_set.data, np.array(test_set.targets)

    # Start from the full index lists; a class imbalance narrows them down
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # Apply at most one shift per split: class imbalance wins over label noise
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Keep only the selected samples
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Draw the train order first, then the test order, and shuffle both splits
    n_tr, n_te = len(Y_tr), len(Y_te)
    tr_order = np.random.choice(np.arange(n_tr), size=n_tr, replace=False)
    te_order = np.random.choice(np.arange(n_te), size=n_te, replace=False)
    X_tr, Y_tr = X_tr[tr_order], Y_tr[tr_order]
    X_te, Y_te = X_te[te_order], Y_te[te_order]

    # Labels are handed back as tensors, features stay as arrays
    return X_tr, torch.from_numpy(Y_tr), X_te, torch.from_numpy(Y_te)
def get_CIFAR100(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the CIFAR100 dataset and returns shuffled train and test splits,
    optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under ``path + '/CIFAR100'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Fixed seed on NumPy's global RNG so the shift and shuffle are repeatable
    np.random.seed(42)
    num_cls = 100

    # Build both splits with downloading enabled
    root = path + '/CIFAR100'
    train_set = datasets.CIFAR100(root, train=True, download=True)
    test_set = datasets.CIFAR100(root, train=False, download=True)

    # Raw features, and labels converted to arrays for subset selection
    X_tr, Y_tr = train_set.data, np.array(train_set.targets)
    X_te, Y_te = test_set.data, np.array(test_set.targets)

    # Start from the full index lists; a class imbalance narrows them down
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # Apply at most one shift per split: class imbalance wins over label noise
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Keep only the selected samples
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Draw the train order first, then the test order, and shuffle both splits
    n_tr, n_te = len(Y_tr), len(Y_te)
    tr_order = np.random.choice(np.arange(n_tr), size=n_tr, replace=False)
    te_order = np.random.choice(np.arange(n_te), size=n_te, replace=False)
    X_tr, Y_tr = X_tr[tr_order], Y_tr[tr_order]
    X_te, Y_te = X_te[te_order], Y_te[te_order]

    # Labels are handed back as tensors, features stay as arrays
    return X_tr, torch.from_numpy(Y_tr), X_te, torch.from_numpy(Y_te)
def get_STL10(path, tr_load_args=None, te_load_args=None):
    """
    Downloads the STL10 dataset and returns shuffled train and test splits,
    optionally after applying a distribution shift to either split.

    Parameters
    ----------
    path: str
        Path to save the downloaded dataset (stored under ``path + '/STL10'``)
    tr_load_args: dict, optional
        Distribution shift for the train split. If it has the key
        "class_imbalance_ratio", a class-imbalanced subset is selected;
        otherwise, if it has the key "noisy_labels_ratio", label noise is added.
    te_load_args: dict, optional
        Same as tr_load_args, applied to the test split.

    Returns
    ----------
    X_tr: numpy array
        Train set
    Y_tr: torch tensor
        Training Labels
    X_te: numpy array
        Test Set
    Y_te: torch tensor
        Test labels
    """
    # Deterministic random seed to ensure data initialization is consistent
    np.random.seed(42)

    # STL10 has 10 classes. Using a larger count would let label noise emit
    # labels that do not exist, and would make the class-imbalance selection
    # see empty classes (forcing the per-class sample budget to zero).
    num_cls = 10

    # Download the STL10 dataset
    root = path + '/STL10'
    data_tr = datasets.STL10(root, split="train", download=True)
    data_te = datasets.STL10(root, split="test", download=True)

    # Obtain the raw data; labels are copied into fresh arrays for subset selection
    X_tr, Y_tr = data_tr.data, np.array(data_tr.labels)
    X_te, Y_te = data_te.data, np.array(data_te.labels)

    # Initialize tr_idx and te_idx, which contain the full list of indices.
    # Used to select a subset from the full dataset.
    tr_idx = list(range(X_tr.shape[0]))
    te_idx = list(range(X_te.shape[0]))

    # If the load arguments specify a class imbalance or a noise ratio, apply the distribution
    # shift to the appropriate dataset. Note that only one of class imbalance or noise is applied.
    if tr_load_args is not None:
        if "class_imbalance_ratio" in tr_load_args:
            tr_idx = get_imbalanced_idx(Y_tr, num_cls, tr_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in tr_load_args:
            Y_tr = add_label_noise(Y_tr, num_cls, tr_load_args["noisy_labels_ratio"])
    if te_load_args is not None:
        if "class_imbalance_ratio" in te_load_args:
            te_idx = get_imbalanced_idx(Y_te, num_cls, te_load_args["class_imbalance_ratio"])
        elif "noisy_labels_ratio" in te_load_args:
            Y_te = add_label_noise(Y_te, num_cls, te_load_args["noisy_labels_ratio"])

    # Select the subset specified by tr_idx and te_idx
    X_tr, Y_tr = X_tr[tr_idx], Y_tr[tr_idx]
    X_te, Y_te = X_te[te_idx], Y_te[te_idx]

    # Shuffle train and test datasets (train order is drawn before test order).
    train_permutation = np.random.choice(np.arange(len(Y_tr)), size=len(Y_tr), replace=False)
    test_permutation = np.random.choice(np.arange(len(Y_te)), size=len(Y_te), replace=False)
    X_tr, Y_tr = X_tr[train_permutation], Y_tr[train_permutation]
    X_te, Y_te = X_te[test_permutation], Y_te[test_permutation]

    # Convert labels to tensor
    Y_tr = torch.from_numpy(Y_tr)
    Y_te = torch.from_numpy(Y_te)
    return X_tr, Y_tr, X_te, Y_te