---
title: Splitting data
keywords: fastai
sidebar: home_sidebar
summary: "Functions required to perform cross-validation and transform a unique time series sequence into multiple samples ready to be used by a time series model."
description: "Functions required to perform cross-validation and transform a unique time series sequence into multiple samples ready to be used by a time series model."
nb_path: "nbs/010_data.validation.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

check_overlap[source]

check_overlap(a, b, c=None)

{% endraw %} {% raw %}

check_splits_overlap[source]

check_splits_overlap(splits)

{% endraw %} {% raw %}

leakage_finder[source]

leakage_finder(*splits, verbose=True)

You can pass the splits either as a single tuple or as separate arguments (train, valid, ...).

{% endraw %} {% raw %}

balance_idx[source]

balance_idx(o, shuffle=False, random_state=None, verbose=False)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
a = np.arange(10)
b = np.arange(10, 20)
test_eq(check_overlap(a, b), False)
a = np.arange(10)
b = np.arange(9, 20)
test_eq(check_overlap(a, b), [9])
a = np.arange(10)
b = np.arange(10, 20)
c = np.arange(20, 30)
test_eq(check_overlap(a, b, c), False)
a = np.arange(10)
b = np.arange(10, 20)
c = np.arange(10, 30)
test_eq(check_overlap(a, b, c), ([], [], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]))
{% endraw %} {% raw %}
l = L(list(concat(np.zeros(5), np.ones(10)).astype(int)))
balanced_idx = balance_idx(l)
test_eq(np.mean(l[balanced_idx]), 0.5)
test_eq(isinstance(balanced_idx, L), True)

l = list(concat(np.zeros(5), np.ones(10)).astype(int))
balanced_idx = balance_idx(l)
test_eq(np.mean(L(l)[balanced_idx]), 0.5)
test_eq(isinstance(balanced_idx, L), True)

a = concat(np.zeros(5), np.ones(10)).astype(int)
balanced_idx = balance_idx(a)
test_eq(np.mean(a[balanced_idx]), 0.5)
test_eq(isinstance(balanced_idx, L), True)

t = concat(torch.zeros(5), torch.ones(10))
balanced_idx = balance_idx(t, shuffle=True)
test_eq(t[balanced_idx].mean(), 0.5)
test_eq(isinstance(balanced_idx, L), True)
{% endraw %} {% raw %}
a, b = np.arange(100_000), np.arange(100_000, 200_000)
{% endraw %} {% raw %}
soft_labels = True
filter_pseudolabels = .5
balanced_pseudolabels = True


pseudolabels = torch.rand(1000, 3)
pseudolabels = torch.softmax(pseudolabels, -1) if soft_labels else torch.argmax(pseudolabels, -1)
hpl = torch.argmax(pseudolabels, -1) if soft_labels else pseudolabels

if filter_pseudolabels and pseudolabels.ndim > 1: 
    error = 1 - pseudolabels.max(-1).values
    filt_pl_idx = np.arange(len(error))[error < filter_pseudolabels]
    filt_pl = pseudolabels[error < filter_pseudolabels]
    assert len(filt_pl) > 0, 'no filtered pseudolabels'
    filt_hpl = torch.argmax(filt_pl, -1)
else: 
    filt_pl_idx = np.arange(len(pseudolabels))
    filt_pl = filt_hpl = pseudolabels
pl_split = filt_pl_idx[balance_idx(filt_hpl)] if balanced_pseudolabels else filt_pl_idx
test_eq(hpl[pl_split].float().mean(), np.mean(np.unique(hpl)))
{% endraw %} {% raw %}

TrainValidTestSplitter[source]

TrainValidTestSplitter(n_splits:int=1, valid_size:Union[float, int]=0.2, test_size:Union[float, int]=0.0, train_only:bool=False, stratify:bool=True, balance:bool=False, shuffle:bool=True, random_state:Union[NoneType, int]=None, verbose:bool=False, **kwargs)

Split items into random train, valid (and, optionally, test) subsets.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

plot_splits[source]

plot_splits(splits)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_splits[source]

get_splits(o, n_splits:int=1, valid_size:float=0.2, test_size:float=0.0, train_only:bool=False, train_size:Union[NoneType, float, int]=None, balance:bool=False, shuffle:bool=True, stratify:bool=True, check_splits:bool=True, random_state:Union[NoneType, int]=None, show_plot:bool=True, verbose:bool=False)

Arguments: o : object to which splits will be applied, usually the target. n_splits : number of folds. Must be an int >= 1. valid_size : size of the validation set. Only used if n_splits = 1. If n_splits > 1, valid_size = (1. - test_size) / n_splits. test_size : size of the test set. Default = 0. train_only : if True, valid set == train set. This may be useful for debugging purposes. train_size : size of the train set used. Default = None (the remainder after assigning both valid and test). Useful to get learning curves with different train sizes or to get a small batch to debug a neural net. balance : whether to balance data so that train always contains the same number of items per class. shuffle : whether to shuffle data before splitting into batches. Note that the samples within each split will be shuffled. stratify : whether to create folds preserving the percentage of samples for each class. check_splits : whether to perform leakage and completion checks. random_state : when shuffle is True, random_state affects the ordering of the indices. Pass an int for reproducible output. show_plot : whether to plot the split distribution.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
n_splits                = 5
valid_size              = 0.2
test_size               = 0.2
train_only              = False  # set to True for debugging (valid = train)
train_size              = 5000
stratify                = True
balance                 = True
shuffle                 = True
predefined_splits       = None
show_plot               = True 


check_splits = True
random_state = 23

y = np.random.randint(0, 3, 10000) + 100

splits = get_splits(y, n_splits=n_splits, valid_size=valid_size, test_size=test_size, shuffle=shuffle, balance=balance, stratify=stratify,
                    train_only=train_only, train_size=train_size, check_splits=check_splits, random_state=random_state, show_plot=show_plot, verbose=True)
splits
(((#5000) [235,8976,4897,1498,4849,3475,1119,1342,3947,4486...],
  (#1600) [9713,351,8578,7491,2425,5132,2853,3901,4156,7189...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [6614,9281,2140,1505,3752,4701,7767,3767,3394,1169...],
  (#1600) [5559,4444,8943,7638,8802,6165,7681,3085,7725,9976...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [4284,7470,3786,4749,5333,7774,7929,2929,6967,893...],
  (#1600) [7898,2060,9287,9154,9123,3001,139,1751,4925,11...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [7302,5375,4133,7550,4632,2457,5838,7127,5026,7900...],
  (#1600) [2913,2077,2500,8569,5557,8610,7944,353,5834,4825...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [8602,7392,5931,9889,7471,282,9042,4722,3191,6620...],
  (#1600) [7156,1548,611,4376,3726,1689,237,4020,7402,8581...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]))
(((#5000) [235,8976,4897,1498,4849,3475,1119,1342,3947,4486...],
  (#1600) [9713,351,8578,7491,2425,5132,2853,3901,4156,7189...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [6614,9281,2140,1505,3752,4701,7767,3767,3394,1169...],
  (#1600) [5559,4444,8943,7638,8802,6165,7681,3085,7725,9976...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [4284,7470,3786,4749,5333,7774,7929,2929,6967,893...],
  (#1600) [7898,2060,9287,9154,9123,3001,139,1751,4925,11...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [7302,5375,4133,7550,4632,2457,5838,7127,5026,7900...],
  (#1600) [2913,2077,2500,8569,5557,8610,7944,353,5834,4825...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]),
 ((#5000) [8602,7392,5931,9889,7471,282,9042,4722,3191,6620...],
  (#1600) [7156,1548,611,4376,3726,1689,237,4020,7402,8581...],
  (#2000) [238,9763,2491,6625,9718,6583,2995,3743,4429,5657...]))
{% endraw %} {% raw %}
train_size=256
y = np.random.randint(0, 3, 1000) + 100
splits = get_splits(y, train_size=train_size, train_only=True)
test_eq(splits[0], splits[1])
test_eq(len(splits[0]), train_size)
splits
valid == train
valid == train
((#256) [769,379,269,919,433,165,797,549,529,500...],
 (#256) [769,379,269,919,433,165,797,549,529,500...])
((#256) [769,379,269,919,433,165,797,549,529,500...],
 (#256) [769,379,269,919,433,165,797,549,529,500...])
{% endraw %} {% raw %}

TSSplitter[source]

TSSplitter(valid_size:Union[int, float]=0.2, test_size:Union[int, float]=0.0, show_plot:bool=True)

Create a function that splits items between train and valid sets according to valid_size, without shuffling the data.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
y = np.arange(1000) + 100
test_eq(TimeSplitter(valid_size=0.2)(y)[1], L(np.arange(800, 1000).tolist()))
test_eq(TimeSplitter(valid_size=0.2)(y)[0], TimeSplitter(valid_size=200)(y)[0])
TimeSplitter(valid_size=0.2, show_plot=True)(y)
((#800) [0,1,2,3,4,5,6,7,8,9...],
 (#200) [800,801,802,803,804,805,806,807,808,809...])
((#800) [0,1,2,3,4,5,6,7,8,9...],
 (#200) [800,801,802,803,804,805,806,807,808,809...])
{% endraw %} {% raw %}
n_splits                = 5
valid_size              = 0.2  
test_size               = 0
train_only              = False  # set to True for debugging (valid = train)
train_size              = None
stratify                = True
balance                 = True
shuffle                 = True
predefined_splits       = None
show_plot               = True 


check_splits = True
random_state = 23

splits = get_splits(y, n_splits=n_splits, valid_size=valid_size, test_size=test_size, shuffle=shuffle, balance=balance, stratify=stratify,
                    train_only=train_only, train_size=train_size, check_splits=check_splits, random_state=random_state, show_plot=show_plot, verbose=True)
split = splits[0] if n_splits == 1 else splits[0][0]
y[split].mean(), split
stratify set to False as n_splits=5 cannot be greater than the min number of members in each class (1).
stratify set to False as n_splits=5 cannot be greater than the min number of members in each class (1).
(601.11, (#800) [314,194,782,789,502,917,137,415,904,181...])
(601.11, (#800) [314,194,782,789,502,917,137,415,904,181...])
{% endraw %} {% raw %}
list([splits[0], splits[1], splits[2], splits[3], splits[4]])
[((#800) [314,194,782,789,502,917,137,415,904,181...],
  (#200) [362,151,934,378,95,597,500,117,980,844...]),
 ((#800) [312,198,777,788,515,910,145,413,898,186...],
  (#200) [352,133,955,396,64,596,442,79,991,882...]),
 ((#800) [311,197,783,791,507,922,145,416,908,184...],
  (#200) [338,125,912,361,54,594,486,88,994,859...]),
 ((#800) [296,181,782,789,493,917,130,401,905,165...],
  (#200) [405,199,953,444,113,610,515,137,997,881...]),
 ((#800) [320,190,782,788,506,906,141,412,893,178...],
  (#200) [336,149,942,358,49,582,472,70,990,907...])]
[((#800) [314,194,782,789,502,917,137,415,904,181...],
  (#200) [362,151,934,378,95,597,500,117,980,844...]),
 ((#800) [312,198,777,788,515,910,145,413,898,186...],
  (#200) [352,133,955,396,64,596,442,79,991,882...]),
 ((#800) [311,197,783,791,507,922,145,416,908,184...],
  (#200) [338,125,912,361,54,594,486,88,994,859...]),
 ((#800) [296,181,782,789,493,917,130,401,905,165...],
  (#200) [405,199,953,444,113,610,515,137,997,881...]),
 ((#800) [320,190,782,788,506,906,141,412,893,178...],
  (#200) [336,149,942,358,49,582,472,70,990,907...])]
{% endraw %} {% raw %}
n_splits = 5
valid_size = 0.
test_size = 0.
shuffle = True
stratify = True
train_only = True
train_size = None
check_splits = True
random_state = 1
show_plot = True 

splits = get_splits(y, n_splits=n_splits, valid_size=valid_size, test_size=test_size, shuffle=shuffle, stratify=stratify,
                    train_only=train_only, train_size=train_size, check_splits=check_splits, random_state=random_state, show_plot=show_plot, verbose=True)
for split in splits: 
    test_eq(len(split[0]), len(y))
    test_eq(np.sort(split[0]), np.arange(len(y)))
stratify set to False as n_splits=5 cannot be greater than the min number of members in each class (1).
valid == train
stratify set to False as n_splits=5 cannot be greater than the min number of members in each class (1).
valid == train
{% endraw %} {% raw %}
n_splits = 5
y = np.random.randint(0, 2, 1000)

splits = get_splits(y, n_splits=n_splits, shuffle=False, check_splits=True)
test_eq(np.concatenate((L(zip(*splits))[1])), np.arange(len(y)))

splits = get_splits(y, n_splits=n_splits, shuffle=True, check_splits=True)
test_eq(np.sort(np.concatenate((L(zip(*splits))[1]))), np.arange(len(y)))
{% endraw %} {% raw %}
n_splits = 2
y = np.random.randint(0, 2, 1000)

splits = get_splits(y, n_splits=n_splits, test_size=0.2, shuffle=False)
for i in range(n_splits): leakage_finder(*splits[i])
test_eq(len(splits), n_splits)
test_eq(len(splits[0]), 3)
s = []
[s.extend(split) for split in splits[0]]
test_eq(np.sort(s), np.arange(len(y)))
s = []
[s.extend(split) for split in splits[1]]
test_eq(np.sort(s), np.arange(len(y)))
{% endraw %} {% raw %}
y = np.random.randint(0, 2, 1000)
splits1 = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True, shuffle=True)
splits2 = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True, shuffle=True)
splits3 = get_splits(y, valid_size=.25, test_size=0, random_state=None, stratify=True, shuffle=True)
splits4 = get_splits(y, valid_size=.25, test_size=0, random_state=None, stratify=True, shuffle=True)
test_eq(splits1[0], splits2[0])
test_ne(splits3[0], splits4[0])
{% endraw %} {% raw %}
y = np.random.randint(0, 2, 100)
splits = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True, shuffle=True)
test_eq(len(splits), 2)
{% endraw %} {% raw %}
y = np.random.randint(0, 2, 100)
splits = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True)
test_eq(len(splits), 2)
{% endraw %} {% raw %}
y = np.random.randint(0, 2, 100)
splits = get_splits(y, valid_size=.25, test_size=20, random_state=23, stratify=True)
test_eq(len(splits), 3)
leakage_finder(*splits)
{% endraw %} {% raw %}
splits = TrainValidTestSplitter(valid_size=.25, test_size=20, random_state=23, stratify=True)(np.random.randint(0, 2, 100))
test_eq(len(splits[1]), 25)
test_eq(len(splits[2]), 20)
{% endraw %} {% raw %}
o = np.random.randint(0, 2, 1000)
for p in [1, .75, .5, .25, .125]:
    splits = get_splits(o, train_size=p)
    test_eq(len(splits[0]), len(o) * .8 * p)
{% endraw %} {% raw %}
y = L([0] * 50 + [1] * 25 + [2] * 15 + [3] * 10)
splits = get_splits(y, valid_size=.2, test_size=.2)
test_eq(np.mean(y[splits[0]])==np.mean(y[splits[1]])==np.mean(y[splits[2]]), True)
splits
((#60) [71,74,62,49,64,94,29,1,14,6...],
 (#20) [90,30,55,35,72,27,88,15,5,68...],
 (#20) [38,91,37,18,13,4,51,80,44,61...])
((#60) [71,74,62,49,64,94,29,1,14,6...],
 (#20) [90,30,55,35,72,27,88,15,5,68...],
 (#20) [38,91,37,18,13,4,51,80,44,61...])
{% endraw %} {% raw %}
y = L([0] * 50 + [1] * 25 + [2] * 15 + [3] * 10)
splits = get_splits(y, n_splits=1, valid_size=.2, test_size=.2, shuffle=False)
# test_eq(splits[0] + splits[1] + splits[2], np.arange(100))
splits
((#60) [0,1,2,3,4,5,6,7,8,9...],
 (#20) [60,61,62,63,64,65,66,67,68,69...],
 (#20) [80,81,82,83,84,85,86,87,88,89...])
((#60) [0,1,2,3,4,5,6,7,8,9...],
 (#20) [60,61,62,63,64,65,66,67,68,69...],
 (#20) [80,81,82,83,84,85,86,87,88,89...])
{% endraw %} {% raw %}
splits = get_splits(np.random.randint(0,5,100), valid_size=0.213, test_size=17)
test_eq(len(splits[1]), 21)
test_eq(len(splits[2]), 17)
{% endraw %} {% raw %}
splits = get_splits(np.random.randint(0,5,100), valid_size=0.213, test_size=17, train_size=.2)
splits
((#12) [65,23,66,37,97,79,91,45,20,75...],
 (#21) [93,76,46,85,11,96,70,41,54,56...],
 (#17) [73,4,71,67,8,55,22,19,12,40...])
((#12) [65,23,66,37,97,79,91,45,20,75...],
 (#21) [93,76,46,85,11,96,70,41,54,56...],
 (#17) [73,4,71,67,8,55,22,19,12,40...])
{% endraw %} {% raw %}

get_predefined_splits[source]

get_predefined_splits(*xs)

xs is a list with X_train, X_valid, ...

{% endraw %} {% raw %}

combine_split_data[source]

combine_split_data(xs, ys=None)

xs is a list with X_train, X_valid, ... and ys is either None or a list with y_train, y_valid, ...

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_splits_len[source]

get_splits_len(splits)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
X_train, y_train, X_valid, y_valid = np.random.rand(3,3,4), np.random.randint(0,2,3), np.random.rand(2,3,4), np.random.randint(0,2,2)
X, y, splits = combine_split_data([X_train, X_valid], [y_train, y_valid])
test_eq(X_train, X[splits[0]])
test_eq(X_valid, X[splits[1]])
test_type(X_train, X)
test_type(y_train, y)
{% endraw %} {% raw %}
X_train, y_train, X_valid, y_valid = np.random.rand(3,4), np.random.randint(0,2,3), np.random.rand(2,4), np.random.randint(0,2,2)
X, y, splits = combine_split_data([X_train, X_valid], [y_train, y_valid])
test_eq(X_train[:, None], X[splits[0]])
test_eq(X_valid[:, None], X[splits[1]])
test_type(X_train, X)
test_type(y_train, y)
{% endraw %}