--- title: Data preparation keywords: fastai sidebar: home_sidebar summary: "Functions required to prepare X (and y) from a pandas dataframe." description: "Functions required to prepare X (and y) from a pandas dataframe." nb_path: "nbs/000c_data.preparation.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

df2Xy[source]

df2Xy(df, sample_col=None, feat_col=None, data_cols=None, target_col=None, to3d=True, splits=None, sort_by=None, ascending=True, y_func=None)

This function allows you to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset. sample_col: column that uniquely identifies each sample. feat_col: used for multivariate datasets. It indicates the column that identifies the feature in each row. data_cols: indicates the column/s where the data is located. If None, it means all columns (except the sample_col, feat_col, and target_col). target_col: indicates the column/s where the target is. to3d: turns X to 3d (including univariate time series). sort_by: used to indicate how to sort the dataframe. y_func: function used to calculate y for each sample (and target_col).

{% endraw %} {% raw %}

split_Xy[source]

split_Xy(X, y=None, splits=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Build a synthetic long-format dataframe: 1,000 samples with 10 feature rows each,
# 6 continuous data columns and 2 categorical target columns.
n_samples = 1_000
n_rows = 10_000

# Each sample id is repeated n_rows // n_samples times; feature ids cycle within each sample.
sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
cont = np.random.randn(n_rows, 6)
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array(['a', 'b', 'c'])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array(['a', 'b', 'c'])[ind_cat2]
# The 'target2' column must hold target2 (an independent draw), not a second copy of target.
data = np.concatenate([sample_ids, feat_ids, cont, target, target2], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
idx = np.random.choice(np.arange(len(df)), len(df), False)
# Numeric and string arrays were concatenated together, so the numeric columns are cast back explicitly.
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
# Shuffle the rows (idx is a permutation drawn without replacement) so later tests cannot rely on row order.
df = df.loc[idx].reset_index(drop=True)
df
sample_id feat_id 1 2 3 4 5 6 target target2
0 404 4 -0.500670 -0.003038 1.029969 0.255419 1.486936 -0.463189 a a
1 481 8 2.002796 0.341656 0.449789 -0.384179 -0.559138 0.114824 b b
2 365 1 1.294387 0.037775 1.723457 0.694628 0.629030 -0.427940 b b
3 210 3 -0.796168 -1.519713 -0.965083 -0.225898 -0.692436 0.695402 a a
4 406 1 -0.694345 1.104593 -0.249907 -0.603763 -0.256302 0.421814 c c
... ... ... ... ... ... ... ... ... ... ...
9995 414 8 1.911863 1.388889 -1.275277 -0.212019 0.520438 -1.026410 c c
9996 697 6 -0.706451 -1.014673 0.723689 -0.025465 0.425886 -1.312825 a a
9997 213 6 1.572289 -0.262521 0.881096 0.866777 -0.708215 -0.440335 c c
9998 309 6 2.117161 1.655981 -0.241192 -0.093749 0.570686 -0.735474 c c
9999 567 0 -0.663932 -0.569744 -1.129407 -0.082510 0.609969 -0.060051 a a

10000 rows × 10 columns

{% endraw %} {% raw %}
# y_func reduces the targets passed for each sample to one label: the mode along axis 1.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# Explicit data_cols and a single target column; feat_col identifies the feature rows of each sample.
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', data_cols=['1', '2', '3', '4', '5', '6'], target_col=['target'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000,))
# Spot-check one random sample against the dataframe sorted by sample and feature id.
# randint's upper bound is exclusive, so the largest sample_id is never picked.
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
# Same mode-based reduction, this time applied to two target columns.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# data_cols is omitted here; the expected X shape below still has 6 variables per feature row.
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
# Two target columns are expected to give one y column per target.
test_eq(y.shape, (1000, 2))
# Spot-check one random sample (randint's upper bound is exclusive).
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
from io import StringIO
TESTDATA = StringIO("""sample_id;value_0;value_1;target
    rob;2;3;hot
    alice;6;7;lukewarm
    eve;11;12;cold
    """)

# Parse the semicolon-separated test data: one row per sample, two value columns and one target.
df = pd.read_csv(TESTDATA, sep=";")
display(df)
# No feat_col and no y_func: each sample contributes a single row of two values.
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
# The rendered output lists the samples in sorted order (alice, eve, rob), following sort_by='sample_id'.
X, y
sample_id value_0 value_1 target
0 rob 2 3 hot
1 alice 6 7 lukewarm
2 eve 11 12 cold
(array([[[ 6,  7]],
 
        [[11, 12]],
 
        [[ 2,  3]]]),
 array(['lukewarm', 'cold', 'hot'], dtype=object))
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;timestep;values;target
    rob;1;2;hot
    alice;1;6;lukewarm
    eve;1;11;cold
    
    rob;2;3;hot
    alice;2;7;lukewarm
    eve;2;12;cold
    """)

# Parse the test data: three samples observed at two timesteps, one 'values' column.
df = pd.read_csv(TESTDATA, sep=";")
display(df)
# Each sample's target is repeated on every row; the mode along axis 1 reduces it to one label.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# No feat_col: rows are ordered with sort_by='timestep' and 'values' is the only data column.
# NOTE(review): the rendered output pairs values from different samples (e.g. 6 with 11,
# although rob=2,3 / alice=6,7 / eve=11,12) — confirm that the sort keeps each sample's rows together.
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)
sample_id timestep values target
0 rob 1 2 hot
1 alice 1 6 lukewarm
2 eve 1 11 cold
3 rob 2 3 hot
4 alice 2 7 lukewarm
5 eve 2 12 cold
[[[ 6 11]]

 [[ 2  7]]

 [[12  3]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

# Parse the test data: three samples, four traits each, two value columns.
df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows (a permutation drawn without replacement) so the result cannot depend on input order.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
# The target repeats on every row of a sample; the mode along axis 1 reduces it to one label.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# feat_col='trait' identifies the four feature rows of each sample.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
# In the rendered output the rows of each sample appear ordered by trait (blue, green, red, yellow).
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))
sample_id trait value_0 value_1 target
7 alice red 9 10 lukewarm
4 alice green 6 7 lukewarm
8 eve yellow 11 12 cold
0 rob green 2 3 hot
9 eve green 10 11 cold
10 eve blue 12 12 cold
2 rob blue 4 5 hot
3 rob red 5 6 hot
11 eve red 13 14 cold
1 rob yellow 3 4 hot
6 alice blue 8 9 lukewarm
5 alice yellow 7 8 lukewarm
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
    rob;green;2;3;hot;good
    rob;yellow;3;4;hot;good
    rob;blue;4;5;hot;good
    rob;red;5;6;hot;good
    alice;green;6;7;lukewarm;good
    alice;yellow;7;8;lukewarm;good
    alice;blue;8;9;lukewarm;good
    alice;red;9;10;lukewarm;good
    eve;yellow;11;12;cold;bad
    eve;green;10;11;cold;bad
    eve;blue;12;12;cold;bad
    eve;red;13;14;cold;bad
    """)

# Parse the test data: same layout as before but with two target columns.
df = pd.read_csv(TESTDATA, sep=";")
display(df)
# Mode along axis 1 reduces the repeated per-row targets of each sample.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# Passing a list of two target columns is expected to give one y column per target.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)
sample_id trait value_0 value_1 target1 target2
0 rob green 2 3 hot good
1 rob yellow 3 4 hot good
2 rob blue 4 5 hot good
3 rob red 5 6 hot good
4 alice green 6 7 lukewarm good
5 alice yellow 7 8 lukewarm good
6 alice blue 8 9 lukewarm good
7 alice red 9 10 lukewarm good
8 eve yellow 11 12 cold bad
9 eve green 10 11 cold bad
10 eve blue 12 12 cold bad
11 eve red 13 14 cold bad
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] [['lukewarm' 'good']
 ['cold' 'bad']
 ['hot' 'good']]
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

# Parse the test data and shuffle its rows (permutation drawn without replacement).
df = pd.read_csv(TESTDATA, sep=";")
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# target_col is not passed: only X is built and y is expected to be None even though y_func is given.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y, None)
sample_id trait value_0 value_1 target
1 rob yellow 3 4 hot
0 rob green 2 3 hot
4 alice green 6 7 lukewarm
5 alice yellow 7 8 lukewarm
11 eve red 13 14 cold
2 rob blue 4 5 hot
8 eve yellow 11 12 cold
3 rob red 5 6 hot
7 alice red 9 10 lukewarm
9 eve green 10 11 cold
6 alice blue 8 9 lukewarm
10 eve blue 12 12 cold
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] None
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
    rob;green;1;2;hot
    rob;yellow;1;3;hot
    rob;blue;1;4;hot
    rob;red;1;5;hot
    alice;green;1;6;lukewarm
    alice;yellow;1;7;lukewarm
    alice;blue;1;8;lukewarm
    alice;red;1;9;lukewarm
    eve;yellow;1;11;cold
    eve;green;1;10;cold
    eve;blue;1;12;cold
    eve;red;1;13;cold
    
    rob;green;2;3;hot
    rob;yellow;2;4;hot
    rob;blue;2;5;hot
    rob;red;2;6;hot
    alice;green;2;7;lukewarm
    alice;yellow;2;8;lukewarm
    alice;blue;2;9;lukewarm
    alice;red;2;10;lukewarm
    eve;yellow;2;12;cold
    eve;green;2;11;cold
    eve;blue;2;13;cold
    eve;red;2;14;cold
    """)

# Parse the test data: three samples x four traits x two timesteps, one 'values' column.
df = pd.read_csv(TESTDATA, sep=";")
display(df)
# Mode along axis 1 reduces the repeated per-row targets of each sample.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# feat_col='trait' identifies the features and sort_by='timestep' orders the steps.
# NOTE(review): in the rendered output the first sample ('lukewarm', i.e. alice with values 6-10)
# contains 12 and 13, which belong to eve — confirm the sort keeps samples and traits together.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))
sample_id trait timestep values target
0 rob green 1 2 hot
1 rob yellow 1 3 hot
2 rob blue 1 4 hot
3 rob red 1 5 hot
4 alice green 1 6 lukewarm
5 alice yellow 1 7 lukewarm
6 alice blue 1 8 lukewarm
7 alice red 1 9 lukewarm
8 eve yellow 1 11 cold
9 eve green 1 10 cold
10 eve blue 1 12 cold
11 eve red 1 13 cold
12 rob green 2 3 hot
13 rob yellow 2 4 hot
14 rob blue 2 5 hot
15 rob red 2 6 hot
16 alice green 2 7 lukewarm
17 alice yellow 2 8 lukewarm
18 alice blue 2 9 lukewarm
19 alice red 2 10 lukewarm
20 eve yellow 2 12 cold
21 eve green 2 11 cold
22 eve blue 2 13 cold
23 eve red 2 14 cold
[[[ 8  6]
  [ 9  7]
  [12 10]
  [13 11]]

 [[ 4  2]
  [ 5  3]
  [ 9  7]
  [10  8]]

 [[13 11]
  [14 12]
  [ 5  3]
  [ 6  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}

df2np3d[source]

df2np3d(df, groupby, data_cols=None)

Transforms a df (with the same number of rows per group in groupby) to a 3d ndarray

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Two users with four rows each, plus three random value columns.
user = np.array([1,2]).repeat(4).reshape(-1,1)
val = np.random.rand(8, 3)
data = np.concatenate([user, val], axis=-1)
df = pd.DataFrame(data, columns=['user', 'x1', 'x2', 'x3'])
# Expected shape (2, 3, 4) — presumably (groups, data columns, rows per group); confirm against df2np3d.
test_eq(df2np3d(df, ['user'], ['x1', 'x2', 'x3']).shape, (2, 3, 4))
{% endraw %} {% raw %}

add_missing_value_cols[source]

add_missing_value_cols(df, cols=None, dtype=float)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Random data in which every value above 0.8 is replaced by NaN.
data = np.random.randn(10, 2)
mask = data > .8
data[mask] = np.nan
df = pd.DataFrame(data, columns=['A', 'B'])
df = add_missing_value_cols(df, cols=None, dtype=float)
# Each missing_<col> indicator column must sum to the number of NaNs in <col>.
test_eq(df['A'].isnull().sum(), df['missing_A'].sum())
test_eq(df['B'].isnull().sum(), df['missing_B'].sum())
df
A B missing_A missing_B
0 NaN 0.730925 1.0 0.0
1 -1.548153 -0.975422 0.0 0.0
2 NaN -2.221517 1.0 0.0
3 0.385764 -0.741613 0.0 0.0
4 -0.051031 NaN 0.0 1.0
5 NaN 0.673614 1.0 0.0
6 0.426181 -0.786155 0.0 0.0
7 0.436134 0.686743 0.0 0.0
8 0.472773 0.133506 0.0 0.0
9 -0.179016 NaN 0.0 1.0
{% endraw %} {% raw %}

add_missing_timestamps[source]

add_missing_timestamps(df, datetime_col, groupby=None, fill_value=nan, range_by_group=True, freq=None)

Fills missing timestamps in a dataframe to a desired frequency. Args: df: pandas DataFrame. datetime_col: column that contains the datetime data. groupby: column used to identify unique_ids. fill_value: value that will be inserted where missing dates exist. Default: np.nan. range_by_group: if True, dates will be filled between min and max dates for each group. Otherwise, between the min and max dates in the df. freq: frequency used to fill in the missing datetimes.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# NOTE(review): today is not used anywhere else in this cell.
today = datetime.now()
# A full week of daily dates, then the same week with its two middle days (05-04, 05-05) dropped.
dates = pd.date_range('2021-05-01', '2021-05-07')
dates2 = np.concatenate([dates[:3], dates[-2:]])
# Rebuilding the range from the gapped dates' min and max must recover the full week.
dates3 = pd.date_range(dates2.min(), dates2.max())
test_eq(dates, dates3)
# Columns: the gapped dates, random values 'a', and a 0/1 column 'b' used later as a group key.
data = np.zeros((len(dates2), 3))
data[:, 0] = dates2
data[:, 1] = np.random.rand(len(dates2))
data[:, 2] = np.array([0, 1, 0, 0, 1])
cols = ['date', 'a', 'b']
date_df = pd.DataFrame(data, columns=cols)
# The dates were stored in a float array, so the column is converted back to datetime.
date_df['date'] = pd.to_datetime(date_df['date'])
date_df
date a b
0 2021-05-01 0.909024 0.0
1 2021-05-02 0.624528 1.0
2 2021-05-03 0.354594 0.0
3 2021-05-06 0.833744 0.0
4 2021-05-07 0.456993 1.0
{% endraw %} {% raw %}
# No groupby: the rendered output shows 2021-05-04 and 2021-05-05 inserted with NaN in the other columns.
add_missing_timestamps(date_df, 'date')
date a b
0 2021-05-01 0.909024 0.0
1 2021-05-02 0.624528 1.0
2 2021-05-03 0.354594 0.0
3 2021-05-04 NaN NaN
4 2021-05-05 NaN NaN
5 2021-05-06 0.833744 0.0
6 2021-05-07 0.456993 1.0
{% endraw %} {% raw %}
# Grouped by 'b' with the default range_by_group=True: each group is filled between its own min and max dates.
add_missing_timestamps(date_df, 'date', groupby='b')
date a b
0 2021-05-01 0.909024 0.0
1 2021-05-02 NaN 0.0
2 2021-05-03 0.354594 0.0
3 2021-05-04 NaN 0.0
4 2021-05-05 NaN 0.0
5 2021-05-06 0.833744 0.0
6 2021-05-02 0.624528 1.0
7 2021-05-03 NaN 1.0
8 2021-05-04 NaN 1.0
9 2021-05-05 NaN 1.0
10 2021-05-06 NaN 1.0
11 2021-05-07 0.456993 1.0
{% endraw %} {% raw %}
# range_by_group=False: every group is filled between the min and max dates of the whole dataframe.
add_missing_timestamps(date_df, 'date', groupby='b', range_by_group=False)
date a b
0 2021-05-01 0.909024 0.0
1 2021-05-02 NaN 0.0
2 2021-05-03 0.354594 0.0
3 2021-05-04 NaN 0.0
4 2021-05-05 NaN 0.0
5 2021-05-06 0.833744 0.0
6 2021-05-07 NaN 0.0
7 2021-05-01 NaN 1.0
8 2021-05-02 0.624528 1.0
9 2021-05-03 NaN 1.0
10 2021-05-04 NaN 1.0
11 2021-05-05 NaN 1.0
12 2021-05-06 NaN 1.0
13 2021-05-07 0.456993 1.0
{% endraw %} {% raw %}

time_encoding[source]

time_encoding(series, freq, max_val=None)

Transforms a pandas series of dtype datetime64 (of any freq) or DatetimeIndex into 2 float arrays

Available options: microsecond, millisecond, second, minute, hour, day = day_of_month = dayofmonth, day_of_week = weekday = dayofweek, day_of_year = dayofyear, week = week_of_year = weekofyear, month and year

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Plot the two arrays returned by time_encoding for several frequency options,
# passing a pandas Series of daily dates from 2021-03-01 until today.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    tdf = pd.DataFrame(pd.date_range('2021-03-01', datetime.today()), columns=['date'])
    a,b = time_encoding(tdf.date, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
# Same plots as for a Series input, but passing a DatetimeIndex directly.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    dateindex = pd.date_range('2021-03-01', datetime.today())
    a,b = time_encoding(dateindex, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
# Encode the day of week of date_df's dates; the two returned arrays are named dow_sin and dow_cos here.
dow_sin, dow_cos = time_encoding(date_df['date'], 'dayofweek')
plt.plot(dow_sin)
plt.plot(dow_cos)
plt.title('DayOfWeek')
plt.show()
# Store both encodings as new columns of date_df (later cells see these columns).
date_df['dow_sin'] = dow_sin
date_df['dow_cos'] = dow_cos
date_df
date a b dow_sin dow_cos
0 2021-05-01 0.909024 0.0 -0.974928 -0.222521
1 2021-05-02 0.624528 1.0 -0.781831 0.623490
2 2021-05-03 0.354594 0.0 0.000000 1.000000
3 2021-05-06 0.833744 0.0 0.433884 -0.900969
4 2021-05-07 0.456993 1.0 -0.433884 -0.900969
{% endraw %} {% raw %}

forward_gaps[source]

forward_gaps(o, nan_to_num=0, normalize=True)

Number of sequence steps since previous real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

backward_gaps[source]

backward_gaps(o, nan_to_num=0, normalize=True)

Number of sequence steps to next real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

nearest_gaps[source]

nearest_gaps(o, nan_to_num=0, normalize=True)

Number of sequence steps to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

get_gaps[source]

get_gaps(o:Tensor, nan_to_num:int=0, forward:bool=True, backward:bool=True, nearest:bool=True, normalize:bool=True)

Number of sequence steps from previous, to next and/or to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Random (1, 2, 8) tensor in which values below 0.6 are replaced by NaN.
t = torch.rand(1, 2, 8)
# NOTE(review): arr is taken before the masking; presumably it shares memory with t and
# therefore also receives the NaNs — confirm.
arr = t.numpy()
t[t <.6] = np.nan
# The smallest nearest-gap value must lie in [0, 1] for both tensor and array inputs.
test_ge(nearest_gaps(t).min().item(), 0)
test_ge(nearest_gaps(arr).min(), 0)
test_le(nearest_gaps(t).min().item(), 1)
test_le(nearest_gaps(arr).min(), 1)
# The gap outputs must not contain NaN.
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
# With the defaults (forward, backward and nearest all True) the 2 input variables give 6 output channels.
ag = get_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)
{% endraw %} {% raw %}

add_delta_timestamp_cols[source]

add_delta_timestamp_cols(df, cols=None, groupby=None, forward=True, backward=True, nearest=True, nan_to_num=0, normalize=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Fill the missing dates per group 'b', then add the delta columns for 'a'
# (the rendered output shows a_dt_fwd, a_dt_bwd and a_dt_nearest).
date_df = add_missing_timestamps(date_df, 'date', groupby='b')
add_delta_timestamp_cols(date_df, cols='a')
date a b dow_sin dow_cos a_dt_fwd a_dt_bwd a_dt_nearest
0 2021-05-01 0.909024 0.0 -0.974928 -0.222521 0.083333 0.083333 0.083333
1 2021-05-02 NaN 0.0 NaN NaN 0.166667 0.166667 0.166667
2 2021-05-03 0.354594 0.0 0.000000 1.000000 0.083333 0.083333 0.083333
3 2021-05-04 NaN 0.0 NaN NaN 0.166667 0.250000 0.166667
4 2021-05-05 NaN 0.0 NaN NaN 0.250000 0.166667 0.166667
5 2021-05-06 0.833744 0.0 0.433884 -0.900969 0.083333 0.083333 0.083333
6 2021-05-02 0.624528 1.0 -0.781831 0.623490 0.083333 0.083333 0.083333
7 2021-05-03 NaN 1.0 NaN NaN 0.166667 0.416667 0.166667
8 2021-05-04 NaN 1.0 NaN NaN 0.250000 0.333333 0.250000
9 2021-05-05 NaN 1.0 NaN NaN 0.333333 0.250000 0.250000
10 2021-05-06 NaN 1.0 NaN NaN 0.416667 0.166667 0.166667
11 2021-05-07 0.456993 1.0 -0.433884 -0.900969 0.083333 0.083333 0.083333
{% endraw %} {% raw %}

SlidingWindow[source]

SlidingWindow(window_len:int, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True)

Applies a sliding window to a 1d or 2d input (np.ndarray, torch.Tensor or pd.DataFrame) Args: window_len = length of lookback window stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows to pad remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict:

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y.
                    [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y

Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Non-overlapping windows (stride equals the window length) over a 13-step, 3-variable input.
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
# get_y=[] creates no y (unlabeled data), the case in which pad_remainder pads the last, incomplete window.
# The rendered X has a 4th row per window flagging the padded steps (add_padding_feature defaults to True).
X, y = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X
input shape: (13, 3)
array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [ 0.,  0.,  0.,  1.,  1.]]])
{% endraw %} {% raw %}
wl = 5
t = np.arange(10)
print('input shape:', t.shape)
# Only window_len is given; the signature defaults apply (stride=1, horizon=1).
X, y = SlidingWindow(wl)(t)
# A 1d input yields windows of shape (1, wl).
test_eq(X.shape[1:], (1, wl))
itemify(X,)
input shape: (10,)
(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]
{% endraw %} {% raw %}
wl = 5
h = 1

t = np.arange(10)
print('input shape:', t.shape)
# horizon=1: each window is paired with the single step that follows it.
X, y = SlidingWindow(wl, stride=1, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
# With a single future step the target of each item is a scalar.
test_eq(items[0][1].shape, ())
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9)]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10)
print('input shape:', t.shape)
# horizon=2: each window is paired with the next two steps.
X, y = SlidingWindow(wl, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
# The target of each item now has one entry per future step.
test_eq(items[0][1].shape, (2, ))
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

# Input laid out as (n_vars, seq_len), hence seq_first=False below.
t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
# get_y=None: all data is used as y.
X, y = SlidingWindow(wl, stride=1, horizon=h, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

# Input laid out as (n_vars, seq_len), hence seq_first=False below.
t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
# get_y is left at its default (None); the rendered items match the explicit get_y=None case.
X, y = SlidingWindow(wl, stride=1, horizon=h, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
# stride=3: the window moves 3 steps at a time; the rendered items start at steps 0 and 3.
X, y = SlidingWindow(wl, stride=3, horizon=1, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[3, 4, 5, 6, 7]]), 8)]
{% endraw %} {% raw %}
wl = 5
start = 3

t = np.arange(20)
print('input shape:', t.shape)
# stride=None means stride=window_len (no overlap); start=3 applies the first window at step 3.
X, y = SlidingWindow(wl, stride=None, horizon=1, start=start)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), 8), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[13, 14, 15, 16, 17]]), 18)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
# Same data wrapped in a single-column DataFrame.
df = pd.DataFrame(t, columns=['var'])
display(df)
# stride=None means stride=window_len, so the windows do not overlap.
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
# Same data wrapped in a single-column DataFrame.
df = pd.DataFrame(t, columns=['var'])
display(df)
# stride=1: consecutive windows overlap by wl - 1 steps (15 items in the rendered output).
X, y = SlidingWindow(wl, stride=1, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9), (array([[5, 6, 7, 8, 9]]), 10), (array([[ 6,  7,  8,  9, 10]]), 11), (array([[ 7,  8,  9, 10, 11]]), 12), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[ 9, 10, 11, 12, 13]]), 14), (array([[10, 11, 12, 13, 14]]), 15), (array([[11, 12, 13, 14, 15]]), 16), (array([[12, 13, 14, 15, 16]]), 17), (array([[13, 14, 15, 16, 17]]), 18), (array([[14, 15, 16, 17, 18]]), 19)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
# Transposed DataFrame: one row per variable and one column per step, hence seq_first=False below.
df = pd.DataFrame(t, columns=['var']).T
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None, seq_first=False)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
var 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

# Three variables: the same 0..9 ramp scaled by 1, 10 and 100.
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
# get_x and get_y keep their defaults (None): all columns are used both as x and as y.
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), array([  5,  50, 500])), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), array([  6,  60, 600])), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), array([  7,  70, 700])), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), array([  8,  80, 800])), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), array([  9,  90, 900]))]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

# Three variables: the same 0..9 ramp scaled by 1, 10 and 100.
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
# get_y=0 takes the target from the first column only; X still contains all three variables.
X, y = SlidingWindow(wl, horizon=1, get_y=0)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), 5), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), 6), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), 7), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), 8), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), 9)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

# Three variables: the same 0..9 ramp scaled by 1, 10 and 100; the last column is named 'target'.
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
# get_x and get_y are given as column names: x uses every column but the last, y uses 'target'.
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
# X no longer includes the target column.
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())
input shape: torch.Size([10, 3])
var_0 var_1 target
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[ 0,  1,  2,  3,  4],
       [ 0, 10, 20, 30, 40]]), 500), (array([[ 1,  2,  3,  4,  5],
       [10, 20, 30, 40, 50]]), 600), (array([[ 2,  3,  4,  5,  6],
       [20, 30, 40, 50, 60]]), 700), (array([[ 3,  4,  5,  6,  7],
       [30, 40, 50, 60, 70]]), 800), (array([[ 4,  5,  6,  7,  8],
       [40, 50, 60, 70, 80]]), 900)]
{% endraw %} {% raw %}
# Window length defined in this cell so the shape assertion below does not rely on an earlier cell.
wl = 5
n_vars = 3

# Three random-walk series of 1,000 steps.
t = (np.random.rand(1000, n_vars) - .5).cumsum(0)
print(t.shape)
plt.plot(t)
plt.show()
# horizon=0 takes y from the last step of each window; stride=None means non-overlapping windows.
# Columns 0 and 1 are the inputs, column 2 is the target.
X, y = SlidingWindow(wl, stride=None, horizon=0, get_x=[0,1], get_y=2)(t)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(1000, 3)
(200, 2, 5) (200,)
{% endraw %} {% raw %}
wl = 5
n_vars = 3

# Three random-walk series of 100 steps; the last column is named 'target'.
t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
# Pass wl (not a repeated literal) so the window length and the shape assertion cannot drift apart.
# horizon=0 takes y from the last step of each window; stride keeps its default of 1.
X, y = SlidingWindow(wl, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 3)
var_0 var_1 target
0 -0.326437 0.479251 0.310728
1 -0.750432 0.290482 0.519127
2 -0.664343 0.498198 0.502765
3 -1.112040 0.977210 0.979918
4 -1.169530 1.289907 0.499410
... ... ... ...
95 1.045623 -0.797442 3.577796
96 0.880116 -0.402711 3.726002
97 0.465610 -0.587076 4.185655
98 0.073296 -0.393737 3.852723
99 0.432735 -0.183962 4.112044

100 rows × 3 columns

(96, 2, 5) (96,)
{% endraw %} {% raw %}
# Window length defined in this cell so the shape assertion below does not rely on an earlier cell.
wl = 5
seq_len = 100
n_vars = 5
# Five random-walk series; the last column is named 'target'.
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
# seq_first=True: the DataFrame has one row per step and one column per variable.
X, y = SlidingWindow(wl, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
var_0 var_1 var_2 var_3 target
0 0.122803 -0.192390 0.082444 -0.434987 -0.175407
1 0.077732 0.097740 0.548832 0.053376 -0.592007
2 0.378958 -0.114986 0.963581 -0.443566 -0.480206
3 -0.044738 -0.227157 1.035744 -0.231511 -0.740553
4 -0.473692 0.049397 1.306891 0.085110 -0.621726
... ... ... ... ... ...
95 -0.350472 4.259947 -1.097730 -1.190326 2.244842
96 -0.393774 4.111698 -0.845753 -1.085674 2.186361
97 -0.728022 4.506924 -0.558057 -1.454727 2.557359
98 -0.386212 4.313379 -0.339832 -1.674739 2.947028
99 -0.392555 4.753925 0.123853 -1.984791 2.501331

100 rows × 5 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
# Window length defined in this cell so the shape assertion below does not rely on an earlier cell.
wl = 5
seq_len = 100
n_vars = 5

# Five random-walk series; the last one is named 'target'.
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
# Transposed DataFrame: one row per variable and one column per step, hence seq_first=False below.
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(wl, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 -0.007642 -0.144568 -0.312489 -0.337328 -0.500791 -0.580364 -0.415202 0.046781 -0.439109 -0.217555 ... 2.929046 3.343044 3.726367 3.958468 3.613301 3.139824 3.607776 3.290137 3.509609 3.297065
var_1 -0.219972 0.263573 -0.200211 -0.048552 -0.037821 -0.323103 -0.782897 -1.031557 -0.558828 -0.213838 ... 2.211676 2.367846 2.249200 2.379222 2.037661 2.079086 1.605248 1.611417 1.236507 1.203168
var_2 0.003493 -0.445496 -0.930695 -1.341285 -1.571041 -1.466565 -1.038938 -1.206081 -0.711576 -0.399879 ... -0.327565 -0.639788 -0.696777 -1.118131 -0.622657 -0.908105 -1.249232 -0.828962 -0.657256 -1.111281
var_3 0.433242 0.296993 0.210311 0.156670 -0.091715 -0.299894 -0.246204 -0.371425 0.023263 -0.396230 ... -0.034904 0.109376 0.178159 0.405560 0.526120 0.179910 0.261559 0.676023 1.090130 1.190686
target -0.022664 -0.227535 -0.593308 -0.480140 -0.352854 -0.484074 -0.936421 -0.747688 -0.289341 -0.435805 ... 5.248195 4.831753 4.890124 5.068724 4.762473 5.003428 5.472158 5.615190 6.005997 5.807494

5 rows × 100 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
seq_len, n_vars = 100, 5
t = np.cumsum(np.random.rand(seq_len, n_vars) - .5, axis=0)
print(t.shape)
columns = [f'var_{i}' for i in range(n_vars - 1)]
columns.append('target')
# Transposed layout: variables as rows, time steps as columns.
df = pd.DataFrame(t, columns=columns).T
display(df)
feature_rows = columns[:-1]
# stride=None: the printed result has 20 samples for 100 steps, i.e. windows do not overlap.
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=feature_rows, get_y='target', seq_first=False)(df)
# NOTE(review): `wl` is not set in this cell; it must already hold the window length (5).
test_eq(X[0].shape, (n_vars - 1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 -0.034849 -0.440509 -0.204999 -0.033072 0.348420 0.240488 0.716651 0.661727 0.678404 0.598049 ... -2.340155 -2.550953 -3.012781 -3.314128 -3.600779 -3.119236 -3.546340 -4.039925 -4.223019 -4.297021
var_1 0.059505 0.463835 0.092333 -0.173710 -0.284092 -0.507932 -0.053631 -0.402823 -0.730973 -0.446001 ... -2.489691 -2.295292 -2.793545 -2.544649 -2.514049 -2.558386 -2.851392 -2.453308 -2.770597 -3.147457
var_2 0.393941 -0.092135 0.000388 0.373813 0.765149 0.749741 1.180193 1.427538 1.416091 1.636049 ... 4.710221 4.638899 4.314867 4.756013 5.153733 4.781412 4.434786 4.502743 4.879928 4.729175
var_3 0.048540 -0.322998 -0.612821 -0.869725 -0.659956 -0.178982 -0.054909 0.331829 0.505056 0.538960 ... 1.399820 1.361124 1.802954 1.938037 1.657026 1.389876 1.584459 1.577697 1.248395 1.237694
target -0.041111 -0.339473 -0.160612 -0.612748 -0.639710 -0.836934 -1.229467 -0.730630 -1.103894 -1.563996 ... 2.276318 2.424404 2.620477 2.884868 2.675900 2.693670 2.931990 3.241520 3.069366 3.172559

5 rows × 100 columns

(20, 4, 5) (20,)
{% endraw %} {% raw %}
seq_len, n_vars = 100, 5
t = np.cumsum(np.random.rand(seq_len, n_vars) - .5, axis=0)
print(t.shape)
columns = [f'var_{i}' for i in range(n_vars - 1)]
columns.append('target')
df = pd.DataFrame(t, columns=columns)
display(df)
feature_cols = columns[:-1]
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=feature_cols, get_y='target', seq_first=True)(df)
# Unshuffled split: the printed result keeps the validation indices at the end of the series.
splitter = TrainValidTestSplitter(valid_size=.2, shuffle=False)
splits = splitter(y)
X.shape, y.shape, splits
(100, 5)
var_0 var_1 var_2 var_3 target
0 -0.483027 0.334016 -0.279455 -0.231867 0.021331
1 -0.443727 0.721818 -0.321614 -0.172625 0.172358
2 -0.864159 0.868601 -0.571087 -0.240461 0.251827
3 -1.080842 1.355742 -0.634676 -0.138883 0.529263
4 -1.022303 1.720073 -0.304356 0.214536 0.984019
... ... ... ... ... ...
95 0.211623 3.157325 0.553882 -6.067428 2.025751
96 -0.267864 2.874727 0.673211 -6.096492 1.931358
97 -0.756167 3.301079 0.471411 -6.585188 2.390795
98 -0.673001 3.092239 0.237150 -6.358828 2.386512
99 -0.583140 3.210068 0.214535 -6.253794 2.622534

100 rows × 5 columns

((96, 4, 5),
 (96,),
 ((#77) [0,1,2,3,4,5,6,7,8,9...], (#19) [77,78,79,80,81,82,83,84,85,86...]))
{% endraw %} {% raw %}
# Small deterministic frame: two identical feature columns running 0.0 -> 1.0
# in 11 steps, plus an integer target equal to the row position (0..10).
data = np.column_stack([np.linspace(0, 1, 11), np.linspace(0, 1, 11), np.arange(11)])
df_test = pd.DataFrame(data, columns=['col1', 'col2', 'target']).astype({'target': int})
df_test
col1 col2 target
0 0.0 0.0 0
1 0.1 0.1 1
2 0.2 0.2 2
3 0.3 0.3 3
4 0.4 0.4 4
5 0.5 0.5 5
6 0.6 0.6 6
7 0.7 0.7 7
8 0.8 0.8 8
9 0.9 0.9 9
10 1.0 1.0 10
{% endraw %} {% raw %}
def _y_func(o):
    "Return column 0 of `o`, i.e. the first value of every row."
    first_values = o[:, 0]
    return first_values
{% endraw %} {% raw %}
# Padded, non-overlapping windows with a y_func: check shapes and that the
# label of every window matches its first time step.
for wl in np.arange(1, 20):
    windower = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=_y_func)
    x, y = windower(df_test)
    n_windows, n_feats, n_steps = x.shape
    # The remainder is padded, so a partial last window still counts.
    test_eq(n_windows, math.ceil(len(df_test) / wl))
    test_eq(n_windows, y.shape[0])
    # Two selected columns plus one extra channel (presumably the padding indicator).
    test_eq(n_feats, 3)
    test_eq(n_steps, wl)
    # col1 is target / 10, so scaling the first step of col1 by 10 must give y.
    test_close(x[:, 0, 0] * 10, y)
{% endraw %} {% raw %}
# Same shape checks as with a y_func, but leaving the labels unprocessed.
for wl in np.arange(1, 20):
    windower = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)
    x, y = windower(df_test)
    n_windows, n_feats, n_steps = x.shape
    # The remainder is padded, so a partial last window still counts.
    test_eq(n_windows, math.ceil(len(df_test) / wl))
    test_eq(n_windows, y.shape[0])
    # Two selected columns plus one extra channel (presumably the padding indicator).
    test_eq(n_feats, 3)
    test_eq(n_steps, wl)
{% endraw %} {% raw %}
# Without padding, an incomplete trailing window is dropped.
for wl in np.arange(1, len(df_test) + 1):
    windower = SlidingWindow(wl, None, pad_remainder=False, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)
    x, y = windower(df_test)
    n_windows, n_feats, n_steps = x.shape
    test_eq(n_windows, len(df_test) // wl)
    test_eq(n_windows, y.shape[0])
    # Only the two selected columns: no extra channel is expected here.
    test_eq(n_feats, 2)
    test_eq(n_steps, wl)
{% endraw %} {% raw %}
# Unlabeled data (get_y=[]): only the x windows are checked.
for wl in np.arange(1, 20):
    windower = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=[])
    x, _ = windower(df_test)
    n_windows, n_feats, n_steps = x.shape
    # The remainder is padded, so a partial last window still counts.
    test_eq(n_windows, math.ceil(len(df_test) / wl))
    # Two selected columns plus one extra channel (presumably the padding indicator).
    test_eq(n_feats, 3)
    test_eq(n_steps, wl)
{% endraw %} {% raw %}
# Unlabeled data without padding; the second positional argument is set to the window length.
for wl in np.arange(2, len(df_test)):
    windower = SlidingWindow(wl, wl, pad_remainder=False, get_x=['col1', 'col2'], get_y=[])
    x, _ = windower(df_test)
    n_windows, n_feats, n_steps = x.shape
    # An incomplete trailing window is dropped.
    test_eq(n_windows, len(df_test) // wl)
    test_eq(n_feats, 2)
    test_eq(n_steps, wl)
{% endraw %} {% raw %}

SlidingWindowPanel[source]

SlidingWindowPanel(window_len:int, unique_id_cols:list, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True, return_key:bool=False, verbose:bool=True)

Applies a sliding window to a pd.DataFrame.

Args: window_len = length of lookback window unique_id_cols = pd.DataFrame columns that will be used to identify a time series for each entity. stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows to pad remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict:

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of |n| past steps (n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y
return_key          = when True, the key corresponding to unique_id_cols for each sample is returned
verbose             = controls verbosity. True or 1 displays progress bar. 2 or more show records that cannot be created due to its length.


Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input. Shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
samples = 100_000
wl = 5
n_vars = 10

# Column i holds the time step multiplied by 10**i.
t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
# Two extra entities derived from the first one by shifting a few columns
# (including the `device` id, which becomes 1 and 2 respectively).
df2 = df.copy()
df3 = df.copy()
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
# Keep only 4 rows for device 1 (.loc slicing is label-based and inclusive).
# Copy the slice so that adding `region` below does not write into a view.
df2 = df2.loc[:3].copy()
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, df2, df3]).reset_index(drop=True)
df['index'] = np.arange(len(df))
# Shuffle the rows so the transform has to rely on `sort_by` to restore the time order.
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 time device target region index
0 23458 234580 2345800 23458000 234580000 2345800000 23458000000 234580000000 2345800000000 23458000000000 23458 0 1 A 23458
1 24003 240012 2400102 24001000 240010000 2400100000 24001000000 240010000000 2400100000000 24001000000000 24001 2 3 B 124005
2 722 7202 72002 720000 7200000 72000000 720000000 7200000000 72000000000 720000000000 720 2 2 B 100724
3 94375 943732 9437302 94373000 943730000 9437300000 94373000000 943730000000 9437300000000 94373000000000 94373 2 2 B 194377
4 64524 645240 6452400 64524000 645240000 6452400000 64524000000 645240000000 6452400000000 64524000000000 64524 0 0 A 64524
(200004, 15)
{% endraw %} {% raw %}
# One time series per `device`, sorted by `time` before windowing.
panel_windower = SlidingWindowPanel(
    window_len=5, unique_id_cols=['device'], stride=1, start=0,
    get_x=df.columns[:n_vars], get_y=['target'], horizon=0,
    seq_first=True, sort_by=['time'], ascending=True, return_key=False)
X, y = panel_windower(df)
X.shape, y.shape
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
# Same transform, but also returning the `device` key of every sample.
panel_windower = SlidingWindowPanel(
    window_len=5, unique_id_cols=['device'], stride=1, start=0,
    get_x=df.columns[:n_vars], get_y=['target'], horizon=0,
    seq_first=True, sort_by=['time'], ascending=True, return_key=True)
X, y, key = panel_windower(df)
X.shape, y.shape, key.shape
((199992, 10, 5), (199992,), (199992,))
{% endraw %} {% raw %}
# Entities identified by the (device, region) pair.
panel_windower = SlidingWindowPanel(
    window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0,
    get_x=df.columns[:n_vars], get_y=['target'], horizon=0,
    seq_first=True, sort_by=['time'], ascending=True)
X, y = panel_windower(df)
X.shape, y.shape
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
def y_max(o):
    "Return the maximum of `o` along axis 1."
    row_max = np.max(o, axis=1)
    return row_max
{% endraw %} {% raw %}
# A 5-step horizon whose targets are reduced to a single label by y_max.
panel_windower = SlidingWindowPanel(
    window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0,
    get_x=df.columns[:n_vars], get_y=['target'], y_func=y_max, horizon=5,
    seq_first=True, sort_by=['time'], ascending=True)
X, y = panel_windower(df)
X.shape, y.shape
((199982, 10, 5), (199982,))
{% endraw %} {% raw %}

identify_padding[source]

identify_padding(float_mask, value=-1)

Identifies padded subsequences in a mask of type float

This function identifies as padded subsequences those where all values == nan from the end of the sequence (last dimension) across all channels, and sets those values to the selected value (default = -1)

Args: float_mask: boolean or float mask. value: scalar that will be used to identify padded subsequences

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Non-overlapping windows: the stride equals the window length.
wl = stride = 5

# 13 time steps x 3 identical variables; 13 is not a multiple of wl, so the
# last window is padded.
t = np.arange(13).reshape(-1, 1).repeat(3, axis=-1)
print('input shape:', t.shape)
windower = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])
X, _ = windower(t)
X = tensor(X)
# Inject NaNs by hand: at the end of a single channel of sample 0, and at the
# start of every channel of sample 1.
X[0, 1, -2:] = np.nan
X[1, ..., :3] = np.nan
print(X)
nan_mask = torch.isnan(X).float()
identify_padding(nan_mask)
input shape: (13, 3)
tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  1.,  2., nan, nan],
         [ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  0.,  0.,  0.,  0.]],

        [[nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  0.,  0.]],

        [[10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [ 0.,  0.,  0.,  1.,  1.]]])
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.]],

        [[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.]]])
{% endraw %}