--- title: Data preparation keywords: fastai sidebar: home_sidebar summary: "Functions required to prepare X (and y) from a pandas dataframe." description: "Functions required to prepare X (and y) from a pandas dataframe." nb_path: "nbs/011_data.preparation.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

df2Xy[source]

df2Xy(df, sample_col=None, feat_col=None, data_cols=None, target_col=None, steps_in_rows=False, to3d=True, splits=None, sort_by=None, ascending=True, y_func=None, return_names=False)

This function allows you to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset. sample_col: column that uniquely identifies each sample. feat_col: used for multivariate datasets. It indicates which column identifies the feature in each row. data_cols: indicates the column/s where the data is located. If None, it means all columns (except the sample_col, feat_col, and target_col). target_col: indicates the column/s where the target is. steps_in_rows: flag to indicate if each step is in a different row or in a different column (default). to3d: turns X to 3d (including univariate time series). sort_by: this is used to pass any column/s that are needed to sort the steps in the sequence. If you pass a sample_col and/or feat_col these will be automatically used before the sort_by column/s, and you don't need to add them to the sort_by column/s list. y_func: function used to calculate y for each sample (and target_col). return_names: flag to return the names of the columns from where X was generated.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

split_Xy[source]

split_Xy(X, y=None, splits=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Toy long-format frame: 3 samples with 3 rows each.
# var1 / var2 encode the sample id (x10 / x100) plus the row position.
df = pd.DataFrame({'sample_id': np.repeat([1, 2, 3], 3)})
df = df.assign(var1=10 * df['sample_id'] + df.index.values,
               var2=100 * df['sample_id'] + df.index.values)
df
sample_id var1 var2
0 1 10 100
1 1 11 101
2 1 12 102
3 2 23 203
4 2 24 204
5 2 25 205
6 3 36 306
7 3 37 307
8 3 38 308
{% endraw %} {% raw %}
# steps_in_rows=True: every row of df is one step; rows sharing a sample_id form one sample.
X_df, y_df = df2Xy(df, sample_col='sample_id', steps_in_rows=True)
# The first sample (sample_id == 1) is expected as 2 variables (var1, var2) x 3 steps.
test_eq(X_df[0], np.array([[10, 11, 12], [100, 101, 102]]))
{% endraw %} {% raw %}
# Build a shuffled long-format frame: 1,000 samples with 10 rows each.
n_samples = 1_000
n_rows = 10_000

# Column vectors: each sample id repeated 10 times, and a 0..9 counter tiled once per sample.
sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
# Six continuous data columns.
cont = np.random.randn(n_rows, 6)
# Random categorical labels drawn from {'a', 'b', 'c'}.
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array(['a', 'b', 'c'])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array(['a', 'b', 'c'])[ind_cat2]
# NOTE(review): `target` is concatenated twice and `target2` is never used, so the
# 'target2' column ends up as a copy of 'target' - confirm whether `target2` was intended.
data = np.concatenate([sample_ids, feat_ids, cont, target, target], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
# Random permutation of the row positions (sampling without replacement).
idx = np.random.choice(np.arange(len(df)), len(df), False)
# Cast the id columns to int32 and the data columns to float32; the target columns are not cast.
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
# Shuffle the rows so that df2Xy has to restore the order itself.
df = df.loc[idx].reset_index(drop=True)
df
sample_id feat_id 1 2 3 4 5 6 target target2
0 650 2 0.496140 -1.297490 -0.190591 -1.285640 -0.350204 -1.070606 b b
1 137 4 1.239497 0.011340 -0.308471 1.216183 0.973595 1.191614 c c
2 674 8 1.358624 -0.448930 0.794747 -0.245278 -1.955356 -2.156479 c c
3 36 0 0.072900 -1.168425 -0.280945 -1.512159 0.110056 0.334453 b b
4 328 8 0.567568 0.257276 -1.074804 -0.764139 0.359364 -1.045714 c c
... ... ... ... ... ... ... ... ... ... ...
9995 806 3 1.441506 0.263167 -0.256720 -0.691258 1.832286 -0.064437 b b
9996 624 0 1.752068 1.485179 -3.356308 -1.078424 -0.082846 0.351802 a a
9997 846 7 -0.665935 0.807114 -0.400047 -0.125871 -1.418537 0.171302 c c
9998 987 7 0.555413 -0.118551 0.681364 -0.691787 0.609029 1.037420 b b
9999 939 7 -1.281956 -0.064546 0.604580 -0.704135 -0.019735 0.823299 a a

10000 rows × 10 columns

{% endraw %} {% raw %}
# y_func collapses the target values passed for each sample to their mode along axis 1.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', data_cols=['1', '2', '3', '4', '5', '6'], target_col=['target'], 
             sort_by=['sample_id', 'feat_id'], y_func=y_func)
# 1,000 samples, 10 feat_id values per sample, 6 data columns; one label per sample.
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000,))
# Cross-check one randomly chosen sample against the frame sorted by (sample_id, feat_id).
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
# Same check as the single-target case, but with two target columns and data_cols left as None.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], sort_by=['sample_id', 'feat_id'], y_func=y_func)
# X keeps the 6 remaining data columns; y now has one column per target.
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000, 2))
# Cross-check one randomly chosen sample against the frame sorted by (sample_id, feat_id).
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
# Wide format: one row per sample, one column per step (value_0, value_1).
TESTDATA = StringIO("""sample_id;value_0;value_1;target
    rob;2;3;hot
    alice;6;7;lukewarm
    eve;11;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
# 3 samples x 1 variable x 2 steps; one label per sample.
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
# The printed result lists the samples in sample_id order (alice, eve, rob).
X, y
sample_id value_0 value_1 target
0 rob 2 3 hot
1 alice 6 7 lukewarm
2 eve 11 12 cold
(array([[[ 6,  7]],
 
        [[11, 12]],
 
        [[ 2,  3]]]),
 array(['lukewarm', 'cold', 'hot'], dtype=object))
{% endraw %} {% raw %}
# Long format: one row per (sample, timestep), with a single 'values' data column.
TESTDATA = StringIO("""sample_id;timestep;values;target
    rob;1;2;hot
    alice;1;6;lukewarm
    eve;1;11;cold
    
    rob;2;3;hot
    alice;2;7;lukewarm
    eve;2;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# The target repeats on every row of a sample; y_func reduces it to one value via the mode along axis 1.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
# 3 samples x 1 variable x 2 timesteps; one label per sample.
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)
sample_id timestep values target
0 rob 1 2 hot
1 alice 1 6 lukewarm
2 eve 1 11 cold
3 rob 2 3 hot
4 alice 2 7 lukewarm
5 eve 2 12 cold
[[[ 6  7]]

 [[11 12]]

 [[ 2  3]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
# Multivariate wide format: one row per (sample, trait), one column per step.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows; the printed X below is nevertheless ordered by sample_id and then trait.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
# 3 samples x 4 traits x 2 steps; one label per sample.
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))
sample_id trait value_0 value_1 target
9 eve green 10 11 cold
0 rob green 2 3 hot
6 alice blue 8 9 lukewarm
7 alice red 9 10 lukewarm
5 alice yellow 7 8 lukewarm
11 eve red 13 14 cold
2 rob blue 4 5 hot
4 alice green 6 7 lukewarm
1 rob yellow 3 4 hot
3 rob red 5 6 hot
10 eve blue 12 12 cold
8 eve yellow 11 12 cold
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
# Same multivariate layout as before, now with two target columns (target1, target2).
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
    rob;green;2;3;hot;good
    rob;yellow;3;4;hot;good
    rob;blue;4;5;hot;good
    rob;red;5;6;hot;good
    alice;green;6;7;lukewarm;good
    alice;yellow;7;8;lukewarm;good
    alice;blue;8;9;lukewarm;good
    alice;red;9;10;lukewarm;good
    eve;yellow;11;12;cold;bad
    eve;green;10;11;cold;bad
    eve;blue;12;12;cold;bad
    eve;red;13;14;cold;bad
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
# 3 samples x 4 traits x 2 steps; y has one column per target.
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)
sample_id trait value_0 value_1 target1 target2
0 rob green 2 3 hot good
1 rob yellow 3 4 hot good
2 rob blue 4 5 hot good
3 rob red 5 6 hot good
4 alice green 6 7 lukewarm good
5 alice yellow 7 8 lukewarm good
6 alice blue 8 9 lukewarm good
7 alice red 9 10 lukewarm good
8 eve yellow 11 12 cold bad
9 eve green 10 11 cold bad
10 eve blue 12 12 cold bad
11 eve red 13 14 cold bad
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] [['lukewarm' 'good']
 ['cold' 'bad']
 ['hot' 'good']]
{% endraw %} {% raw %}
# Unlabeled case: the frame has a 'target' column, but no target_col is passed to df2xy.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows before converting.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
# Without a target_col, y is expected to be None.
test_eq(y, None)
sample_id trait value_0 value_1 target
6 alice blue 8 9 lukewarm
9 eve green 10 11 cold
11 eve red 13 14 cold
5 alice yellow 7 8 lukewarm
4 alice green 6 7 lukewarm
10 eve blue 12 12 cold
0 rob green 2 3 hot
1 rob yellow 3 4 hot
7 alice red 9 10 lukewarm
3 rob red 5 6 hot
8 eve yellow 11 12 cold
2 rob blue 4 5 hot
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] None
{% endraw %} {% raw %}
# Fully long format: one row per (sample, trait, timestep), with a single 'values' data column.
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
    rob;green;1;2;hot
    rob;yellow;1;3;hot
    rob;blue;1;4;hot
    rob;red;1;5;hot
    alice;green;1;6;lukewarm
    alice;yellow;1;7;lukewarm
    alice;blue;1;8;lukewarm
    alice;red;1;9;lukewarm
    eve;yellow;1;11;cold
    eve;green;1;10;cold
    eve;blue;1;12;cold
    eve;red;1;13;cold
    
    rob;green;2;3;hot
    rob;yellow;2;4;hot
    rob;blue;2;5;hot
    rob;red;2;6;hot
    alice;green;2;7;lukewarm
    alice;yellow;2;8;lukewarm
    alice;blue;2;9;lukewarm
    alice;red;2;10;lukewarm
    eve;yellow;2;12;cold
    eve;green;2;11;cold
    eve;blue;2;13;cold
    eve;red;2;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# sort_by='timestep' orders the steps within each (sample_id, trait) group.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
# 3 samples x 4 traits x 2 timesteps; one label per sample.
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))
sample_id trait timestep values target
0 rob green 1 2 hot
1 rob yellow 1 3 hot
2 rob blue 1 4 hot
3 rob red 1 5 hot
4 alice green 1 6 lukewarm
5 alice yellow 1 7 lukewarm
6 alice blue 1 8 lukewarm
7 alice red 1 9 lukewarm
8 eve yellow 1 11 cold
9 eve green 1 10 cold
10 eve blue 1 12 cold
11 eve red 1 13 cold
12 rob green 2 3 hot
13 rob yellow 2 4 hot
14 rob blue 2 5 hot
15 rob red 2 6 hot
16 alice green 2 7 lukewarm
17 alice yellow 2 8 lukewarm
18 alice blue 2 9 lukewarm
19 alice red 2 10 lukewarm
20 eve yellow 2 12 cold
21 eve green 2 11 cold
22 eve blue 2 13 cold
23 eve red 2 14 cold
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 13]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}

df2np3d[source]

df2np3d(df, groupby, data_cols=None)

Transforms a df (with the same number of rows per group in groupby) to a 3d ndarray

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Two users with 4 rows each and 3 random data columns.
user = np.array([1,2]).repeat(4).reshape(-1,1)
val = np.random.rand(8, 3)
data = np.concatenate([user, val], axis=-1)
df = pd.DataFrame(data, columns=['user', 'x1', 'x2', 'x3'])
# Expected layout: 2 groups x 3 data columns x 4 rows per group.
test_eq(df2np3d(df, ['user'], ['x1', 'x2', 'x3']).shape, (2, 3, 4))
{% endraw %} {% raw %}

add_missing_value_cols[source]

add_missing_value_cols(df, cols=None, dtype=float, fill_value=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Random data in which every value above 0.8 is replaced by NaN.
data = np.random.randn(10, 2)
mask = data > .8
data[mask] = np.nan
df = pd.DataFrame(data, columns=['A', 'B'])
df = add_missing_value_cols(df, cols=None, dtype=float)
# Each new 'missing_<col>' column must sum to the number of NaNs in the source column.
test_eq(df['A'].isnull().sum(), df['missing_A'].sum())
test_eq(df['B'].isnull().sum(), df['missing_B'].sum())
df
A B missing_A missing_B
0 NaN -0.376041 1.0 0.0
1 -0.734193 0.530183 0.0 0.0
2 0.406711 0.291015 0.0 0.0
3 -0.421393 -1.302529 0.0 0.0
4 0.032445 0.760513 0.0 0.0
5 -1.322853 NaN 0.0 1.0
6 0.678992 0.400528 0.0 0.0
7 -1.324564 -0.203378 0.0 0.0
8 NaN 0.270509 1.0 0.0
9 -0.789952 -0.653338 0.0 0.0
{% endraw %} {% raw %}

add_missing_timestamps[source]

add_missing_timestamps(df, datetime_col, groupby=None, fill_value=nan, range_by_group=True, freq=None)

Fills missing timestamps in a dataframe to a desired frequency. Args: df: pandas DataFrame. datetime_col: column that contains the datetime data (without duplicates within groups). groupby: column used to identify unique_ids. fill_value: value that will be inserted where missing dates exist. Default: np.nan. range_by_group: if True, dates will be filled between the min and max dates for each group. Otherwise, between the min and max dates in the df. freq: frequency used to fill in the missing datetimes.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# One week of daily timestamps with two random feature columns.
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
# Remove rows 1 and 3 (2021-05-02 and 2021-05-04) to simulate missing timestamps.
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date feature1 feature2
0 2021-05-01 0.107641 0.560177
1 2021-05-03 0.130178 0.038290
2 2021-05-05 0.391507 0.646296
3 2021-05-06 0.869531 0.851778
4 2021-05-07 0.002044 0.232475
{% endraw %} {% raw %}
# Expected result: the full week, with NaN features on the two dates that were dropped.
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby=None, 
                                   fill_value=np.nan, 
                                   range_by_group=False)
test_eq(output_df, expected_output_df)
date feature1 feature2
0 2021-05-01 0.107641 0.560177
1 2021-05-02 NaN NaN
2 2021-05-03 0.130178 0.038290
3 2021-05-04 NaN NaN
4 2021-05-05 0.391507 0.646296
5 2021-05-06 0.869531 0.851778
6 2021-05-07 0.002044 0.232475
{% endraw %} {% raw %}
# The same week repeated for two groups (id 0 and id 1).
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
# First half of the rows belongs to id 0, second half to id 1.
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
# Drop rows at the start, middle and end of the groups to simulate missing timestamps.
date_df_with_missing_dates = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)

date_df_with_missing_dates
date id feature1 feature2
0 2021-05-03 0 0.869091 0.405103
1 2021-05-05 0 0.708110 0.488442
2 2021-05-06 0 0.557568 0.256524
3 2021-05-07 0 0.457684 0.996438
4 2021-05-01 1 0.773929 0.829867
5 2021-05-03 1 0.416473 0.694339
6 2021-05-04 1 0.061791 0.656300
7 2021-05-06 1 0.533233 0.492698
{% endraw %} {% raw %}
# range_by_group=True: each id is filled only between its own first and last remaining date,
# so rows 0, 1 (before id 0's first date) and 13 (after id 1's last date) stay absent.
expected_output_df = date_df.drop([0,1,13]).reset_index(drop=True)  
# After the reset, positions 1, 6 and 9 are the dates that were missing inside each group's range.
expected_output_df.loc[[1,6,9], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=True)
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-03 0 0.869091 0.405103
1 2021-05-04 0 NaN NaN
2 2021-05-05 0 0.708110 0.488442
3 2021-05-06 0 0.557568 0.256524
4 2021-05-07 0 0.457684 0.996438
5 2021-05-01 1 0.773929 0.829867
6 2021-05-02 1 NaN NaN
7 2021-05-03 1 0.416473 0.694339
8 2021-05-04 1 0.061791 0.656300
9 2021-05-05 1 NaN NaN
10 2021-05-06 1 0.533233 0.492698
{% endraw %} {% raw %}
# range_by_group=False: every id is filled over the min-max date range of the whole frame,
# so all six dropped rows come back with NaN features.
expected_output_df = date_df.copy() 
expected_output_df.loc[[0,1,3,8,11,13], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=False)
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 0 NaN NaN
1 2021-05-02 0 NaN NaN
2 2021-05-03 0 0.869091 0.405103
3 2021-05-04 0 NaN NaN
4 2021-05-05 0 0.708110 0.488442
5 2021-05-06 0 0.557568 0.256524
6 2021-05-07 0 0.457684 0.996438
7 2021-05-01 1 0.773929 0.829867
8 2021-05-02 1 NaN NaN
9 2021-05-03 1 0.416473 0.694339
10 2021-05-04 1 0.061791 0.656300
11 2021-05-05 1 NaN NaN
12 2021-05-06 1 0.533233 0.492698
13 2021-05-07 1 NaN NaN
{% endraw %} {% raw %}
# Six timestamps at 4-hour spacing (00:00 to 20:00) with two random feature columns.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
# Remove rows 1 and 3 (04:00 and 12:00) to simulate missing timestamps.
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date feature1 feature2
0 2021-05-01 00:00:00 0.090806 0.995103
1 2021-05-01 08:00:00 0.301375 0.255982
2 2021-05-01 16:00:00 0.235061 0.267396
3 2021-05-01 20:00:00 0.760944 0.502640
{% endraw %} {% raw %}
# Expected result: all six 4-hourly timestamps, with NaN features on the two that were dropped.
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
# freq='4H' is passed explicitly here, unlike in the daily examples.
output_df = add_missing_timestamps(date_df_with_missing_dates, 'date', groupby=None, fill_value=np.nan, range_by_group=False, freq='4H')
test_eq(output_df, expected_output_df)
date feature1 feature2
0 2021-05-01 00:00:00 0.090806 0.995103
1 2021-05-01 04:00:00 NaN NaN
2 2021-05-01 08:00:00 0.301375 0.255982
3 2021-05-01 12:00:00 NaN NaN
4 2021-05-01 16:00:00 0.235061 0.267396
5 2021-05-01 20:00:00 0.760944 0.502640
{% endraw %} {% raw %}
# The same six 4-hourly timestamps repeated for two groups (id 0 and id 1).
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
# First half of the rows belongs to id 0, second half to id 1.
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
# Drop rows at the start, middle and end of the groups to simulate missing timestamps.
date_df_with_missing_dates = date_df.drop([0,1,3,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.692439 0.038288
1 2021-05-01 16:00:00 0 0.326419 0.587040
2 2021-05-01 20:00:00 0 0.814016 0.874346
3 2021-05-01 00:00:00 1 0.635172 0.703396
4 2021-05-01 04:00:00 1 0.112123 0.564187
5 2021-05-01 16:00:00 1 0.873603 0.743482
{% endraw %} {% raw %}
# range_by_group=True: each id is filled only between its own first and last remaining timestamp,
# so rows 0, 1 (before id 0's first timestamp) and 11 (after id 1's last timestamp) stay absent.
expected_output_df = date_df.drop([0,1,11]).reset_index(drop=True)  
# After the reset, positions 1, 6 and 7 are the timestamps missing inside each group's range.
expected_output_df.loc[[1,6,7], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=True, 
                                   freq='4H')
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.692439 0.038288
1 2021-05-01 12:00:00 0 NaN NaN
2 2021-05-01 16:00:00 0 0.326419 0.587040
3 2021-05-01 20:00:00 0 0.814016 0.874346
4 2021-05-01 00:00:00 1 0.635172 0.703396
5 2021-05-01 04:00:00 1 0.112123 0.564187
6 2021-05-01 08:00:00 1 NaN NaN
7 2021-05-01 12:00:00 1 NaN NaN
8 2021-05-01 16:00:00 1 0.873603 0.743482
{% endraw %} {% raw %}
# range_by_group=False: every id is filled over the min-max range of the whole frame,
# so all six dropped rows come back with NaN features.
expected_output_df = date_df.copy() 
expected_output_df.loc[[0,1,3,8,9,11], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=False, 
                                   freq='4H')
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 00:00:00 0 NaN NaN
1 2021-05-01 04:00:00 0 NaN NaN
2 2021-05-01 08:00:00 0 0.692439 0.038288
3 2021-05-01 12:00:00 0 NaN NaN
4 2021-05-01 16:00:00 0 0.326419 0.587040
5 2021-05-01 20:00:00 0 0.814016 0.874346
6 2021-05-01 00:00:00 1 0.635172 0.703396
7 2021-05-01 04:00:00 1 0.112123 0.564187
8 2021-05-01 08:00:00 1 NaN NaN
9 2021-05-01 12:00:00 1 NaN NaN
10 2021-05-01 16:00:00 1 0.873603 0.743482
11 2021-05-01 20:00:00 1 NaN NaN
{% endraw %} {% raw %}
# Failure case without groups: the frame contains a duplicated timestamp.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
# Overwrite row 3's timestamp with row 2's to create the duplicate.
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
# The call is expected to raise an error whose message contains the text below.
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby=None, fill_value=np.nan, range_by_group=False, freq='4H'), 
          contains='cannot reindex from a duplicate axis')
date feature1 feature2
0 2021-05-01 00:00:00 0.299408 0.666063
1 2021-05-01 08:00:00 0.120877 0.490112
2 2021-05-01 16:00:00 0.721475 0.534208
3 2021-05-01 16:00:00 0.470794 0.429649
{% endraw %} {% raw %}
# Failure case with groups and range_by_group=True: a timestamp is duplicated within id 0.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
# Overwrite row 3's timestamp with row 2's; both rows belong to id 0.
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
# The call is expected to raise an error whose message contains the text below.
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=True, freq='4H'), 
          contains='cannot handle a non-unique multi-index!')
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.205682 0.112418
1 2021-05-01 12:00:00 0 0.924086 0.947679
2 2021-05-01 16:00:00 0 0.246820 0.405505
3 2021-05-01 16:00:00 0 0.369901 0.610015
4 2021-05-01 00:00:00 1 0.379320 0.815567
5 2021-05-01 04:00:00 1 0.632240 0.882816
6 2021-05-01 16:00:00 1 0.357889 0.731978
{% endraw %} {% raw %}
# Failure case with groups and range_by_group=False: a timestamp is duplicated within id 0.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
# Overwrite row 3's timestamp with row 2's; both rows belong to id 0.
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
# The call is expected to raise an error whose message contains the text below.
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=False, freq='4H'), 
          contains='cannot handle a non-unique multi-index!')
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.521592 0.961618
1 2021-05-01 12:00:00 0 0.344581 0.530947
2 2021-05-01 16:00:00 0 0.596286 0.976873
3 2021-05-01 16:00:00 0 0.322172 0.249452
4 2021-05-01 00:00:00 1 0.332885 0.450764
5 2021-05-01 04:00:00 1 0.702561 0.865437
6 2021-05-01 16:00:00 1 0.148691 0.342936
{% endraw %} {% raw %}

time_encoding[source]

time_encoding(series, freq, max_val=None)

Transforms a pandas series of dtype datetime64 (of any freq) or DatetimeIndex into 2 float arrays

Available options: microsecond, millisecond, second, minute, hour, day = day_of_month = dayofmonth, day_of_week = weekday = dayofweek, day_of_year = dayofyear, week = week_of_year = weekofyear, month and year

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Plot the two arrays returned by time_encoding for each frequency, using a datetime Series as input.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    # Daily dates from 2021-03-01 up to the moment the cell runs.
    tdf = pd.DataFrame(pd.date_range('2021-03-01', datetime.today()), columns=['date'])
    a,b = time_encoding(tdf.date, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
# Same plots as for the Series input, but passing the DatetimeIndex directly.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    dateindex = pd.date_range('2021-03-01', datetime.today())
    a,b = time_encoding(dateindex, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
# Encode the day of week of date_df['date'] and store both arrays as new columns.
# NOTE(review): date_df is not defined in this cell; it presumably comes from an earlier cell - confirm.
dow_sin, dow_cos = time_encoding(date_df['date'], 'dayofweek')
plt.plot(dow_sin)
plt.plot(dow_cos)
plt.title('DayOfWeek')
plt.show()
date_df['dow_sin'] = dow_sin
date_df['dow_cos'] = dow_cos
date_df
date id feature1 feature2 dow_sin dow_cos
0 2021-05-01 00:00:00 0 0.234642 0.575798 -0.974928 -0.222521
1 2021-05-01 04:00:00 0 0.815493 0.632887 -0.974928 -0.222521
2 2021-05-01 08:00:00 0 0.521592 0.961618 -0.974928 -0.222521
3 2021-05-01 12:00:00 0 0.344581 0.530947 -0.974928 -0.222521
4 2021-05-01 16:00:00 0 0.596286 0.976873 -0.974928 -0.222521
5 2021-05-01 20:00:00 0 0.322172 0.249452 -0.974928 -0.222521
6 2021-05-01 00:00:00 1 0.332885 0.450764 -0.974928 -0.222521
7 2021-05-01 04:00:00 1 0.702561 0.865437 -0.974928 -0.222521
8 2021-05-01 08:00:00 1 0.757722 0.237408 -0.974928 -0.222521
9 2021-05-01 12:00:00 1 0.357577 0.940346 -0.974928 -0.222521
10 2021-05-01 16:00:00 1 0.148691 0.342936 -0.974928 -0.222521
11 2021-05-01 20:00:00 1 0.025426 0.176593 -0.974928 -0.222521
{% endraw %} {% raw %}

forward_gaps[source]

forward_gaps(o, normalize=True)

Number of sequence steps since previous real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

backward_gaps[source]

backward_gaps(o, normalize=True)

Number of sequence steps to next real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

nearest_gaps[source]

nearest_gaps(o, normalize=True)

Number of sequence steps to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

get_gaps[source]

get_gaps(o:Tensor, forward:bool=True, backward:bool=True, nearest:bool=True, normalize:bool=True)

Number of sequence steps from previous, to next and/or to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Random tensor of shape (1, 2, 8) plus a NumPy view of it.
t = torch.rand(1, 2, 8)
# NOTE: Tensor.numpy() shares memory with t, so the NaNs written on the next line also appear in arr.
arr = t.numpy()
# Treat every value below 0.6 as missing.
t[t <.6] = np.nan
# The smallest nearest-gap value must lie in [0, 1] for both tensor and array input.
test_ge(nearest_gaps(t).min().item(), 0)
test_ge(nearest_gaps(arr).min(), 0)
test_le(nearest_gaps(t).min().item(), 1)
test_le(nearest_gaps(arr).min(), 1)
# Gap features must not contain NaNs themselves.
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
# With the default flags (forward, backward, nearest) the 2 input variables give 6 output channels.
ag = get_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)
{% endraw %} {% raw %}

add_delta_timestamp_cols[source]

add_delta_timestamp_cols(df, cols=None, groupby=None, forward=True, backward=True, nearest=True, normalize=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# One week of daily timestamps with a single random feature column.
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 2))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))

cols = ['date', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float})
# Blank out rows 1, 3 and 4 to create one single-step gap and one two-step gap.
date_df.loc[[1,3,4],'feature1'] = np.nan
date_df
date feature1
0 2021-05-01 0.701402
1 2021-05-02 NaN
2 2021-05-03 0.368235
3 2021-05-04 NaN
4 2021-05-05 NaN
5 2021-05-06 0.908930
6 2021-05-07 0.535344
{% endraw %} {% raw %}
# Expected unnormalized step counts for the forward, backward and nearest delta columns.
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1])

display(expected_output_df)
# normalize=False so the result can be compared with the integer counts above.
output_df = add_delta_timestamp_cols(date_df, cols='feature1', normalize=False)
test_eq(expected_output_df, output_df)
date feature1 feature1_dt_fwd feature1_dt_bwd feature1_dt_nearest
0 2021-05-01 0.701402 1 2 1
1 2021-05-02 NaN 1 1 1
2 2021-05-03 0.368235 2 3 2
3 2021-05-04 NaN 1 2 1
4 2021-05-05 NaN 2 1 1
5 2021-05-06 0.908930 3 1 1
6 2021-05-07 0.535344 1 1 1
{% endraw %} {% raw %}
# The same week repeated for two groups (id 0 and id 1), with one random feature column.
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 3))
data[:, 0] = dates
# First half of the rows belongs to id 0, second half to id 1.
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))

cols = ['date', 'id', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float})
# Blank out different rows in each group so that the two gap patterns differ.
date_df.loc[[1,3,4,8,9,11],'feature1'] = np.nan
date_df
date id feature1
0 2021-05-01 0 0.954881
1 2021-05-02 0 NaN
2 2021-05-03 0 0.402601
3 2021-05-04 0 NaN
4 2021-05-05 0 NaN
5 2021-05-06 0 0.644042
6 2021-05-07 0 0.263104
7 2021-05-01 1 0.655757
8 2021-05-02 1 NaN
9 2021-05-03 1 NaN
10 2021-05-04 1 0.695076
11 2021-05-05 1 NaN
12 2021-05-06 1 0.049489
13 2021-05-07 1 0.751240
{% endraw %} {% raw %}
# Expected unnormalized step counts; the counts restart at row 7, where id 1 begins.
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1,1,1,2,3,1,2,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1,3,2,1,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1,1,1,1,2,1,1,1])

display(expected_output_df)
# groupby='id' computes the deltas separately for each id.
output_df = add_delta_timestamp_cols(date_df, cols='feature1', groupby='id', normalize=False)
test_eq(expected_output_df, output_df)
date id feature1 feature1_dt_fwd feature1_dt_bwd feature1_dt_nearest
0 2021-05-01 0 0.954881 1 2 1
1 2021-05-02 0 NaN 1 1 1
2 2021-05-03 0 0.402601 2 3 2
3 2021-05-04 0 NaN 1 2 1
4 2021-05-05 0 NaN 2 1 1
5 2021-05-06 0 0.644042 3 1 1
6 2021-05-07 0 0.263104 1 1 1
7 2021-05-01 1 0.655757 1 3 1
8 2021-05-02 1 NaN 1 2 1
9 2021-05-03 1 NaN 2 1 1
10 2021-05-04 1 0.695076 3 2 2
11 2021-05-05 1 NaN 1 1 1
12 2021-05-06 1 0.049489 2 1 1
13 2021-05-07 1 0.751240 1 1 1
{% endraw %}

SlidingWindow and SlidingWindowPanel are 2 useful functions that will allow you to create an array with segments of a pandas dataframe based on multiple criteria.

{% raw %}

SlidingWindow[source]

SlidingWindow(window_len:int, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True)

Applies a sliding window to a 1d or 2d input (np.ndarray, torch.Tensor or pd.DataFrame) Args: window_len = length of lookback window stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default) or a given step (int). Previous steps will be discarded. pad_remainder = allows padding of remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding = 'pre' or 'post' (optional, defaults to 'post'): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence ('pre' from the end, and 'post' from the beginning) padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict (y). If get_y is [] horizon will be set to 0.

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y.
                      [] means no y data is created (unlabeled data).
y_func              = optional function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
output_processor    = optional function to process the final output (X (and y if available)). This is useful when some values need to be removed.
                      The function should take X and y (even if it's None) as arguments.
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y

Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input. Shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X
input shape: (13, 3)
array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [ 0.,  0.,  0.,  1.,  1.]]])
{% endraw %} {% raw %}
wl = 5
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl)(t)
test_eq(X.shape[1:], (1, wl))
itemify(X,)
input shape: (10,)
(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]
{% endraw %} {% raw %}
wl = 5
h = 1

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9)]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=3, horizon=1, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[3, 4, 5, 6, 7]]), 8)]
{% endraw %} {% raw %}
wl = 5
start = 3

t = np.arange(20)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=None, horizon=1, start=start)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), 8), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[13, 14, 15, 16, 17]]), 18)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=1, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9), (array([[5, 6, 7, 8, 9]]), 10), (array([[ 6,  7,  8,  9, 10]]), 11), (array([[ 7,  8,  9, 10, 11]]), 12), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[ 9, 10, 11, 12, 13]]), 14), (array([[10, 11, 12, 13, 14]]), 15), (array([[11, 12, 13, 14, 15]]), 16), (array([[12, 13, 14, 15, 16]]), 17), (array([[13, 14, 15, 16, 17]]), 18), (array([[14, 15, 16, 17, 18]]), 19)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var']).T
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None, seq_first=False)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
var 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), array([  5,  50, 500])), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), array([  6,  60, 600])), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), array([  7,  70, 700])), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), array([  8,  80, 800])), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), array([  9,  90, 900]))]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1, get_y="var_0")(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), 5), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), 6), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), 7), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), 8), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), 9)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())
input shape: torch.Size([10, 3])
var_0 var_1 target
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[ 0,  1,  2,  3,  4],
       [ 0, 10, 20, 30, 40]]), 500), (array([[ 1,  2,  3,  4,  5],
       [10, 20, 30, 40, 50]]), 600), (array([[ 2,  3,  4,  5,  6],
       [20, 30, 40, 50, 60]]), 700), (array([[ 3,  4,  5,  6,  7],
       [30, 40, 50, 60, 70]]), 800), (array([[ 4,  5,  6,  7,  8],
       [40, 50, 60, 70, 80]]), 900)]
{% endraw %} {% raw %}
n_vars = 3

t = (np.random.rand(1000, n_vars) - .5).cumsum(0)
print(t.shape)
plt.plot(t)
plt.show()
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=[0,1], get_y=2)(t)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(1000, 3)
(200, 2, 5) (200,)
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 3)
var_0 var_1 target
0 0.151837 -0.048944 -0.086911
1 0.442175 0.442330 0.305716
2 0.695260 0.470773 0.783916
3 0.917282 0.756567 0.633725
4 1.265482 0.618966 0.870565
... ... ... ...
95 -1.062540 -3.714890 2.216145
96 -0.665062 -3.484321 2.108725
97 -0.296625 -3.538047 2.053786
98 0.164675 -3.990557 2.096369
99 -0.199011 -3.734923 2.145809

100 rows × 3 columns

(96, 2, 5) (96,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
var_0 var_1 var_2 var_3 target
0 0.270118 0.401213 0.272670 0.493517 -0.021673
1 -0.106424 0.565247 -0.203798 0.598261 0.274794
2 0.117606 0.110515 -0.614011 0.801915 -0.069158
3 -0.007133 0.095893 -0.890897 1.297610 -0.281133
4 0.253050 0.320340 -1.014919 1.567637 -0.047330
... ... ... ... ... ...
95 1.203746 3.171611 -5.285653 4.292907 -3.504508
96 1.074430 2.824502 -5.048195 4.589520 -3.584942
97 1.291623 2.609772 -4.916663 4.669692 -3.606693
98 0.857525 2.266118 -5.220657 4.567347 -3.887462
99 0.633159 1.910812 -5.687722 4.922521 -4.132490

100 rows × 5 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5

t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 0.191243 0.105209 0.033522 -0.311248 -0.058717 -0.135371 0.132678 -0.183393 -0.119809 0.017090 ... 2.209496 2.057069 2.276367 2.197653 2.396582 2.795786 3.075803 3.196442 3.627014 3.555307
var_1 -0.168046 -0.181401 -0.597787 -0.501323 -0.947870 -0.497390 -0.917158 -0.903838 -0.732819 -1.095582 ... 1.629074 1.417814 1.690487 2.109474 2.283191 2.049056 2.008262 1.930229 1.912034 1.768289
var_2 -0.159580 0.296010 0.783018 0.879135 0.432601 0.570982 0.603589 0.478303 0.733599 0.603375 ... 1.566387 1.240595 1.149536 0.884109 0.648623 1.003764 1.390233 1.294555 1.295826 1.078335
var_3 -0.151572 -0.453714 -0.132955 -0.518455 -0.655113 -0.303544 0.045424 0.064264 -0.387244 -0.625874 ... -5.213131 -5.510535 -5.650693 -5.845154 -5.550343 -5.581261 -5.579763 -5.818226 -6.303455 -6.532711
target 0.255565 0.525829 0.962865 1.330833 1.630930 1.279859 0.984724 1.402589 1.454553 1.026170 ... 1.422217 1.252997 1.307983 0.845494 0.447796 0.756131 0.481165 0.232801 0.395575 0.089069

5 rows × 100 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 -0.180890 -0.022236 -0.261760 -0.448416 -0.155361 -0.579406 -0.956505 -1.254000 -1.259994 -1.296660 ... -5.980942 -6.129956 -6.368420 -6.094082 -5.939042 -6.239170 -6.365764 -6.516177 -6.859611 -7.096180
var_1 -0.480764 -0.134124 0.224772 -0.048849 0.001466 -0.469721 -0.880207 -0.722787 -1.063595 -1.558938 ... -2.073902 -2.573169 -2.851342 -2.712820 -2.564170 -2.468164 -2.510227 -2.507263 -2.262299 -1.995484
var_2 -0.298537 -0.546567 -0.626739 -0.854745 -1.295200 -1.390286 -1.196262 -1.161465 -1.334420 -1.424862 ... -1.699517 -1.267482 -1.408220 -1.680382 -1.883404 -1.784232 -1.749767 -2.151378 -1.696802 -1.538076
var_3 0.095424 -0.240506 -0.513861 -0.165390 -0.177009 -0.121117 -0.509962 -0.722685 -0.304477 -0.669883 ... -0.565985 -0.578893 -0.795680 -1.131773 -1.540651 -1.766411 -2.059863 -2.045588 -1.852777 -1.850153
target -0.120314 -0.236068 -0.709575 -0.885634 -0.891293 -0.724318 -0.689796 -0.991039 -1.220667 -1.075680 ... -3.785433 -3.618677 -3.501625 -3.329855 -3.294141 -3.576343 -3.614416 -3.661525 -3.915929 -3.707572

5 rows × 100 columns

(20, 4, 5) (20,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
splits = TrainValidTestSplitter(valid_size=.2, shuffle=False)(y)
X.shape, y.shape, splits
(100, 5)
var_0 var_1 var_2 var_3 target
0 -0.448035 0.381311 0.401826 -0.343200 0.035960
1 -0.465432 0.218660 0.197774 -0.285720 -0.152869
2 -0.155135 -0.236964 0.137070 -0.473257 -0.242305
3 -0.167248 0.187235 0.617105 -0.618643 0.228207
4 -0.074304 0.470079 0.431819 -0.450250 -0.209693
... ... ... ... ... ...
95 -0.572532 0.637036 -2.283661 -1.459671 -6.100509
96 -0.169409 0.667968 -2.525221 -1.930077 -5.616771
97 0.047321 0.530486 -2.115905 -1.606114 -6.095264
98 0.020069 0.117581 -2.198609 -1.348084 -5.734830
99 -0.446887 -0.106282 -2.603213 -1.698979 -5.295172

100 rows × 5 columns

((96, 4, 5),
 (96,),
 ((#77) [0,1,2,3,4,5,6,7,8,9...], (#19) [77,78,79,80,81,82,83,84,85,86...]))
{% endraw %} {% raw %}
data = np.concatenate([np.linspace(0, 1, 11).reshape(-1,1).repeat(2, 1), np.arange(11).reshape(-1,1)], -1)
df_test = pd.DataFrame(data, columns=['col1', 'col2', 'target'])
df_test['target'] = df_test['target'].astype(int)
df_test
col1 col2 target
0 0.0 0.0 0
1 0.1 0.1 1
2 0.2 0.2 2
3 0.3 0.3 3
4 0.4 0.4 4
5 0.5 0.5 5
6 0.6 0.6 6
7 0.7 0.7 7
8 0.8 0.8 8
9 0.9 0.9 9
10 1.0 1.0 10
{% endraw %} {% raw %}
def _y_func(o):
    "Return the first column of each row of `o` (every row, index 0 on the second axis)."
    first_col = o[:, 0]
    return first_col
{% endraw %} {% raw %}
for wl in np.arange(1, 20):
    x, y = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=_y_func)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/wl))
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[2], wl)
    test_close(x[:, 0, 0]*10, y)
{% endraw %} {% raw %}
for wl in np.arange(1, 20):
    x, y = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/ wl))
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
for wl in np.arange(1, len(df_test)+1):
    x, y = SlidingWindow(wl, None, pad_remainder=False, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)(df_test)
    test_eq(x.shape[0], len(df_test) // wl)
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
for wl in np.arange(1, 20):
    x, _ = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=[], horizon=0)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/wl))
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
for wl in np.arange(2, len(df_test)):
    x, _ = SlidingWindow(wl, wl, pad_remainder=False, get_x=['col1', 'col2'], get_y=[], horizon=0)(df_test)
    test_eq(x.shape[0], len(df_test) // wl)
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
df = pd.DataFrame()
df['sample_id'] = np.concatenate([np.ones(n)*(i + 1) for i,n in enumerate([13])])
df['var1'] = df['sample_id'] + df.index.values - 1
df['var2'] = df['var1'] * 10
df['target'] = (df['var1']).astype(int)
df['sample_id'] = df['sample_id'].astype(int)
df
sample_id var1 var2 target
0 1 0.0 0.0 0
1 1 1.0 10.0 1
2 1 2.0 20.0 2
3 1 3.0 30.0 3
4 1 4.0 40.0 4
5 1 5.0 50.0 5
6 1 6.0 60.0 6
7 1 7.0 70.0 7
8 1 8.0 80.0 8
9 1 9.0 90.0 9
10 1 10.0 100.0 10
11 1 11.0 110.0 11
12 1 12.0 120.0 12
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="pre", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y
(array([[[ 4.,  5.,  6.],
         [40., 50., 60.]],
 
        [[ 6.,  7.,  8.],
         [60., 70., 80.]]]),
 array([[ 7,  8,  9, 10],
        [ 9, 10, 11, 12]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y
(array([[[nan,  3.,  4.],
         [nan, 30., 40.]],
 
        [[ 4.,  5.,  6.],
         [40., 50., 60.]],
 
        [[ 6.,  7.,  8.],
         [60., 70., 80.]]]),
 array([[ 5,  6,  7,  8],
        [ 7,  8,  9, 10],
        [ 9, 10, 11, 12]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="post", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y
(array([[[ 3.,  4.,  5.],
         [30., 40., 50.]],
 
        [[ 5.,  6.,  7.],
         [50., 60., 70.]]]),
 array([[ 6,  7,  8,  9],
        [ 8,  9, 10, 11]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y
(array([[[ 3.,  4.,  5.],
         [30., 40., 50.]],
 
        [[ 5.,  6.,  7.],
         [50., 60., 70.]],
 
        [[ 7.,  8.,  9.],
         [70., 80., 90.]]]),
 array([[ 6.,  7.,  8.,  9.],
        [ 8.,  9., 10., 11.],
        [10., 11., 12., nan]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y
(array([[[nan, nan, nan, nan,  3.,  4.,  5.,  6.,  7.,  8.],
         [nan, nan, nan, nan, 30., 40., 50., 60., 70., 80.]]]),
 array([[ 9, 10, 11, 12]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y
(array([[[  3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.],
         [ 30.,  40.,  50.,  60.,  70.,  80.,  90., 100., 110., 120.]]]),
 array([[nan, nan, nan, nan]]))
{% endraw %} {% raw %}

SlidingWindowPanel[source]

SlidingWindowPanel(window_len:int, unique_id_cols:list, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True, return_key:bool=False, verbose:bool=True)

Applies a sliding window to a pd.DataFrame.

Args: window_len = length of lookback window unique_id_cols = pd.DataFrame columns that will be used to identify a time series for each entity. stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows padding of remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding = 'pre' or 'post' (optional, defaults to 'post'): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence ('pre' from the end, and 'post' from the beginning) padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict (y). If get_y is [] horizon will be set to 0.

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y.
                      [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
output_processor    = optional function to filter output (X (and y if available)). This is useful when some values need to be removed. The function
                      should take X and y (even if it's None) as arguments.
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y
return_key          = when True, the key corresponding to unique_id_cols for each sample is returned
verbose             = controls verbosity. True or 1 displays a progress bar. 2 or more also shows records that cannot be created due to their length.


Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Build a shuffled synthetic panel dataframe made of three stacked pieces
# (device 0 / region A, device 1 / region A, device 2 / region B) to exercise
# SlidingWindowPanel.
samples = 100_000
wl = 5
n_vars = 10

# Column i holds the step index scaled by 10**i.
t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
df2 = df.copy()
df3 = df.copy()
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
# Keep only rows labelled 0..3 for the second piece; copy so that the column
# assignment below writes to an independent frame rather than a slice of df2.
df2 = df2.loc[:3].copy()
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames
# in the same order.
df = pd.concat([df, df2, df3]).reset_index(drop=True)
df['index'] = np.arange(len(df))
# Shuffle the rows so the later sort_by=['time'] calls have real work to do.
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 time device target region index
0 18180 181782 1817802 18178000 181780000 1817800000 18178000000 181780000000 1817800000000 18178000000000 18178 2 2 B 118182
1 65334 653322 6533202 65332000 653320000 6533200000 65332000000 653320000000 6533200000000 65332000000000 65332 2 2 B 165336
2 33920 339182 3391802 33918000 339180000 3391800000 33918000000 339180000000 3391800000000 33918000000000 33918 2 2 B 133922
3 93444 934422 9344202 93442000 934420000 9344200000 93442000000 934420000000 9344200000000 93442000000000 93442 2 3 B 193446
4 39971 399692 3996902 39969000 399690000 3996900000 39969000000 399690000000 3996900000000 39969000000000 39969 2 2 B 139973
(200004, 15)
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=False)(df)
X.shape, y.shape
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
X, y, key = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                               horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=True)(df)
X.shape, y.shape, key.shape
((199992, 10, 5), (199992,), (199992,))
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
def y_max(o):
    "Return the maximum of `o` taken along axis 1."
    row_max = np.max(o, axis=1)
    return row_max
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          y_func=y_max, horizon=5, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
((199982, 10, 5), (199982,))
{% endraw %} {% raw %}

identify_padding[source]

identify_padding(float_mask, value=-1)

Identifies padded subsequences in a mask of type float

This function identifies as padded subsequences those where all values == nan from the end of the sequence (last dimension) across all channels, and sets those values to the selected value (default = -1)

Args: float_mask: boolean or float mask. value: scalar that will be used to identify padded subsequences.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, _ = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X = tensor(X)
X[0, 1, -2:] = np.nan
X[1,..., :3] = np.nan
print(X)
identify_padding(torch.isnan(X).float())
input shape: (13, 3)
tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  1.,  2., nan, nan],
         [ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  0.,  0.,  0.,  0.]],

        [[nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  0.,  0.]],

        [[10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [ 0.,  0.,  0.,  1.,  1.]]])
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.]],

        [[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.]]])
{% endraw %}