--- title: Data preparation keywords: fastai sidebar: home_sidebar summary: "Functions required to prepare X (and y) from a pandas dataframe." description: "Functions required to prepare X (and y) from a pandas dataframe." nb_path: "nbs/011_data.preparation.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

df2Xy[source]

df2Xy(df, sample_col=None, feat_col=None, data_cols=None, target_col=None, steps_in_rows=False, to3d=True, splits=None, sort_by=None, ascending=True, y_func=None, return_names=False)

This function allows you to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset. sample_col: column that uniquely identifies each sample. feat_col: used for multivariate datasets. It indicates which is the column that indicates the feature by row. data_cols: indicates the column/s where the data is located. If None, it means all columns (except the sample_col, feat_col, and target_col) target_col: indicates the column/s where the target is. steps_in_rows: flag to indicate if each step is in a different row or in a different column (default). to3d: turns X to 3d (including univariate time series) sort_by: this is used to pass any column/s that are needed to sort the steps in the sequence. If you pass a sample_col and/or feat_col these will be automatically used before the sort_by column/s, and you don't need to add them to the sort_by column/s list. y_func: function used to calculate y for each sample (and target_col) return_names: flag to return the names of the columns from where X was generated

{% endraw %} {% raw %}
{% endraw %} {% raw %}

split_Xy[source]

split_Xy(X, y=None, splits=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
df = pd.DataFrame()
df['sample_id'] = np.array([1,1,1,2,2,2,3,3,3])
df['var1'] = df['sample_id'] * 10 + df.index.values
df['var2'] = df['sample_id'] * 100 + df.index.values
df
sample_id var1 var2
0 1 10 100
1 1 11 101
2 1 12 102
3 2 23 203
4 2 24 204
5 2 25 205
6 3 36 306
7 3 37 307
8 3 38 308
{% endraw %} {% raw %}
X_df, y_df = df2Xy(df, sample_col='sample_id', steps_in_rows=True)
test_eq(X_df[0], np.array([[10, 11, 12], [100, 101, 102]]))
{% endraw %} {% raw %}
n_samples = 1_000
n_rows = 10_000

sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
cont = np.random.randn(n_rows, 6)
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array(['a', 'b', 'c'])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array(['a', 'b', 'c'])[ind_cat2]
data = np.concatenate([sample_ids, feat_ids, cont, target, target2], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
idx = np.random.choice(np.arange(len(df)), len(df), False)
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
df = df.loc[idx].reset_index(drop=True)
df
sample_id feat_id 1 2 3 4 5 6 target target2
0 465 4 -0.963435 0.638825 0.020541 -1.106062 -0.890338 1.756991 b b
1 859 4 0.350560 0.380634 -1.123738 0.402504 0.481785 -0.490545 b b
2 149 4 -1.760913 -0.065597 1.141374 1.066082 -1.222298 1.402482 b b
3 388 9 -1.470538 1.261534 0.761395 -1.029569 1.222427 1.173090 b b
4 662 4 1.547554 -0.278609 -0.500586 0.733009 1.369350 0.861844 a a
... ... ... ... ... ... ... ... ... ... ...
9995 214 8 0.282241 0.420880 0.513440 -2.129133 -0.562271 -0.770627 b b
9996 281 4 2.223891 0.848771 0.611845 -2.042017 1.230044 0.240165 a a
9997 188 1 0.617227 0.826373 -0.307580 -0.156759 -0.351246 1.371284 c c
9998 435 9 0.417898 0.451260 0.139802 1.222988 0.273921 0.584279 a a
9999 600 2 1.071223 -0.892138 0.033806 -2.139459 -0.966180 0.441530 a a

10000 rows × 10 columns

{% endraw %} {% raw %}
from scipy.stats import mode
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', data_cols=['1', '2', '3', '4', '5', '6'], target_col=['target'], 
             sort_by=['sample_id', 'feat_id'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000,))
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(mode(sorted_df[sorted_df.sample_id == rand_idx][['target']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], sort_by=['sample_id', 'feat_id'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000, 2))
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
from io import StringIO
TESTDATA = StringIO("""sample_id;value_0;value_1;target
    rob;2;3;hot
    alice;6;7;lukewarm
    eve;11;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
X, y
sample_id value_0 value_1 target
0 rob 2 3 hot
1 alice 6 7 lukewarm
2 eve 11 12 cold
(array([[[ 6,  7]],
 
        [[11, 12]],
 
        [[ 2,  3]]]),
 array(['lukewarm', 'cold', 'hot'], dtype=object))
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;timestep;values;target
    rob;1;2;hot
    alice;1;6;lukewarm
    eve;1;11;cold
    
    rob;2;3;hot
    alice;2;7;lukewarm
    eve;2;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)
sample_id timestep values target
0 rob 1 2 hot
1 alice 1 6 lukewarm
2 eve 1 11 cold
3 rob 2 3 hot
4 alice 2 7 lukewarm
5 eve 2 12 cold
[[[ 6  7]]

 [[11 12]]

 [[ 2  3]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))
sample_id trait value_0 value_1 target
1 rob yellow 3 4 hot
3 rob red 5 6 hot
0 rob green 2 3 hot
4 alice green 6 7 lukewarm
2 rob blue 4 5 hot
10 eve blue 12 12 cold
11 eve red 13 14 cold
7 alice red 9 10 lukewarm
8 eve yellow 11 12 cold
9 eve green 10 11 cold
6 alice blue 8 9 lukewarm
5 alice yellow 7 8 lukewarm
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
    rob;green;2;3;hot;good
    rob;yellow;3;4;hot;good
    rob;blue;4;5;hot;good
    rob;red;5;6;hot;good
    alice;green;6;7;lukewarm;good
    alice;yellow;7;8;lukewarm;good
    alice;blue;8;9;lukewarm;good
    alice;red;9;10;lukewarm;good
    eve;yellow;11;12;cold;bad
    eve;green;10;11;cold;bad
    eve;blue;12;12;cold;bad
    eve;red;13;14;cold;bad
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)
sample_id trait value_0 value_1 target1 target2
0 rob green 2 3 hot good
1 rob yellow 3 4 hot good
2 rob blue 4 5 hot good
3 rob red 5 6 hot good
4 alice green 6 7 lukewarm good
5 alice yellow 7 8 lukewarm good
6 alice blue 8 9 lukewarm good
7 alice red 9 10 lukewarm good
8 eve yellow 11 12 cold bad
9 eve green 10 11 cold bad
10 eve blue 12 12 cold bad
11 eve red 13 14 cold bad
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] [['lukewarm' 'good']
 ['cold' 'bad']
 ['hot' 'good']]
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y, None)
sample_id trait value_0 value_1 target
4 alice green 6 7 lukewarm
10 eve blue 12 12 cold
6 alice blue 8 9 lukewarm
3 rob red 5 6 hot
11 eve red 13 14 cold
2 rob blue 4 5 hot
5 alice yellow 7 8 lukewarm
8 eve yellow 11 12 cold
9 eve green 10 11 cold
0 rob green 2 3 hot
7 alice red 9 10 lukewarm
1 rob yellow 3 4 hot
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] None
{% endraw %} {% raw %}
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
    rob;green;1;2;hot
    rob;yellow;1;3;hot
    rob;blue;1;4;hot
    rob;red;1;5;hot
    alice;green;1;6;lukewarm
    alice;yellow;1;7;lukewarm
    alice;blue;1;8;lukewarm
    alice;red;1;9;lukewarm
    eve;yellow;1;11;cold
    eve;green;1;10;cold
    eve;blue;1;12;cold
    eve;red;1;13;cold
    
    rob;green;2;3;hot
    rob;yellow;2;4;hot
    rob;blue;2;5;hot
    rob;red;2;6;hot
    alice;green;2;7;lukewarm
    alice;yellow;2;8;lukewarm
    alice;blue;2;9;lukewarm
    alice;red;2;10;lukewarm
    eve;yellow;2;12;cold
    eve;green;2;11;cold
    eve;blue;2;13;cold
    eve;red;2;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))
sample_id trait timestep values target
0 rob green 1 2 hot
1 rob yellow 1 3 hot
2 rob blue 1 4 hot
3 rob red 1 5 hot
4 alice green 1 6 lukewarm
5 alice yellow 1 7 lukewarm
6 alice blue 1 8 lukewarm
7 alice red 1 9 lukewarm
8 eve yellow 1 11 cold
9 eve green 1 10 cold
10 eve blue 1 12 cold
11 eve red 1 13 cold
12 rob green 2 3 hot
13 rob yellow 2 4 hot
14 rob blue 2 5 hot
15 rob red 2 6 hot
16 alice green 2 7 lukewarm
17 alice yellow 2 8 lukewarm
18 alice blue 2 9 lukewarm
19 alice red 2 10 lukewarm
20 eve yellow 2 12 cold
21 eve green 2 11 cold
22 eve blue 2 13 cold
23 eve red 2 14 cold
[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 13]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}

df2np3d[source]

df2np3d(df, groupby, data_cols=None)

Transforms a df (with the same number of rows per group in groupby) to a 3d ndarray

{% endraw %} {% raw %}
{% endraw %} {% raw %}
user = np.array([1,2]).repeat(4).reshape(-1,1)
val = np.random.rand(8, 3)
data = np.concatenate([user, val], axis=-1)
df = pd.DataFrame(data, columns=['user', 'x1', 'x2', 'x3'])
test_eq(df2np3d(df, ['user'], ['x1', 'x2', 'x3']).shape, (2, 3, 4))
{% endraw %} {% raw %}

add_missing_value_cols[source]

add_missing_value_cols(df, cols=None, dtype=float, fill_value=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
data = np.random.randn(10, 2)
mask = data > .8
data[mask] = np.nan
df = pd.DataFrame(data, columns=['A', 'B'])
df = add_missing_value_cols(df, cols=None, dtype=float)
test_eq(df['A'].isnull().sum(), df['missing_A'].sum())
test_eq(df['B'].isnull().sum(), df['missing_B'].sum())
df
A B missing_A missing_B
0 -0.951902 -0.337558 0.0 0.0
1 0.479144 NaN 0.0 1.0
2 -0.839601 -1.165401 0.0 0.0
3 NaN -0.395976 1.0 0.0
4 0.302734 0.643594 0.0 0.0
5 0.303700 NaN 0.0 1.0
6 NaN -0.198111 1.0 0.0
7 -0.733932 -0.834613 0.0 0.0
8 -0.212730 0.031541 0.0 0.0
9 -2.515209 NaN 0.0 1.0
{% endraw %} {% raw %}

add_missing_timestamps[source]

add_missing_timestamps(df, datetime_col, groupby=None, fill_value=nan, range_by_group=True, freq=None)

Fills missing timestamps in a dataframe to a desired frequency Args: df: pandas DataFrame datetime_col: column that contains the datetime data (without duplicates within groups) groupby: column used to identify unique_ids fill_value: values that will be inserted where missing dates exist. Default: np.nan range_by_group: if True, dates will be filled between min and max dates for each group. Otherwise, between the min and max dates in the df. freq: frequency used to fill in the missing datetimes

{% endraw %} {% raw %}
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date feature1 feature2
0 2021-05-01 0.224848 0.149688
1 2021-05-03 0.695715 0.603295
2 2021-05-05 0.274871 0.015257
3 2021-05-06 0.813230 0.344615
4 2021-05-07 0.368379 0.701564
{% endraw %} {% raw %}
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby=None, 
                                   fill_value=np.nan, 
                                   range_by_group=False)
test_eq(output_df, expected_output_df)
date feature1 feature2
0 2021-05-01 0.224848 0.149688
1 2021-05-02 NaN NaN
2 2021-05-03 0.695715 0.603295
3 2021-05-04 NaN NaN
4 2021-05-05 0.274871 0.015257
5 2021-05-06 0.813230 0.344615
6 2021-05-07 0.368379 0.701564
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)

date_df_with_missing_dates
date id feature1 feature2
0 2021-05-03 0 0.861781 0.199095
1 2021-05-05 0 0.670151 0.290920
2 2021-05-06 0 0.089097 0.162642
3 2021-05-07 0 0.068788 0.806722
4 2021-05-01 1 0.004083 0.862333
5 2021-05-03 1 0.579692 0.776127
6 2021-05-04 1 0.094719 0.475509
7 2021-05-06 1 0.981296 0.126282
{% endraw %} {% raw %}
expected_output_df = date_df.drop([0,1,13]).reset_index(drop=True)  
expected_output_df.loc[[1,6,9], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=True)
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-03 0 0.861781 0.199095
1 2021-05-04 0 NaN NaN
2 2021-05-05 0 0.670151 0.290920
3 2021-05-06 0 0.089097 0.162642
4 2021-05-07 0 0.068788 0.806722
5 2021-05-01 1 0.004083 0.862333
6 2021-05-02 1 NaN NaN
7 2021-05-03 1 0.579692 0.776127
8 2021-05-04 1 0.094719 0.475509
9 2021-05-05 1 NaN NaN
10 2021-05-06 1 0.981296 0.126282
{% endraw %} {% raw %}
expected_output_df = date_df.copy() 
expected_output_df.loc[[0,1,3,8,11,13], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=False)
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 0 NaN NaN
1 2021-05-02 0 NaN NaN
2 2021-05-03 0 0.861781 0.199095
3 2021-05-04 0 NaN NaN
4 2021-05-05 0 0.670151 0.290920
5 2021-05-06 0 0.089097 0.162642
6 2021-05-07 0 0.068788 0.806722
7 2021-05-01 1 0.004083 0.862333
8 2021-05-02 1 NaN NaN
9 2021-05-03 1 0.579692 0.776127
10 2021-05-04 1 0.094719 0.475509
11 2021-05-05 1 NaN NaN
12 2021-05-06 1 0.981296 0.126282
13 2021-05-07 1 NaN NaN
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date feature1 feature2
0 2021-05-01 00:00:00 0.176007 0.999164
1 2021-05-01 08:00:00 0.985267 0.154921
2 2021-05-01 16:00:00 0.989456 0.118419
3 2021-05-01 20:00:00 0.382382 0.009891
{% endraw %} {% raw %}
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 'date', groupby=None, fill_value=np.nan, range_by_group=False, freq='4H')
test_eq(output_df, expected_output_df)
date feature1 feature2
0 2021-05-01 00:00:00 0.176007 0.999164
1 2021-05-01 04:00:00 NaN NaN
2 2021-05-01 08:00:00 0.985267 0.154921
3 2021-05-01 12:00:00 NaN NaN
4 2021-05-01 16:00:00 0.989456 0.118419
5 2021-05-01 20:00:00 0.382382 0.009891
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,3,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.325401 0.005866
1 2021-05-01 16:00:00 0 0.741933 0.938274
2 2021-05-01 20:00:00 0 0.364071 0.210364
3 2021-05-01 00:00:00 1 0.763889 0.989521
4 2021-05-01 04:00:00 1 0.517116 0.550497
5 2021-05-01 16:00:00 1 0.451959 0.469879
{% endraw %} {% raw %}
expected_output_df = date_df.drop([0,1,11]).reset_index(drop=True)  
expected_output_df.loc[[1,6,7], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=True, 
                                   freq='4H')
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.325401 0.005866
1 2021-05-01 12:00:00 0 NaN NaN
2 2021-05-01 16:00:00 0 0.741933 0.938274
3 2021-05-01 20:00:00 0 0.364071 0.210364
4 2021-05-01 00:00:00 1 0.763889 0.989521
5 2021-05-01 04:00:00 1 0.517116 0.550497
6 2021-05-01 08:00:00 1 NaN NaN
7 2021-05-01 12:00:00 1 NaN NaN
8 2021-05-01 16:00:00 1 0.451959 0.469879
{% endraw %} {% raw %}
expected_output_df = date_df.copy() 
expected_output_df.loc[[0,1,3,8,9,11], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=False, 
                                   freq='4H')
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 00:00:00 0 NaN NaN
1 2021-05-01 04:00:00 0 NaN NaN
2 2021-05-01 08:00:00 0 0.325401 0.005866
3 2021-05-01 12:00:00 0 NaN NaN
4 2021-05-01 16:00:00 0 0.741933 0.938274
5 2021-05-01 20:00:00 0 0.364071 0.210364
6 2021-05-01 00:00:00 1 0.763889 0.989521
7 2021-05-01 04:00:00 1 0.517116 0.550497
8 2021-05-01 08:00:00 1 NaN NaN
9 2021-05-01 12:00:00 1 NaN NaN
10 2021-05-01 16:00:00 1 0.451959 0.469879
11 2021-05-01 20:00:00 1 NaN NaN
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby=None, fill_value=np.nan, range_by_group=False, freq='4H'), 
          contains='cannot reindex from a duplicate axis')
date feature1 feature2
0 2021-05-01 00:00:00 0.395495 0.382825
1 2021-05-01 08:00:00 0.708897 0.567553
2 2021-05-01 16:00:00 0.357399 0.279567
3 2021-05-01 16:00:00 0.389376 0.731096
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=True, freq='4H'), 
          contains='cannot handle a non-unique multi-index!')
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.411064 0.315694
1 2021-05-01 12:00:00 0 0.074582 0.044496
2 2021-05-01 16:00:00 0 0.596242 0.045364
3 2021-05-01 16:00:00 0 0.740436 0.896142
4 2021-05-01 00:00:00 1 0.560498 0.550509
5 2021-05-01 04:00:00 1 0.302942 0.520996
6 2021-05-01 16:00:00 1 0.334496 0.455860
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=False, freq='4H'), 
          contains='cannot handle a non-unique multi-index!')
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.639965 0.634852
1 2021-05-01 12:00:00 0 0.565357 0.121770
2 2021-05-01 16:00:00 0 0.662815 0.526833
3 2021-05-01 16:00:00 0 0.406274 0.439197
4 2021-05-01 00:00:00 1 0.059250 0.331034
5 2021-05-01 04:00:00 1 0.644653 0.493763
6 2021-05-01 16:00:00 1 0.268020 0.287030
{% endraw %} {% raw %}

time_encoding[source]

time_encoding(series, freq, max_val=None)

Transforms a pandas series of dtype datetime64 (of any freq) or DatetimeIndex into 2 float arrays

Available options: microsecond, millisecond, second, minute, hour, day = day_of_month = dayofmonth, day_of_week = weekday = dayofweek, day_of_year = dayofyear, week = week_of_year = weekofyear, month and year

{% endraw %} {% raw %}
{% endraw %} {% raw %}
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    tdf = pd.DataFrame(pd.date_range('2021-03-01', datetime.datetime.today()), columns=['date'])
    a,b = time_encoding(tdf.date, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    dateindex = pd.date_range('2021-03-01', datetime.datetime.today())
    a,b = time_encoding(dateindex, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
dow_sin, dow_cos = time_encoding(date_df['date'], 'dayofweek')
plt.plot(dow_sin)
plt.plot(dow_cos)
plt.title('DayOfWeek')
plt.show()
date_df['dow_sin'] = dow_sin
date_df['dow_cos'] = dow_cos
date_df
date id feature1 feature2 dow_sin dow_cos
0 2021-05-01 00:00:00 0 0.180139 0.949770 -0.974928 -0.222521
1 2021-05-01 04:00:00 0 0.745061 0.392594 -0.974928 -0.222521
2 2021-05-01 08:00:00 0 0.639965 0.634852 -0.974928 -0.222521
3 2021-05-01 12:00:00 0 0.565357 0.121770 -0.974928 -0.222521
4 2021-05-01 16:00:00 0 0.662815 0.526833 -0.974928 -0.222521
5 2021-05-01 20:00:00 0 0.406274 0.439197 -0.974928 -0.222521
6 2021-05-01 00:00:00 1 0.059250 0.331034 -0.974928 -0.222521
7 2021-05-01 04:00:00 1 0.644653 0.493763 -0.974928 -0.222521
8 2021-05-01 08:00:00 1 0.009817 0.442220 -0.974928 -0.222521
9 2021-05-01 12:00:00 1 0.327216 0.902367 -0.974928 -0.222521
10 2021-05-01 16:00:00 1 0.268020 0.287030 -0.974928 -0.222521
11 2021-05-01 20:00:00 1 0.451195 0.431382 -0.974928 -0.222521
{% endraw %} {% raw %}

forward_gaps[source]

forward_gaps(o, normalize=True)

Number of sequence steps since previous real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

backward_gaps[source]

backward_gaps(o, normalize=True)

Number of sequence steps to next real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

nearest_gaps[source]

nearest_gaps(o, normalize=True)

Number of sequence steps to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

get_gaps[source]

get_gaps(o:Tensor, forward:bool=True, backward:bool=True, nearest:bool=True, normalize:bool=True)

Number of sequence steps from previous, to next and/or to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = torch.rand(1, 2, 8)
arr = t.numpy()
t[t <.6] = np.nan
test_ge(nearest_gaps(t).min().item(), 0)
test_ge(nearest_gaps(arr).min(), 0)
test_le(nearest_gaps(t).min().item(), 1)
test_le(nearest_gaps(arr).min(), 1)
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
ag = get_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)
{% endraw %} {% raw %}

add_delta_timestamp_cols[source]

add_delta_timestamp_cols(df, cols=None, groupby=None, forward=True, backward=True, nearest=True, normalize=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 2))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))

cols = ['date', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float})
date_df.loc[[1,3,4],'feature1'] = np.nan
date_df
date feature1
0 2021-05-01 0.393696
1 2021-05-02 NaN
2 2021-05-03 0.601472
3 2021-05-04 NaN
4 2021-05-05 NaN
5 2021-05-06 0.994360
6 2021-05-07 0.964120
{% endraw %} {% raw %}
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1])

display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1', normalize=False)
test_eq(expected_output_df, output_df)
date feature1 feature1_dt_fwd feature1_dt_bwd feature1_dt_nearest
0 2021-05-01 0.393696 1 2 1
1 2021-05-02 NaN 1 1 1
2 2021-05-03 0.601472 2 3 2
3 2021-05-04 NaN 1 2 1
4 2021-05-05 NaN 2 1 1
5 2021-05-06 0.994360 3 1 1
6 2021-05-07 0.964120 1 1 1
{% endraw %} {% raw %}
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))

cols = ['date', 'id', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float})
date_df.loc[[1,3,4,8,9,11],'feature1'] = np.nan
date_df
date id feature1
0 2021-05-01 0 0.548022
1 2021-05-02 0 NaN
2 2021-05-03 0 0.342199
3 2021-05-04 0 NaN
4 2021-05-05 0 NaN
5 2021-05-06 0 0.692583
6 2021-05-07 0 0.658843
7 2021-05-01 1 0.917405
8 2021-05-02 1 NaN
9 2021-05-03 1 NaN
10 2021-05-04 1 0.730965
11 2021-05-05 1 NaN
12 2021-05-06 1 0.300240
13 2021-05-07 1 0.580900
{% endraw %} {% raw %}
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1,1,1,2,3,1,2,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1,3,2,1,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1,1,1,1,2,1,1,1])

display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1', groupby='id', normalize=False)
test_eq(expected_output_df, output_df)
date id feature1 feature1_dt_fwd feature1_dt_bwd feature1_dt_nearest
0 2021-05-01 0 0.548022 1 2 1
1 2021-05-02 0 NaN 1 1 1
2 2021-05-03 0 0.342199 2 3 2
3 2021-05-04 0 NaN 1 2 1
4 2021-05-05 0 NaN 2 1 1
5 2021-05-06 0 0.692583 3 1 1
6 2021-05-07 0 0.658843 1 1 1
7 2021-05-01 1 0.917405 1 3 1
8 2021-05-02 1 NaN 1 2 1
9 2021-05-03 1 NaN 2 1 1
10 2021-05-04 1 0.730965 3 2 2
11 2021-05-05 1 NaN 1 1 1
12 2021-05-06 1 0.300240 2 1 1
13 2021-05-07 1 0.580900 1 1 1
{% endraw %}

SlidingWindow and SlidingWindowPanel are two useful functions that allow you to create an array with segments of a pandas dataframe based on multiple criteria.

{% raw %}

SlidingWindow[source]

SlidingWindow(window_len:int, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True)

Applies a sliding window to a 1d or 2d input (np.ndarray, torch.Tensor or pd.DataFrame) Args: window_len = length of lookback window stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default) or a given step (int). Previous steps will be discarded. pad_remainder = allows padding of remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding = 'pre' or 'post' (optional, defaults to 'post'): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence ('pre' from the end, and 'post' from the beginning) padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict (y). If get_y is [] horizon will be set to 0.

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y.
                      [] means no y data is created (unlabeled data).
y_func              = optional function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
output_processor    = optional function to process the final output (X (and y if available)). This is useful when some values need to be removed.
                      The function should take X and y (even if it's None) as arguments.
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y

Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X
input shape: (13, 3)
array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [ 0.,  0.,  0.,  1.,  1.]]])
{% endraw %} {% raw %}
wl = 5
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl)(t)
test_eq(X.shape[1:], (1, wl))
itemify(X,)
input shape: (10,)
(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]
{% endraw %} {% raw %}
wl = 5
h = 1

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9)]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=3, horizon=1, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[3, 4, 5, 6, 7]]), 8)]
{% endraw %} {% raw %}
wl = 5
start = 3

t = np.arange(20)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=None, horizon=1, start=start)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), 8), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[13, 14, 15, 16, 17]]), 18)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=1, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9), (array([[5, 6, 7, 8, 9]]), 10), (array([[ 6,  7,  8,  9, 10]]), 11), (array([[ 7,  8,  9, 10, 11]]), 12), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[ 9, 10, 11, 12, 13]]), 14), (array([[10, 11, 12, 13, 14]]), 15), (array([[11, 12, 13, 14, 15]]), 16), (array([[12, 13, 14, 15, 16]]), 17), (array([[13, 14, 15, 16, 17]]), 18), (array([[14, 15, 16, 17, 18]]), 19)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var']).T
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None, seq_first=False)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
var 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), array([  5,  50, 500])), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), array([  6,  60, 600])), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), array([  7,  70, 700])), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), array([  8,  80, 800])), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), array([  9,  90, 900]))]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1, get_y="var_0")(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), 5), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), 6), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), 7), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), 8), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), 9)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())
input shape: torch.Size([10, 3])
var_0 var_1 target
0 0 0 0
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
5 5 50 500
6 6 60 600
7 7 70 700
8 8 80 800
9 9 90 900
[(array([[ 0,  1,  2,  3,  4],
       [ 0, 10, 20, 30, 40]]), 500), (array([[ 1,  2,  3,  4,  5],
       [10, 20, 30, 40, 50]]), 600), (array([[ 2,  3,  4,  5,  6],
       [20, 30, 40, 50, 60]]), 700), (array([[ 3,  4,  5,  6,  7],
       [30, 40, 50, 60, 70]]), 800), (array([[ 4,  5,  6,  7,  8],
       [40, 50, 60, 70, 80]]), 900)]
{% endraw %} {% raw %}
n_vars = 3

t = (np.random.rand(1000, n_vars) - .5).cumsum(0)
print(t.shape)
plt.plot(t)
plt.show()
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=[0,1], get_y=2)(t)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(1000, 3)
(200, 2, 5) (200,)
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 3)
var_0 var_1 target
0 0.038224 -0.376706 -0.217861
1 -0.160785 -0.835162 -0.126483
2 -0.027506 -1.017696 -0.468260
3 -0.511896 -1.424673 -0.680972
4 -0.509468 -1.659093 -0.334035
... ... ... ...
95 -3.402801 -1.505637 8.376279
96 -3.022874 -2.000598 8.321213
97 -2.755679 -1.701845 8.771309
98 -2.806226 -1.721845 8.318951
99 -2.489690 -1.738804 7.969768

100 rows × 3 columns

(96, 2, 5) (96,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
var_0 var_1 var_2 var_3 target
0 0.342768 0.123367 -0.230604 0.434579 -0.075454
1 -0.087556 -0.060395 -0.670323 0.209174 0.002378
2 -0.205709 -0.440546 -0.907669 -0.037758 -0.030163
3 -0.678621 -0.635955 -0.873133 -0.252857 -0.501020
4 -0.549280 -0.328119 -0.738205 -0.234619 -0.757449
... ... ... ... ... ...
95 1.532412 -3.624153 -2.711303 -0.308570 1.315960
96 1.844532 -3.245411 -2.871037 -0.634700 1.117677
97 2.193751 -3.406014 -2.843080 -0.593142 1.132929
98 2.265922 -3.370546 -3.029408 -0.877587 0.699351
99 2.546047 -3.197526 -2.853532 -0.910442 0.958781

100 rows × 5 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5

t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 0.428969 0.576676 0.716230 0.499310 0.228932 0.133305 0.576377 0.545225 0.707670 0.498284 ... 3.360701 3.247632 3.553385 3.572756 3.255296 2.781663 2.917785 3.367665 3.672235 3.439769
var_1 0.450992 0.137983 0.315005 0.738603 0.890131 1.011961 1.377271 0.966110 1.453431 1.837302 ... -1.480663 -1.903888 -1.635508 -1.629121 -1.513037 -1.473930 -0.986340 -1.275641 -1.182216 -1.538784
var_2 0.015815 0.366287 0.150317 0.166561 0.039830 0.095263 0.198012 0.367608 0.132965 -0.067098 ... 4.276629 4.102779 4.601456 4.547265 4.219582 3.879478 3.750958 3.783258 4.232514 4.513494
var_3 0.362741 0.479916 0.465278 0.004941 -0.169799 -0.071437 0.277505 0.621059 0.226092 0.690035 ... -2.706781 -2.231692 -2.641493 -2.566393 -2.948474 -2.589325 -2.239048 -2.554037 -2.499734 -2.657905
target -0.441334 -0.254336 0.190958 0.383855 0.065698 0.522897 0.736381 0.685222 1.027500 1.265993 ... 1.997635 1.538644 1.664044 2.084619 2.100733 1.699683 1.553842 1.673061 1.784292 1.795289

5 rows × 100 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 0.350052 0.173699 0.005922 0.355863 0.365957 0.615358 0.700977 0.543214 0.726413 0.776172 ... 0.389378 0.628442 0.993668 0.591645 0.212132 0.057988 0.433855 0.662808 0.544976 0.459968
var_1 0.478304 0.025725 -0.259219 0.212233 0.541163 1.039264 1.521434 1.165189 0.987953 0.633477 ... 0.466418 0.568701 0.736825 1.235017 0.783833 1.240199 1.635467 1.526467 1.770768 1.355009
var_2 -0.238427 -0.249681 -0.314833 -0.422697 -0.891657 -1.027271 -0.756490 -1.014797 -1.179974 -1.622540 ... -3.771943 -3.625772 -3.864125 -3.407895 -3.556346 -3.534115 -3.794378 -4.194241 -3.848895 -4.031116
var_3 0.296829 0.677988 0.948550 1.310321 1.102458 1.451807 1.428559 1.845095 2.243779 2.632196 ... 1.919704 1.896870 2.327217 2.568496 2.948287 2.913993 3.209910 3.683606 3.694195 3.704561
target -0.112487 0.231846 0.418266 0.309910 0.493302 0.137467 0.027788 -0.433830 -0.079208 -0.014561 ... -0.692240 -0.970414 -1.082150 -1.047488 -0.991087 -0.753753 -0.581393 -0.615165 -0.455276 -0.325558

5 rows × 100 columns

(20, 4, 5) (20,)
{% endraw %} {% raw %}
from tsai.data.validation import TrainValidTestSplitter
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
splits = TrainValidTestSplitter(valid_size=.2, shuffle=False)(y)
X.shape, y.shape, splits
(100, 5)
var_0 var_1 var_2 var_3 target
0 -0.303251 -0.415882 0.436840 -0.496633 0.209879
1 -0.356825 0.013693 0.808101 -0.402868 -0.024402
2 -0.395547 0.457111 0.799137 -0.629520 0.045770
3 -0.651738 0.056039 0.804739 -0.376082 0.187842
4 -0.453526 -0.002073 0.692950 -0.802900 0.595395
... ... ... ... ... ...
95 -2.190805 0.710151 -2.266219 1.644569 -5.241154
96 -2.550820 0.282175 -2.651494 1.454871 -4.800781
97 -2.965517 0.351453 -2.764412 1.883754 -4.662149
98 -2.750529 0.193644 -2.559884 1.710565 -4.537585
99 -2.407802 0.027132 -2.679037 1.649252 -4.595858

100 rows × 5 columns

((96, 4, 5),
 (96,),
 ((#77) [0,1,2,3,4,5,6,7,8,9...], (#19) [77,78,79,80,81,82,83,84,85,86...]))
{% endraw %} {% raw %}
data = np.concatenate([np.linspace(0, 1, 11).reshape(-1,1).repeat(2, 1), np.arange(11).reshape(-1,1)], -1)
df_test = pd.DataFrame(data, columns=['col1', 'col2', 'target'])
df_test['target'] = df_test['target'].astype(int)
df_test
col1 col2 target
0 0.0 0.0 0
1 0.1 0.1 1
2 0.2 0.2 2
3 0.3 0.3 3
4 0.4 0.4 4
5 0.5 0.5 5
6 0.6 0.6 6
7 0.7 0.7 7
8 0.8 0.8 8
9 0.9 0.9 9
10 1.0 1.0 10
{% endraw %} {% raw %}
def _y_func(o): return o[:, 0]
{% endraw %} {% raw %}
for wl in np.arange(1, 20):
    x, y = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=_y_func)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/wl))
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[2], wl)
    test_close(x[:, 0, 0]*10, y)
{% endraw %} {% raw %}
for wl in np.arange(1, 20):
    x, y = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/ wl))
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
for wl in np.arange(1, len(df_test)+1):
    x, y = SlidingWindow(wl, None, pad_remainder=False, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)(df_test)
    test_eq(x.shape[0], len(df_test) // wl)
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
for wl in np.arange(1, 20):
    x, _ = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=[], horizon=0)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/wl))
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
for wl in np.arange(2, len(df_test)):
    x, _ = SlidingWindow(wl, wl, pad_remainder=False, get_x=['col1', 'col2'], get_y=[], horizon=0)(df_test)
    test_eq(x.shape[0], len(df_test) // wl)
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
df = pd.DataFrame()
df['sample_id'] = np.concatenate([np.ones(n)*(i + 1) for i,n in enumerate([13])])
df['var1'] = df['sample_id'] + df.index.values - 1
df['var2'] = df['var1'] * 10
df['target'] = (df['var1']).astype(int)
df['sample_id'] = df['sample_id'].astype(int)
df
sample_id var1 var2 target
0 1 0.0 0.0 0
1 1 1.0 10.0 1
2 1 2.0 20.0 2
3 1 3.0 30.0 3
4 1 4.0 40.0 4
5 1 5.0 50.0 5
6 1 6.0 60.0 6
7 1 7.0 70.0 7
8 1 8.0 80.0 8
9 1 9.0 90.0 9
10 1 10.0 100.0 10
11 1 11.0 110.0 11
12 1 12.0 120.0 12
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="pre", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y
(array([[[ 4.,  5.,  6.],
         [40., 50., 60.]],
 
        [[ 6.,  7.,  8.],
         [60., 70., 80.]]]),
 array([[ 7,  8,  9, 10],
        [ 9, 10, 11, 12]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y
(array([[[nan,  3.,  4.],
         [nan, 30., 40.]],
 
        [[ 4.,  5.,  6.],
         [40., 50., 60.]],
 
        [[ 6.,  7.,  8.],
         [60., 70., 80.]]]),
 array([[ 5,  6,  7,  8],
        [ 7,  8,  9, 10],
        [ 9, 10, 11, 12]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="post", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y
(array([[[ 3.,  4.,  5.],
         [30., 40., 50.]],
 
        [[ 5.,  6.,  7.],
         [50., 60., 70.]]]),
 array([[ 6,  7,  8,  9],
        [ 8,  9, 10, 11]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y
(array([[[ 3.,  4.,  5.],
         [30., 40., 50.]],
 
        [[ 5.,  6.,  7.],
         [50., 60., 70.]],
 
        [[ 7.,  8.,  9.],
         [70., 80., 90.]]]),
 array([[ 6.,  7.,  8.,  9.],
        [ 8.,  9., 10., 11.],
        [10., 11., 12., nan]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y
(array([[[nan, nan, nan, nan,  3.,  4.,  5.,  6.,  7.,  8.],
         [nan, nan, nan, nan, 30., 40., 50., 60., 70., 80.]]]),
 array([[ 9, 10, 11, 12]]))
{% endraw %} {% raw %}
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
                     get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
                     ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y
(array([[[  3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.],
         [ 30.,  40.,  50.,  60.,  70.,  80.,  90., 100., 110., 120.]]]),
 array([[nan, nan, nan, nan]]))
{% endraw %} {% raw %}

SlidingWindowPanel[source]

SlidingWindowPanel(window_len:int, unique_id_cols:list, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True, return_key:bool=False, verbose:bool=True)

Applies a sliding window to a pd.DataFrame.

Args: window_len = length of lookback window unique_id_cols = pd.DataFrame columns that will be used to identify a time series for each entity. stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows padding of remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding = 'pre' or 'post' (optional, defaults to 'post'): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence ('pre' from the end, and 'post' from the beginning) padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict (y). If get_y is [] horizon will be set to 0.

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y.
                      [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
output_processor    = optional function to filter output (X (and y if available)). This is useful when some values need to be removed. The function
                      should take X and y (even if it's None) as arguments.
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y
return_key          = when True, the key corresponding to unique_id_cols for each sample is returned
verbose             = controls verbosity. True or 1 displays progress bar. 2 or more show records that cannot be created due to its length.


Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
samples = 100_000
wl = 5
n_vars = 10

t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
df2 = df.copy()
df3 = df.copy()
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
df2 = df2.loc[:3]
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
df = df.append(df2).append(df3).reset_index(drop=True)
df['index'] = np.arange(len(df))
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 time device target region index
0 44547 445470 4454700 44547000 445470000 4454700000 44547000000 445470000000 4454700000000 44547000000000 44547 0 0 A 44547
1 73202 732020 7320200 73202000 732020000 7320200000 73202000000 732020000000 7320200000000 73202000000000 73202 0 1 A 73202
2 85388 853862 8538602 85386000 853860000 8538600000 85386000000 853860000000 8538600000000 85386000000000 85386 2 3 B 185390
3 58426 584260 5842600 58426000 584260000 5842600000 58426000000 584260000000 5842600000000 58426000000000 58426 0 1 A 58426
4 93637 936370 9363700 93637000 936370000 9363700000 93637000000 936370000000 9363700000000 93637000000000 93637 0 1 A 93637
(200004, 15)
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=False)(df)
X.shape, y.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
X, y, key = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                               horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=True)(df)
X.shape, y.shape, key.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,), (199992,))
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
def y_max(o):
    """Return the per-sample maximum taken across the window axis (axis=1)."""
    return np.max(o, axis=1)
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          y_func=y_max, horizon=5, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199982, 10, 5), (199982,))
{% endraw %} {% raw %}

identify_padding[source]

identify_padding(float_mask, value=-1)

Identifies padded subsequences in a mask of type float

This function identifies as padded those subsequences in which, across all channels, every value from some point to the end of the sequence (last dimension) is nan, and sets those values to the selected value (default = -1)

Args: mask: boolean or float mask value: scalar that will be used to identify padded subsequences

{% endraw %} {% raw %}
{% endraw %} {% raw %}
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, _ = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X = tensor(X)
X[0, 1, -2:] = np.nan
X[1,..., :3] = np.nan
print(X)
identify_padding(torch.isnan(X).float())
input shape: (13, 3)
tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  1.,  2., nan, nan],
         [ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  0.,  0.,  0.,  0.]],

        [[nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  0.,  0.]],

        [[10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [ 0.,  0.,  0.,  1.,  1.]]])
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.]],

        [[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.]]])
{% endraw %}