--- title: Data preparation keywords: fastai sidebar: home_sidebar summary: "Functions required to prepare X (and y) from a pandas dataframe." description: "Functions required to prepare X (and y) from a pandas dataframe." nb_path: "nbs/011_data.preparation.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

df2Xy[source]

df2Xy(df, sample_col=None, feat_col=None, data_cols=None, target_col=None, steps_in_rows=False, to3d=True, splits=None, sort_by=None, ascending=True, y_func=None, return_names=False)

This function allows you to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset. sample_col: column that uniquely identifies each sample. feat_col: used for multivariate datasets. It indicates which column identifies the feature in each row. data_cols: indicates the column/s where the data is located. If None, it means all columns (except the sample_col, feat_col, and target_col). target_col: indicates the column/s where the target is. steps_in_rows: flag to indicate if each step is in a different row or in a different column (default). to3d: turns X to 3d (including univariate time series). sort_by: used to indicate how to sort the dataframe. y_func: function used to calculate y for each sample (and target_col). return_names: flag to return the names of the columns from where X was generated.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

split_Xy[source]

split_Xy(X, y=None, splits=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Toy long-format frame: 3 samples (sample_id 1..3) with 3 rows each and two value columns.
df = pd.DataFrame()
df['sample_id'] = np.array([1,1,1,2,2,2,3,3,3])
# var1 / var2 encode the sample id (x10 / x100) plus the row index, so every value is traceable.
df['var1'] = df['sample_id'] * 10 + df.index.values
df['var2'] = df['sample_id'] * 100 + df.index.values
df
sample_id var1 var2
0 1 10 100
1 1 11 101
2 1 12 102
3 2 23 203
4 2 24 204
5 2 25 205
6 3 36 306
7 3 37 307
8 3 38 308
{% endraw %} {% raw %}
# steps_in_rows=True: each row of a sample is one step, so the first sample is expected
# to come out as a (2 variables, 3 steps) array built from var1 and var2.
X_df, y_df = df2Xy(df, sample_col='sample_id', steps_in_rows=True)
test_eq(X_df[0], np.array([[10, 11, 12], [100, 101, 102]]))
{% endraw %} {% raw %}
# Synthetic long-format dataset: 1,000 samples x 10 feature rows each (10,000 rows in total),
# with 6 continuous columns and two categorical target columns.
n_samples = 1_000
n_rows = 10_000

# Each sample id is repeated n_rows // n_samples times; feat ids cycle 0..9 within every sample.
sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
cont = np.random.randn(n_rows, 6)
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array(['a', 'b', 'c'])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array(['a', 'b', 'c'])[ind_cat2]
# Fix: the second target column now uses `target2`. Previously `target` was concatenated
# twice, which left `ind_cat2`/`target2` unused and made both target columns identical.
# Mixing numeric and string arrays yields a string array, so numeric columns are cast back below.
data = np.concatenate([sample_ids, feat_ids, cont, target, target2], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
# Random permutation of the row positions (sampling without replacement) used to shuffle the rows.
idx = np.random.choice(np.arange(len(df)), len(df), False)
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
df = df.loc[idx].reset_index(drop=True)
df
sample_id feat_id 1 2 3 4 5 6 target target2
0 846 8 -0.925849 0.674087 0.379934 0.518507 0.097288 0.771532 a a
1 58 1 0.839221 0.389044 -0.608590 -0.246981 -1.615276 0.824759 b b
2 934 3 -0.947548 -0.906396 -2.291011 -0.742853 1.807500 0.203940 a a
3 920 8 -0.592634 -0.146599 -0.123489 0.330388 -0.791193 0.062297 c c
4 427 1 0.059581 -0.932821 -0.135653 0.341220 1.501105 -0.728656 c c
... ... ... ... ... ... ... ... ... ... ...
9995 528 2 1.014302 -1.207861 1.655349 0.473558 0.157909 -0.926913 a a
9996 74 8 0.764139 -1.152124 1.401205 1.353842 -0.334016 0.604483 c c
9997 633 8 0.274706 -1.818892 -0.403739 1.216053 -0.053223 0.523597 c c
9998 137 7 0.634010 -0.222664 0.256361 -0.310036 0.039044 -2.166115 b b
9999 37 9 -0.100446 -0.458405 -1.289415 0.465968 1.206267 1.447000 a a

10000 rows × 10 columns

{% endraw %} {% raw %}
# y_func reduces the per-row targets of each sample to a single label (the mode along axis 1).
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# NOTE(review): called as `df2xy` while the documented name is `df2Xy` — presumably an alias; confirm.
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', data_cols=['1', '2', '3', '4', '5', '6'], target_col=['target'], 
             sort_by=['sample_id', 'feat_id'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000,))
# Spot-check one randomly chosen sample against the same rows taken from the sorted dataframe.
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
# y_func reduces the per-row targets of each sample to a single label (the mode along axis 1).
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# data_cols is omitted here; two target columns are requested, so y is expected to be (1000, 2).
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], sort_by=['sample_id', 'feat_id'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000, 2))
# Spot-check one randomly chosen sample against the same rows taken from the sorted dataframe.
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])
{% endraw %} {% raw %}
# One row per sample: the two value columns are passed as data_cols and 'target' as the label.
TESTDATA = StringIO("""sample_id;value_0;value_1;target
    rob;2;3;hot
    alice;6;7;lukewarm
    eve;11;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# Rows are sorted by sample_id before conversion; X is expected to be (3, 1, 2) and y (3,).
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
X, y
sample_id value_0 value_1 target
0 rob 2 3 hot
1 alice 6 7 lukewarm
2 eve 11 12 cold
(array([[[ 6,  7]],
 
        [[11, 12]],
 
        [[ 2,  3]]]),
 array(['lukewarm', 'cold', 'hot'], dtype=object))
{% endraw %} {% raw %}
# Each sample has one row per timestep (2 timesteps) with a single 'values' column.
TESTDATA = StringIO("""sample_id;timestep;values;target
    rob;1;2;hot
    alice;1;6;lukewarm
    eve;1;11;cold
    
    rob;2;3;hot
    alice;2;7;lukewarm
    eve;2;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# y_func reduces the per-row targets of each sample to a single label (the mode along axis 1).
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# Sorted by 'timestep' only; X is expected to be (3, 1, 2) and y (3,).
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)
sample_id timestep values target
0 rob 1 2 hot
1 alice 1 6 lukewarm
2 eve 1 11 cold
3 rob 2 3 hot
4 alice 2 7 lukewarm
5 eve 2 12 cold
[[[ 6 11]]

 [[ 2  7]]

 [[12  3]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
# Multivariate example: 'trait' is the feature column, with 4 traits per sample and 2 value columns.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows (random permutation without replacement) before converting.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
# y_func reduces the per-row targets of each sample to a single label (the mode along axis 1).
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# No sort_by is passed; only the output shapes are checked below.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))
sample_id trait value_0 value_1 target
10 eve blue 12 12 cold
1 rob yellow 3 4 hot
0 rob green 2 3 hot
6 alice blue 8 9 lukewarm
7 alice red 9 10 lukewarm
2 rob blue 4 5 hot
11 eve red 13 14 cold
9 eve green 10 11 cold
3 rob red 5 6 hot
8 eve yellow 11 12 cold
4 alice green 6 7 lukewarm
5 alice yellow 7 8 lukewarm
[[[12 12]
  [ 3  4]
  [ 2  3]
  [ 8  9]]

 [[ 9 10]
  [ 4  5]
  [13 14]
  [10 11]]

 [[ 5  6]
  [11 12]
  [ 6  7]
  [ 7  8]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}
# Same multivariate layout as the single-target example, but with two target columns.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
    rob;green;2;3;hot;good
    rob;yellow;3;4;hot;good
    rob;blue;4;5;hot;good
    rob;red;5;6;hot;good
    alice;green;6;7;lukewarm;good
    alice;yellow;7;8;lukewarm;good
    alice;blue;8;9;lukewarm;good
    alice;red;9;10;lukewarm;good
    eve;yellow;11;12;cold;bad
    eve;green;10;11;cold;bad
    eve;blue;12;12;cold;bad
    eve;red;13;14;cold;bad
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# y_func reduces the per-row targets of each sample to a single label (the mode along axis 1).
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# Two target columns are requested, so y is expected to be (3, 2).
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)
sample_id trait value_0 value_1 target1 target2
0 rob green 2 3 hot good
1 rob yellow 3 4 hot good
2 rob blue 4 5 hot good
3 rob red 5 6 hot good
4 alice green 6 7 lukewarm good
5 alice yellow 7 8 lukewarm good
6 alice blue 8 9 lukewarm good
7 alice red 9 10 lukewarm good
8 eve yellow 11 12 cold bad
9 eve green 10 11 cold bad
10 eve blue 12 12 cold bad
11 eve red 13 14 cold bad
[[[ 2  3]
  [ 3  4]
  [ 4  5]
  [ 5  6]]

 [[ 6  7]
  [ 7  8]
  [ 8  9]
  [ 9 10]]

 [[11 12]
  [10 11]
  [12 12]
  [13 14]]] [['lukewarm' 'good']
 ['cold' 'bad']
 ['hot' 'good']]
{% endraw %} {% raw %}
# Same multivariate data, converted without a target_col: y is expected to be None.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows (random permutation without replacement) before converting.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# target_col is not passed, so no y is produced even though a y_func is supplied.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y, None)
sample_id trait value_0 value_1 target
7 alice red 9 10 lukewarm
6 alice blue 8 9 lukewarm
3 rob red 5 6 hot
10 eve blue 12 12 cold
9 eve green 10 11 cold
5 alice yellow 7 8 lukewarm
4 alice green 6 7 lukewarm
11 eve red 13 14 cold
1 rob yellow 3 4 hot
0 rob green 2 3 hot
8 eve yellow 11 12 cold
2 rob blue 4 5 hot
[[[ 9 10]
  [ 8  9]
  [ 5  6]
  [12 12]]

 [[10 11]
  [ 7  8]
  [ 6  7]
  [13 14]]

 [[ 3  4]
  [ 2  3]
  [11 12]
  [ 4  5]]] None
{% endraw %} {% raw %}
# Multivariate data with one row per (sample, trait, timestep): 3 samples x 4 traits x 2 timesteps.
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
    rob;green;1;2;hot
    rob;yellow;1;3;hot
    rob;blue;1;4;hot
    rob;red;1;5;hot
    alice;green;1;6;lukewarm
    alice;yellow;1;7;lukewarm
    alice;blue;1;8;lukewarm
    alice;red;1;9;lukewarm
    eve;yellow;1;11;cold
    eve;green;1;10;cold
    eve;blue;1;12;cold
    eve;red;1;13;cold
    
    rob;green;2;3;hot
    rob;yellow;2;4;hot
    rob;blue;2;5;hot
    rob;red;2;6;hot
    alice;green;2;7;lukewarm
    alice;yellow;2;8;lukewarm
    alice;blue;2;9;lukewarm
    alice;red;2;10;lukewarm
    eve;yellow;2;12;cold
    eve;green;2;11;cold
    eve;blue;2;13;cold
    eve;red;2;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# y_func reduces the per-row targets of each sample to a single label (the mode along axis 1).
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# Sorted by 'timestep'; with a single data column X is expected to be (3, 4, 2) and y (3,).
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))
sample_id trait timestep values target
0 rob green 1 2 hot
1 rob yellow 1 3 hot
2 rob blue 1 4 hot
3 rob red 1 5 hot
4 alice green 1 6 lukewarm
5 alice yellow 1 7 lukewarm
6 alice blue 1 8 lukewarm
7 alice red 1 9 lukewarm
8 eve yellow 1 11 cold
9 eve green 1 10 cold
10 eve blue 1 12 cold
11 eve red 1 13 cold
12 rob green 2 3 hot
13 rob yellow 2 4 hot
14 rob blue 2 5 hot
15 rob red 2 6 hot
16 alice green 2 7 lukewarm
17 alice yellow 2 8 lukewarm
18 alice blue 2 9 lukewarm
19 alice red 2 10 lukewarm
20 eve yellow 2 12 cold
21 eve green 2 11 cold
22 eve blue 2 13 cold
23 eve red 2 14 cold
[[[ 8  6]
  [ 9  7]
  [12 10]
  [13 11]]

 [[ 4  2]
  [ 5  3]
  [ 9  7]
  [10  8]]

 [[13 11]
  [14 12]
  [ 5  3]
  [ 6  4]]] ['lukewarm' 'cold' 'hot']
{% endraw %} {% raw %}

df2np3d[source]

df2np3d(df, groupby, data_cols=None)

Transforms a df (with the same number of rows per group in groupby) to a 3d ndarray

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Two users with 4 rows each and 3 feature columns.
user = np.array([1,2]).repeat(4).reshape(-1,1)
val = np.random.rand(8, 3)
data = np.concatenate([user, val], axis=-1)
df = pd.DataFrame(data, columns=['user', 'x1', 'x2', 'x3'])
# Expected 3d shape: (2 groups, 3 data columns, 4 rows per group).
test_eq(df2np3d(df, ['user'], ['x1', 'x2', 'x3']).shape, (2, 3, 4))
{% endraw %} {% raw %}

add_missing_value_cols[source]

add_missing_value_cols(df, cols=None, dtype=float)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Replace every value above .8 with NaN to create missing entries in columns A and B.
data = np.random.randn(10, 2)
mask = data > .8
data[mask] = np.nan
df = pd.DataFrame(data, columns=['A', 'B'])
df = add_missing_value_cols(df, cols=None, dtype=float)
# Each missing_<col> column must sum to the number of NaNs in the corresponding column.
test_eq(df['A'].isnull().sum(), df['missing_A'].sum())
test_eq(df['B'].isnull().sum(), df['missing_B'].sum())
df
A B missing_A missing_B
0 -0.468662 -0.022762 0.0 0.0
1 -0.792910 NaN 0.0 1.0
2 0.458363 NaN 0.0 1.0
3 -0.743246 NaN 0.0 1.0
4 -1.198832 NaN 0.0 1.0
5 -0.892410 -0.919750 0.0 0.0
6 -0.355290 -0.139825 0.0 0.0
7 -0.329055 -0.565804 0.0 0.0
8 NaN 0.374796 1.0 0.0
9 0.483930 0.515686 0.0 0.0
{% endraw %} {% raw %}

add_missing_timestamps[source]

add_missing_timestamps(df, datetime_col, groupby=None, fill_value=nan, range_by_group=True, freq=None)

Fills missing timestamps in a dataframe to a desired frequency. Args: df: pandas DataFrame. datetime_col: column that contains the datetime data (without duplicates within groups). groupby: column used to identify unique_ids. fill_value: value that will be inserted where missing dates exist. Default: np.nan. range_by_group: if True, dates will be filled between min and max dates for each group. Otherwise, between the min and max dates in the df. freq: frequency used to fill in the missing datetimes.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Build a daily frame (2021-05-01 .. 2021-05-07) with two random feature columns.
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
# Drop rows 1 and 3 to simulate missing timestamps.
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date feature1 feature2
0 2021-05-01 0.184415 0.149166
1 2021-05-03 0.792900 0.068131
2 2021-05-05 0.211425 0.085824
3 2021-05-06 0.953127 0.682888
4 2021-05-07 0.012330 0.982041
{% endraw %} {% raw %}
# Expected result: the full date range, with NaN features on the two rows that were dropped.
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby=None, 
                                   fill_value=np.nan, 
                                   range_by_group=False)
test_eq(output_df, expected_output_df)
date feature1 feature2
0 2021-05-01 0.184415 0.149166
1 2021-05-02 NaN NaN
2 2021-05-03 0.792900 0.068131
3 2021-05-04 NaN NaN
4 2021-05-05 0.211425 0.085824
5 2021-05-06 0.953127 0.682888
6 2021-05-07 0.012330 0.982041
{% endraw %} {% raw %}
# Same daily range duplicated for two groups: the first half of the rows has id 0, the second half id 1.
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
# Drop rows from both groups, including the first two dates of id 0 and the last date of id 1.
date_df_with_missing_dates = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)

date_df_with_missing_dates
date id feature1 feature2
0 2021-05-03 0 0.118226 0.742834
1 2021-05-05 0 0.115969 0.084061
2 2021-05-06 0 0.125286 0.849047
3 2021-05-07 0 0.929838 0.455664
4 2021-05-01 1 0.742209 0.988968
5 2021-05-03 1 0.167396 0.907378
6 2021-05-04 1 0.971844 0.065978
7 2021-05-06 1 0.780535 0.961809
{% endraw %} {% raw %}
# range_by_group=True fills only between each group's own min and max dates, so the dropped
# leading/trailing rows (0, 1 and 13) are not expected to be restored.
expected_output_df = date_df.drop([0,1,13]).reset_index(drop=True)  
# After the reset, the remaining dropped rows (3, 8, 11) sit at positions 1, 6 and 9.
expected_output_df.loc[[1,6,9], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=True)
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-03 0 0.118226 0.742834
1 2021-05-04 0 NaN NaN
2 2021-05-05 0 0.115969 0.084061
3 2021-05-06 0 0.125286 0.849047
4 2021-05-07 0 0.929838 0.455664
5 2021-05-01 1 0.742209 0.988968
6 2021-05-02 1 NaN NaN
7 2021-05-03 1 0.167396 0.907378
8 2021-05-04 1 0.971844 0.065978
9 2021-05-05 1 NaN NaN
10 2021-05-06 1 0.780535 0.961809
{% endraw %} {% raw %}
# range_by_group=False fills every group between the min and max dates of the whole dataframe,
# so all six dropped rows are expected back with NaN features.
expected_output_df = date_df.copy() 
expected_output_df.loc[[0,1,3,8,11,13], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=False)
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 0 NaN NaN
1 2021-05-02 0 NaN NaN
2 2021-05-03 0 0.118226 0.742834
3 2021-05-04 0 NaN NaN
4 2021-05-05 0 0.115969 0.084061
5 2021-05-06 0 0.125286 0.849047
6 2021-05-07 0 0.929838 0.455664
7 2021-05-01 1 0.742209 0.988968
8 2021-05-02 1 NaN NaN
9 2021-05-03 1 0.167396 0.907378
10 2021-05-04 1 0.971844 0.065978
11 2021-05-05 1 NaN NaN
12 2021-05-06 1 0.780535 0.961809
13 2021-05-07 1 NaN NaN
{% endraw %} {% raw %}
# Build a frame at 4-hour frequency over a single day with two random feature columns.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
# Drop rows 1 and 3 to simulate missing timestamps.
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date feature1 feature2
0 2021-05-01 00:00:00 0.967668 0.835644
1 2021-05-01 08:00:00 0.671406 0.875836
2 2021-05-01 16:00:00 0.289419 0.887240
3 2021-05-01 20:00:00 0.539612 0.320326
{% endraw %} {% raw %}
# Expected result: the full 4-hour range, with NaN features on the two rows that were dropped.
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
# freq is passed explicitly here ('4H').
output_df = add_missing_timestamps(date_df_with_missing_dates, 'date', groupby=None, fill_value=np.nan, range_by_group=False, freq='4H')
test_eq(output_df, expected_output_df)
date feature1 feature2
0 2021-05-01 00:00:00 0.967668 0.835644
1 2021-05-01 04:00:00 NaN NaN
2 2021-05-01 08:00:00 0.671406 0.875836
3 2021-05-01 12:00:00 NaN NaN
4 2021-05-01 16:00:00 0.289419 0.887240
5 2021-05-01 20:00:00 0.539612 0.320326
{% endraw %} {% raw %}
# Same 4-hour range duplicated for two groups: the first half of the rows has id 0, the second half id 1.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
# Drop rows from both groups, including the first two timestamps of id 0 and the last one of id 1.
date_df_with_missing_dates = date_df.drop([0,1,3,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.795633 0.296898
1 2021-05-01 16:00:00 0 0.968405 0.654058
2 2021-05-01 20:00:00 0 0.132000 0.469576
3 2021-05-01 00:00:00 1 0.650885 0.622190
4 2021-05-01 04:00:00 1 0.040657 0.237668
5 2021-05-01 16:00:00 1 0.055866 0.254711
{% endraw %} {% raw %}
# range_by_group=True fills only between each group's own min and max timestamps, so the dropped
# leading/trailing rows (0, 1 and 11) are not expected to be restored.
expected_output_df = date_df.drop([0,1,11]).reset_index(drop=True)  
# After the reset, the remaining dropped rows (3, 8, 9) sit at positions 1, 6 and 7.
expected_output_df.loc[[1,6,7], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=True, 
                                   freq='4H')
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.795633 0.296898
1 2021-05-01 12:00:00 0 NaN NaN
2 2021-05-01 16:00:00 0 0.968405 0.654058
3 2021-05-01 20:00:00 0 0.132000 0.469576
4 2021-05-01 00:00:00 1 0.650885 0.622190
5 2021-05-01 04:00:00 1 0.040657 0.237668
6 2021-05-01 08:00:00 1 NaN NaN
7 2021-05-01 12:00:00 1 NaN NaN
8 2021-05-01 16:00:00 1 0.055866 0.254711
{% endraw %} {% raw %}
# range_by_group=False fills every group between the min and max timestamps of the whole dataframe,
# so all six dropped rows are expected back with NaN features.
expected_output_df = date_df.copy() 
expected_output_df.loc[[0,1,3,8,9,11], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates, 
                                   'date', 
                                   groupby='id', 
                                   fill_value=np.nan, 
                                   range_by_group=False, 
                                   freq='4H')
test_eq(expected_output_df, output_df)
date id feature1 feature2
0 2021-05-01 00:00:00 0 NaN NaN
1 2021-05-01 04:00:00 0 NaN NaN
2 2021-05-01 08:00:00 0 0.795633 0.296898
3 2021-05-01 12:00:00 0 NaN NaN
4 2021-05-01 16:00:00 0 0.968405 0.654058
5 2021-05-01 20:00:00 0 0.132000 0.469576
6 2021-05-01 00:00:00 1 0.650885 0.622190
7 2021-05-01 04:00:00 1 0.040657 0.237668
8 2021-05-01 08:00:00 1 NaN NaN
9 2021-05-01 12:00:00 1 NaN NaN
10 2021-05-01 16:00:00 1 0.055866 0.254711
11 2021-05-01 20:00:00 1 NaN NaN
{% endraw %} {% raw %}
# Failure case without groups: a duplicated timestamp must make add_missing_timestamps raise.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
# Copy the timestamp of row 2 into row 3 to create the duplicate.
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
# The call is expected to fail with an error message containing the text below.
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby=None, fill_value=np.nan, range_by_group=False, freq='4H'), 
          contains='cannot reindex from a duplicate axis')
date feature1 feature2
0 2021-05-01 00:00:00 0.250907 0.965406
1 2021-05-01 08:00:00 0.846068 0.045838
2 2021-05-01 16:00:00 0.338036 0.954646
3 2021-05-01 16:00:00 0.749792 0.452075
{% endraw %} {% raw %}
# Failure case with groups and range_by_group=True: a duplicated timestamp inside a group must raise.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
# Copy the timestamp of row 2 into row 3 (both id 0) to create the duplicate within the group.
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
# The call is expected to fail with an error message containing the text below.
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=True, freq='4H'), 
          contains='cannot handle a non-unique multi-index!')
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.462174 0.933420
1 2021-05-01 12:00:00 0 0.748958 0.153208
2 2021-05-01 16:00:00 0 0.241910 0.920466
3 2021-05-01 16:00:00 0 0.189210 0.319617
4 2021-05-01 00:00:00 1 0.263759 0.105668
5 2021-05-01 04:00:00 1 0.115575 0.365087
6 2021-05-01 16:00:00 1 0.491920 0.411007
{% endraw %} {% raw %}
# Failure case with groups and range_by_group=False: a duplicated timestamp inside a group must raise.
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
# Copy the timestamp of row 2 into row 3 (both id 0) to create the duplicate within the group.
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
# The call is expected to fail with an error message containing the text below.
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=False, freq='4H'), 
          contains='cannot handle a non-unique multi-index!')
date id feature1 feature2
0 2021-05-01 08:00:00 0 0.663514 0.439459
1 2021-05-01 12:00:00 0 0.662052 0.106350
2 2021-05-01 16:00:00 0 0.358776 0.164377
3 2021-05-01 16:00:00 0 0.908822 0.877258
4 2021-05-01 00:00:00 1 0.932823 0.294688
5 2021-05-01 04:00:00 1 0.786879 0.063408
6 2021-05-01 16:00:00 1 0.233234 0.706419
{% endraw %} {% raw %}

time_encoding[source]

time_encoding(series, freq, max_val=None)

Transforms a pandas series of dtype datetime64 (of any freq) or DatetimeIndex into 2 float arrays

Available options: microsecond, millisecond, second, minute, hour, day = day_of_month = dayofmonth, day_of_week = weekday = dayofweek, day_of_year = dayofyear, week = week_of_year = weekofyear, month and year

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Plot the two arrays returned by time_encoding for several freq options,
# using a pandas Series of daily dates from 2021-03-01 up to today as input.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    tdf = pd.DataFrame(pd.date_range('2021-03-01', datetime.today()), columns=['date'])
    a,b = time_encoding(tdf.date, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
# Same plots as for the Series input, but passing the DatetimeIndex directly.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    dateindex = pd.date_range('2021-03-01', datetime.today())
    a,b = time_encoding(dateindex, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()
{% endraw %} {% raw %}
# Encode the day of week of date_df['date'] (date_df comes from an earlier cell) and plot both arrays.
dow_sin, dow_cos = time_encoding(date_df['date'], 'dayofweek')
plt.plot(dow_sin)
plt.plot(dow_cos)
plt.title('DayOfWeek')
plt.show()
# Store the two encodings as new columns of the dataframe.
date_df['dow_sin'] = dow_sin
date_df['dow_cos'] = dow_cos
date_df
date id feature1 feature2 dow_sin dow_cos
0 2021-05-01 00:00:00 0 0.919966 0.457696 -0.974928 -0.222521
1 2021-05-01 04:00:00 0 0.731663 0.014716 -0.974928 -0.222521
2 2021-05-01 08:00:00 0 0.663514 0.439459 -0.974928 -0.222521
3 2021-05-01 12:00:00 0 0.662052 0.106350 -0.974928 -0.222521
4 2021-05-01 16:00:00 0 0.358776 0.164377 -0.974928 -0.222521
5 2021-05-01 20:00:00 0 0.908822 0.877258 -0.974928 -0.222521
6 2021-05-01 00:00:00 1 0.932823 0.294688 -0.974928 -0.222521
7 2021-05-01 04:00:00 1 0.786879 0.063408 -0.974928 -0.222521
8 2021-05-01 08:00:00 1 0.136083 0.616008 -0.974928 -0.222521
9 2021-05-01 12:00:00 1 0.015671 0.223310 -0.974928 -0.222521
10 2021-05-01 16:00:00 1 0.233234 0.706419 -0.974928 -0.222521
11 2021-05-01 20:00:00 1 0.013097 0.901210 -0.974928 -0.222521
{% endraw %} {% raw %}

forward_gaps[source]

forward_gaps(o, nan_to_num=0, normalize=True)

Number of sequence steps since previous real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

backward_gaps[source]

backward_gaps(o, nan_to_num=0, normalize=True)

Number of sequence steps to next real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

nearest_gaps[source]

nearest_gaps(o, nan_to_num=0, normalize=True)

Number of sequence steps to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}

get_gaps[source]

get_gaps(o:Tensor, nan_to_num:int=0, forward:bool=True, backward:bool=True, nearest:bool=True, normalize:bool=True)

Number of sequence steps from previous, to next and/or to nearest real value along the last dimension of 3D arrays or tensors

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Random (1, 2, 8) tensor; values below .6 are replaced with NaN to create gaps.
t = torch.rand(1, 2, 8)
# NOTE(review): t.numpy() presumably shares memory with t, so the NaN assignment on the next
# line would also be visible in arr — confirm this is intended.
arr = t.numpy()
t[t <.6] = np.nan
# The smallest nearest-gap value must lie in [0, 1] for both tensor and ndarray inputs.
test_ge(nearest_gaps(t).min().item(), 0)
test_ge(nearest_gaps(arr).min(), 0)
test_le(nearest_gaps(t).min().item(), 1)
test_le(nearest_gaps(arr).min(), 1)
# Gap outputs must not contain NaNs.
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
# With the default flags, get_gaps returns 6 channels for the 2 input variables
# (presumably forward, backward and nearest gaps for each variable).
ag = get_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)
{% endraw %} {% raw %}

add_delta_timestamp_cols[source]

add_delta_timestamp_cols(df, cols=None, groupby=None, forward=True, backward=True, nearest=True, nan_to_num=0, normalize=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Daily frame (2021-05-01 .. 2021-05-07) with a single random feature column.
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 2))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))

cols = ['date', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float})
# Blank out rows 1, 3 and 4 so the feature has one single-step gap and one two-step gap.
date_df.loc[[1,3,4],'feature1'] = np.nan
{% endraw %} {% raw %}
# Expected gap values are step counts scaled by 1/len(dates); every row defaults to one step.
normalize = 1/len(dates)
expected_output_df = date_df.copy()
# Steps counted from the previous real value for the NaN rows 1, 3 and 4.
expected_output_df['feature1_dt_fwd'] = normalize
expected_output_df.loc[[1,3,4], 'feature1_dt_fwd'] = np.array([2,2,3]) * normalize

# Steps counted to the next real value for the same rows.
expected_output_df['feature1_dt_bwd'] = normalize
expected_output_df.loc[[1,3,4], 'feature1_dt_bwd'] = np.array([2,3,2]) * normalize

# Steps to the nearest real value (the smaller of the forward and backward counts).
expected_output_df['feature1_dt_nearest'] = normalize
expected_output_df.loc[[1,3,4], 'feature1_dt_nearest'] =np.array([2,2,2]) * normalize

display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1')
test_eq(expected_output_df, output_df)
date feature1 feature1_dt_fwd feature1_dt_bwd feature1_dt_nearest
0 2021-05-01 0.271697 0.142857 0.142857 0.142857
1 2021-05-02 NaN 0.285714 0.285714 0.285714
2 2021-05-03 0.545554 0.142857 0.142857 0.142857
3 2021-05-04 NaN 0.285714 0.428571 0.285714
4 2021-05-05 NaN 0.428571 0.285714 0.285714
5 2021-05-06 0.289601 0.142857 0.142857 0.142857
6 2021-05-07 0.410591 0.142857 0.142857 0.142857
{% endraw %} {% raw %}
# Same daily range duplicated for two groups: the first half of the rows has id 0, the second half id 1.
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))

cols = ['date', 'id', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float})
# Blank out three rows in each group to create gaps.
date_df.loc[[1,3,4,8,9,11],'feature1'] = np.nan
{% endraw %} {% raw %}
# dates holds two concatenated copies of the range (one per id), so 2/len(dates) is 1 over the
# number of steps in a single group.
normalize = 2/len(dates)
expected_output_df = date_df.copy()
# Steps counted from the previous real value for the NaN rows, computed within each id.
expected_output_df['feature1_dt_fwd'] = normalize
expected_output_df.loc[[1,3,4,8,9,11], 'feature1_dt_fwd'] = np.array([2,2,3,2,3,2]) * normalize

# Steps counted to the next real value for the same rows.
expected_output_df['feature1_dt_bwd'] = normalize
expected_output_df.loc[[1,3,4,8,9,11], 'feature1_dt_bwd'] = np.array([2,3,2,3,2,2]) * normalize

# Steps to the nearest real value (the smaller of the forward and backward counts).
expected_output_df['feature1_dt_nearest'] = normalize
expected_output_df.loc[[1,3,4,8,9,11], 'feature1_dt_nearest'] =np.array([2,2,2,2,2,2]) * normalize

display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1', groupby='id')
test_eq(expected_output_df, output_df)
date id feature1 feature1_dt_fwd feature1_dt_bwd feature1_dt_nearest
0 2021-05-01 0 0.679742 0.142857 0.142857 0.142857
1 2021-05-02 0 NaN 0.285714 0.285714 0.285714
2 2021-05-03 0 0.781253 0.142857 0.142857 0.142857
3 2021-05-04 0 NaN 0.285714 0.428571 0.285714
4 2021-05-05 0 NaN 0.428571 0.285714 0.285714
5 2021-05-06 0 0.484430 0.142857 0.142857 0.142857
6 2021-05-07 0 0.759332 0.142857 0.142857 0.142857
7 2021-05-01 1 0.859231 0.142857 0.142857 0.142857
8 2021-05-02 1 NaN 0.285714 0.428571 0.285714
9 2021-05-03 1 NaN 0.428571 0.285714 0.285714
10 2021-05-04 1 0.167406 0.142857 0.142857 0.142857
11 2021-05-05 1 NaN 0.285714 0.285714 0.285714
12 2021-05-06 1 0.279209 0.142857 0.142857 0.142857
13 2021-05-07 1 0.507666 0.142857 0.142857 0.142857
{% endraw %} {% raw %}

SlidingWindow[source]

SlidingWindow(window_len:int, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True)

Applies a sliding window to a 1d or 2d input (np.ndarray, torch.Tensor or pd.DataFrame) Args: window_len = length of lookback window stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows to pad remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict:

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of |n| past steps (n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y.
                    [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y

Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X
input shape: (13, 3)
array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [ 0.,  0.,  0.,  1.,  1.]]])
{% endraw %} {% raw %}
wl = 5
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl)(t)
test_eq(X.shape[1:], (1, wl))
itemify(X,)
input shape: (10,)
(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]
{% endraw %} {% raw %}
wl = 5
h = 1

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9)]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
{% endraw %} {% raw %}
wl = 5

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=3, horizon=1, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[3, 4, 5, 6, 7]]), 8)]
{% endraw %} {% raw %}
wl = 5
start = 3

t = np.arange(20)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=None, horizon=1, start=start)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), 8), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[13, 14, 15, 16, 17]]), 18)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=1, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
var
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9), (array([[5, 6, 7, 8, 9]]), 10), (array([[ 6,  7,  8,  9, 10]]), 11), (array([[ 7,  8,  9, 10, 11]]), 12), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[ 9, 10, 11, 12, 13]]), 14), (array([[10, 11, 12, 13, 14]]), 15), (array([[11, 12, 13, 14, 15]]), 16), (array([[12, 13, 14, 15, 16]]), 17), (array([[13, 14, 15, 16, 17]]), 18), (array([[14, 15, 16, 17, 18]]), 19)]
{% endraw %} {% raw %}
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var']).T
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None, seq_first=False)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
var 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 tensor(0) tensor(0) tensor(0)
1 tensor(1) tensor(10) tensor(100)
2 tensor(2) tensor(20) tensor(200)
3 tensor(3) tensor(30) tensor(300)
4 tensor(4) tensor(40) tensor(400)
5 tensor(5) tensor(50) tensor(500)
6 tensor(6) tensor(60) tensor(600)
7 tensor(7) tensor(70) tensor(700)
8 tensor(8) tensor(80) tensor(800)
9 tensor(9) tensor(90) tensor(900)
[(array([[tensor(0), tensor(1), tensor(2), tensor(3), tensor(4)],
       [tensor(0), tensor(10), tensor(20), tensor(30), tensor(40)],
       [tensor(0), tensor(100), tensor(200), tensor(300), tensor(400)]],
      dtype=object), array([tensor(5), tensor(50), tensor(500)], dtype=object)), (array([[tensor(1), tensor(2), tensor(3), tensor(4), tensor(5)],
       [tensor(10), tensor(20), tensor(30), tensor(40), tensor(50)],
       [tensor(100), tensor(200), tensor(300), tensor(400), tensor(500)]],
      dtype=object), array([tensor(6), tensor(60), tensor(600)], dtype=object)), (array([[tensor(2), tensor(3), tensor(4), tensor(5), tensor(6)],
       [tensor(20), tensor(30), tensor(40), tensor(50), tensor(60)],
       [tensor(200), tensor(300), tensor(400), tensor(500), tensor(600)]],
      dtype=object), array([tensor(7), tensor(70), tensor(700)], dtype=object)), (array([[tensor(3), tensor(4), tensor(5), tensor(6), tensor(7)],
       [tensor(30), tensor(40), tensor(50), tensor(60), tensor(70)],
       [tensor(300), tensor(400), tensor(500), tensor(600), tensor(700)]],
      dtype=object), array([tensor(8), tensor(80), tensor(800)], dtype=object)), (array([[tensor(4), tensor(5), tensor(6), tensor(7), tensor(8)],
       [tensor(40), tensor(50), tensor(60), tensor(70), tensor(80)],
       [tensor(400), tensor(500), tensor(600), tensor(700), tensor(800)]],
      dtype=object), array([tensor(9), tensor(90), tensor(900)], dtype=object))]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1, get_y="var_0")(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
var_0 var_1 var_2
0 tensor(0) tensor(0) tensor(0)
1 tensor(1) tensor(10) tensor(100)
2 tensor(2) tensor(20) tensor(200)
3 tensor(3) tensor(30) tensor(300)
4 tensor(4) tensor(40) tensor(400)
5 tensor(5) tensor(50) tensor(500)
6 tensor(6) tensor(60) tensor(600)
7 tensor(7) tensor(70) tensor(700)
8 tensor(8) tensor(80) tensor(800)
9 tensor(9) tensor(90) tensor(900)
[(array([[tensor(0), tensor(1), tensor(2), tensor(3), tensor(4)],
       [tensor(0), tensor(10), tensor(20), tensor(30), tensor(40)],
       [tensor(0), tensor(100), tensor(200), tensor(300), tensor(400)]],
      dtype=object), tensor(5)), (array([[tensor(1), tensor(2), tensor(3), tensor(4), tensor(5)],
       [tensor(10), tensor(20), tensor(30), tensor(40), tensor(50)],
       [tensor(100), tensor(200), tensor(300), tensor(400), tensor(500)]],
      dtype=object), tensor(6)), (array([[tensor(2), tensor(3), tensor(4), tensor(5), tensor(6)],
       [tensor(20), tensor(30), tensor(40), tensor(50), tensor(60)],
       [tensor(200), tensor(300), tensor(400), tensor(500), tensor(600)]],
      dtype=object), tensor(7)), (array([[tensor(3), tensor(4), tensor(5), tensor(6), tensor(7)],
       [tensor(30), tensor(40), tensor(50), tensor(60), tensor(70)],
       [tensor(300), tensor(400), tensor(500), tensor(600), tensor(700)]],
      dtype=object), tensor(8)), (array([[tensor(4), tensor(5), tensor(6), tensor(7), tensor(8)],
       [tensor(40), tensor(50), tensor(60), tensor(70), tensor(80)],
       [tensor(400), tensor(500), tensor(600), tensor(700), tensor(800)]],
      dtype=object), tensor(9))]
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())
input shape: torch.Size([10, 3])
var_0 var_1 target
0 tensor(0) tensor(0) tensor(0)
1 tensor(1) tensor(10) tensor(100)
2 tensor(2) tensor(20) tensor(200)
3 tensor(3) tensor(30) tensor(300)
4 tensor(4) tensor(40) tensor(400)
5 tensor(5) tensor(50) tensor(500)
6 tensor(6) tensor(60) tensor(600)
7 tensor(7) tensor(70) tensor(700)
8 tensor(8) tensor(80) tensor(800)
9 tensor(9) tensor(90) tensor(900)
[(array([[tensor(0), tensor(1), tensor(2), tensor(3), tensor(4)],
       [tensor(0), tensor(10), tensor(20), tensor(30), tensor(40)]],
      dtype=object), tensor(500)), (array([[tensor(1), tensor(2), tensor(3), tensor(4), tensor(5)],
       [tensor(10), tensor(20), tensor(30), tensor(40), tensor(50)]],
      dtype=object), tensor(600)), (array([[tensor(2), tensor(3), tensor(4), tensor(5), tensor(6)],
       [tensor(20), tensor(30), tensor(40), tensor(50), tensor(60)]],
      dtype=object), tensor(700)), (array([[tensor(3), tensor(4), tensor(5), tensor(6), tensor(7)],
       [tensor(30), tensor(40), tensor(50), tensor(60), tensor(70)]],
      dtype=object), tensor(800)), (array([[tensor(4), tensor(5), tensor(6), tensor(7), tensor(8)],
       [tensor(40), tensor(50), tensor(60), tensor(70), tensor(80)]],
      dtype=object), tensor(900))]
{% endraw %} {% raw %}
# Multivariate ndarray input: the first two columns are the features, the third is the target.
# `wl` is defined here so the cell does not rely on a value left over from a previous cell,
# and the same value drives both the window length and the shape assertion.
wl = 5
n_vars = 3

t = (np.random.rand(1000, n_vars) - .5).cumsum(0)
print(t.shape)
plt.plot(t)
plt.show()
# stride=None -> non-overlapping windows; horizon=0 -> y is taken at the last step of each window.
X, y = SlidingWindow(wl, stride=None, horizon=0, get_x=[0,1], get_y=2)(t)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(1000, 3)
(200, 2, 5) (200,)
{% endraw %} {% raw %}
wl = 5
n_vars = 3

t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 3)
var_0 var_1 target
0 0.425384 0.389975 0.358791
1 0.405318 0.217295 0.763139
2 0.726061 0.010626 0.288258
3 0.642027 -0.207585 0.326436
4 0.517615 -0.571143 0.722700
... ... ... ...
95 0.143566 0.816213 3.074438
96 0.130509 0.450261 2.875360
97 0.106960 0.829813 2.762571
98 0.298087 1.280414 2.541637
99 0.747045 1.379080 2.216011

100 rows × 3 columns

(96, 2, 5) (96,)
{% endraw %} {% raw %}
# DataFrame input with one row per time step (seq_first=True) and a named target column.
# `wl` is defined here so the cell does not rely on a value left over from a previous cell,
# and the same value drives both the window length and the shape assertion.
wl = 5
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
# stride=1 -> overlapping windows; horizon=0 -> y is taken at the last step of each window.
X, y = SlidingWindow(wl, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
var_0 var_1 var_2 var_3 target
0 0.113306 0.406153 0.450763 -0.163750 0.134610
1 -0.114101 0.502851 0.020611 0.094270 0.217379
2 -0.159095 0.812339 0.115610 0.506805 -0.059258
3 0.111266 0.926737 -0.036331 0.470563 0.182893
4 0.136658 0.449927 -0.488201 0.164951 -0.155879
... ... ... ... ... ...
95 -0.724871 4.663383 8.507840 0.046618 0.942307
96 -0.810676 4.758601 8.783791 -0.199609 1.144218
97 -1.251288 4.769096 9.184236 -0.549223 1.637890
98 -1.131325 4.836805 9.016433 -0.297081 2.054334
99 -0.999644 4.378000 8.748841 -0.183009 1.609790

100 rows × 5 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
# Transposed DataFrame input: one row per variable, one column per time step (seq_first=False).
# `wl` is defined here so the cell does not rely on a value left over from a previous cell,
# and the same value drives both the window length and the shape assertion.
wl = 5
seq_len = 100
n_vars = 5

t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
# stride=1 -> overlapping windows; horizon=0 -> y is taken at the last step of each window.
X, y = SlidingWindow(wl, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 0.481800 0.767668 1.266852 1.167165 1.325884 1.353266 0.950254 1.121376 0.764226 1.083609 ... 1.889931 1.915581 2.035298 1.935537 1.514511 1.955949 1.751927 2.176073 2.086139 1.962423
var_1 -0.003012 0.438552 0.033087 -0.216950 0.206385 -0.160877 -0.461829 -0.624911 -0.715439 -0.777954 ... -1.261866 -0.867804 -0.794762 -1.260260 -1.623188 -1.612379 -1.408408 -1.636654 -1.572340 -1.126314
var_2 0.310297 0.707496 0.717641 0.329570 -0.110760 -0.503635 -0.013135 -0.125323 0.341515 0.697972 ... -2.083062 -2.176805 -1.860679 -2.009890 -1.982530 -2.160977 -2.409006 -2.454099 -2.499715 -2.454046
var_3 0.031364 -0.180028 -0.054971 -0.197904 -0.446342 -0.275233 -0.646426 -0.844938 -0.670323 -0.187199 ... 1.740708 1.381085 1.139631 1.602170 1.754529 1.352056 1.828581 2.255023 2.506743 2.027922
target 0.051760 0.220619 0.492616 0.972704 0.503141 0.860178 0.809317 0.469393 0.340585 0.601179 ... 2.012693 2.376822 2.810545 2.547291 2.558243 2.825866 2.421291 2.830484 2.454155 2.788130

5 rows × 100 columns

(96, 4, 5) (96,)
{% endraw %} {% raw %}
# Transposed DataFrame input (seq_first=False) split into non-overlapping windows.
# `wl` is defined here so the cell does not rely on a value left over from a previous cell,
# and the same value drives both the window length and the shape assertion.
wl = 5
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
# stride=None -> stride equals the window length (no overlap); horizon=0 -> y at the last step.
X, y = SlidingWindow(wl, stride=None, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
var_0 0.445158 0.676725 1.008301 1.488958 1.572959 1.232868 1.606676 1.780644 1.866027 1.861405 ... 0.547313 0.942989 0.550900 0.947722 1.387354 1.214875 1.040467 1.153282 1.387640 1.118774
var_1 0.142383 0.152040 -0.201064 -0.642870 -0.154133 -0.428361 -0.811126 -0.757622 -0.367572 -0.789568 ... -2.023777 -1.995496 -2.235367 -2.039860 -1.994653 -2.324249 -2.254664 -2.373856 -2.735245 -2.241558
var_2 -0.262617 0.178644 0.288476 0.736092 0.714647 0.656701 0.942396 1.187296 1.088446 0.736049 ... -2.102508 -1.946543 -1.561333 -1.543616 -1.740010 -1.875933 -2.116537 -1.997565 -2.491888 -2.084733
var_3 0.124509 -0.279794 -0.276689 0.037345 -0.322787 -0.745140 -1.077558 -0.889892 -0.631147 -1.066690 ... 3.594737 3.510633 3.097920 3.201749 2.937030 3.416820 3.870089 3.857518 3.911162 3.537754
target -0.193091 0.049086 -0.327358 0.019210 -0.219993 0.195364 -0.132089 0.204538 0.525334 0.667191 ... -0.498714 -0.228211 -0.429020 -0.045872 -0.449371 -0.316423 -0.366599 -0.517504 -0.983991 -1.257910

5 rows × 100 columns

(20, 4, 5) (20,)
{% endraw %} {% raw %}
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
splits = TrainValidTestSplitter(valid_size=.2, shuffle=False)(y)
X.shape, y.shape, splits
(100, 5)
var_0 var_1 var_2 var_3 target
0 -0.476062 -0.441021 -0.152402 0.492305 0.324888
1 -0.963942 -0.681256 -0.400798 0.714470 0.069261
2 -0.936488 -0.298446 -0.552363 0.866561 -0.013964
3 -0.687954 -0.197495 -0.168550 0.746651 0.141208
4 -0.217864 -0.357602 0.136323 0.675570 0.115994
... ... ... ... ... ...
95 -2.085664 -0.675079 0.149284 -2.369042 2.378206
96 -2.578443 -0.454039 -0.285605 -2.262007 2.203203
97 -2.948181 -0.754311 0.190622 -2.087817 2.120880
98 -3.420887 -1.191640 0.435389 -2.227205 2.287877
99 -3.134166 -1.024706 0.242647 -1.823440 2.229724

100 rows × 5 columns

((96, 4, 5),
 (96,),
 ((#77) [0,1,2,3,4,5,6,7,8,9...], (#19) [77,78,79,80,81,82,83,84,85,86...]))
{% endraw %} {% raw %}
data = np.concatenate([np.linspace(0, 1, 11).reshape(-1,1).repeat(2, 1), np.arange(11).reshape(-1,1)], -1)
df_test = pd.DataFrame(data, columns=['col1', 'col2', 'target'])
df_test['target'] = df_test['target'].astype(int)
df_test
col1 col2 target
0 0.0 0.0 0
1 0.1 0.1 1
2 0.2 0.2 2
3 0.3 0.3 3
4 0.4 0.4 4
5 0.5 0.5 5
6 0.6 0.6 6
7 0.7 0.7 7
8 0.8 0.8 8
9 0.9 0.9 9
10 1.0 1.0 10
{% endraw %} {% raw %}
def _y_func(o):
    "Keep only the first step (index 0 along axis 1) of every y sub-window in `o`."
    first_step = o[:, 0]
    return first_step
{% endraw %} {% raw %}
# Non-overlapping windows (2nd positional arg is stride=None) with a padded remainder,
# checked for window lengths 1..19, including lengths longer than df_test itself.
for wl in np.arange(1, 20):
    # horizon=-wl takes y from past steps; _y_func then keeps the first step of each y sub-window.
    x, y = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=_y_func)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/wl))  # the padded remainder counts as one extra window
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[1], 3)  # col1, col2 and one extra channel (add_padding_feature defaults to True)
    test_eq(x.shape[2], wl)
    test_close(x[:, 0, 0]*10, y)  # in df_test target == col1 * 10, so the first step of x must match y
{% endraw %} {% raw %}
# Same padded, non-overlapping setup as with a y_func, but y is returned as-is (y_func=None);
# only the shapes are checked.
for wl in np.arange(1, 20):
    x, y = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/ wl))  # the padded remainder counts as one extra window
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[1], 3)  # col1, col2 and one extra channel (add_padding_feature defaults to True)
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
# Without padding, the incomplete remainder is dropped, so only full windows are expected.
# Window lengths are limited to len(df_test) here.
for wl in np.arange(1, len(df_test)+1):
    x, y = SlidingWindow(wl, None, pad_remainder=False, get_x=['col1', 'col2'], get_y=['target'], horizon=-wl, y_func=None)(df_test)
    test_eq(x.shape[0], len(df_test) // wl)  # floor division: only complete windows
    test_eq(x.shape[0], y.shape[0])
    test_eq(x.shape[1], 2)  # only col1 and col2; no extra padding channel in this configuration
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
# Unlabeled data (get_y=[]) with a padded remainder: the second return value is ignored.
for wl in np.arange(1, 20):
    x, _ = SlidingWindow(wl, None, pad_remainder=True, get_x=['col1', 'col2'], get_y=[])(df_test)
    test_eq(x.shape[0], math.ceil((len(df_test))/wl))  # the padded remainder counts as one extra window
    test_eq(x.shape[1], 3)  # col1, col2 and one extra channel (add_padding_feature defaults to True)
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}
# Unlabeled data (get_y=[]) without padding and with an explicit stride equal to the window length.
for wl in np.arange(2, len(df_test)):
    x, _ = SlidingWindow(wl, wl, pad_remainder=False, get_x=['col1', 'col2'], get_y=[])(df_test)
    test_eq(x.shape[0], len(df_test) // wl)  # floor division: only complete windows
    test_eq(x.shape[1], 2)  # only col1 and col2; no extra padding channel in this configuration
    test_eq(x.shape[2], wl)
{% endraw %} {% raw %}

SlidingWindowPanel[source]

SlidingWindowPanel(window_len:int, unique_id_cols:list, stride:Union[NoneType, int]=1, start:int=0, pad_remainder:bool=False, padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType, int, list]=None, get_y:Union[NoneType, int, list]=None, y_func:Optional[callable]=None, copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True, return_key:bool=False, verbose:bool=True)

Applies a sliding window to a pd.DataFrame.

Args: window_len = length of lookback window unique_id_cols = pd.DataFrame columns that will be used to identify a time series for each entity. stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows to pad remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict:

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of |n| past steps (n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
copy                = copy the original object to avoid changes in it.
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y
return_key          = when True, the key corresponding to unique_id_cols for each sample is returned
verbose             = controls verbosity. True or 1 displays a progress bar. 2 or more also shows the records that cannot be created due to their length.


Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Build a shuffled panel dataframe holding three series:
#   * device 0 / region 'A' with `samples` rows,
#   * device 1 / region 'A' with only 4 rows (shorter than the window, so it yields no samples),
#   * device 2 / region 'B' with `samples` rows.
samples = 100_000
wl = 5
n_vars = 10

t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
df2 = df.copy()
df3 = df.copy()
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
# .copy() makes df2 an independent frame, so the column assignment below does not
# write into a slice of another dataframe (SettingWithCopyWarning).
df2 = df2.loc[:3].copy()
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row order (df, df2, df3).
df = pd.concat([df, df2, df3]).reset_index(drop=True)
df['index'] = np.arange(len(df))
# Shuffle the rows so the later sort_by=['time'] is actually exercised.
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 time device target region index
0 tensor(1287) tensor(12870) tensor(128700) tensor(1287000) tensor(12870000) tensor(128700000) tensor(1287000000) tensor(12870000000) tensor(128700000000) tensor(1287000000000) 1287 0 1 A 1287
1 tensor(2247) tensor(22470) tensor(224700) tensor(2247000) tensor(22470000) tensor(224700000) tensor(2247000000) tensor(22470000000) tensor(224700000000) tensor(2247000000000) 2247 0 0 A 2247
2 tensor(71422) tensor(714220) tensor(7142200) tensor(71422000) tensor(714220000) tensor(7142200000) tensor(71422000000) tensor(714220000000) tensor(7142200000000) tensor(71422000000000) 71422 0 1 A 71422
3 tensor(71726) tensor(717242) tensor(7172402) tensor(71724000) tensor(717240000) tensor(7172400000) tensor(71724000000) tensor(717240000000) tensor(7172400000000) tensor(71724000000000) 71724 2 2 B 171728
4 tensor(74436) tensor(744360) tensor(7443600) tensor(74436000) tensor(744360000) tensor(7443600000) tensor(74436000000) tensor(744360000000) tensor(7443600000000) tensor(74436000000000) 74436 0 1 A 74436
(200004, 15)
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=False)(df)
X.shape, y.shape
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
X, y, key = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                               horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=True)(df)
X.shape, y.shape, key.shape
((199992, 10, 5), (199992,), (199992,))
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
((199992, 10, 5), (199992,))
{% endraw %} {% raw %}
def y_max(o):
    "Collapse every y sub-window in `o` to its maximum, reducing over axis=1."
    window_max = np.max(o, axis=1)
    return window_max
{% endraw %} {% raw %}
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          y_func=y_max, horizon=5, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
((199982, 10, 5), (199982,))
{% endraw %} {% raw %}

identify_padding[source]

identify_padding(float_mask, value=-1)

Identifies padded subsequences in a mask of type float

This function identifies as padded subsequences those where all values == nan from the end of the sequence (last dimension) across all channels, and sets those values to the selected value (default = -1)

Args: float_mask: boolean or float mask. value: scalar that will be used to identify padded subsequences.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, _ = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X = tensor(X)
X[0, 1, -2:] = np.nan
X[1,..., :3] = np.nan
print(X)
identify_padding(torch.isnan(X).float())
input shape: (13, 3)
tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  1.,  2., nan, nan],
         [ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  0.,  0.,  0.,  0.]],

        [[nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  0.,  0.]],

        [[10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [ 0.,  0.,  0.,  1.,  1.]]])
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.]],

        [[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.]]])
{% endraw %}