---
title: Data preprocessing
keywords: fastai
sidebar: home_sidebar
summary: "Functions used to preprocess time series (both X and y)."
description: "Functions used to preprocess time series (both X and y)."
nb_path: "nbs/016_data.preprocessing.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
from tsai.data.external import get_UCR_data
dsid = 'NATOPS'
X, y, splits = get_UCR_data(dsid, return_split=False)
tfms = [None, Categorize()]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
{% endraw %} {% raw %}

class ToNumpyCategory[source]

ToNumpyCategory(**kwargs) :: Transform

Categorize a numpy batch

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = ToNumpyCategory()
y_cat = t(y)
y_cat[:10]
array([3, 2, 2, 3, 2, 4, 0, 5, 2, 1])
{% endraw %} {% raw %}
test_eq(t.decode(tensor(y_cat)), y)
test_eq(t.decode(np.array(y_cat)), y)
{% endraw %} {% raw %}

class OneHot[source]

OneHot(n_classes=None, **kwargs) :: Transform

One-hot encode/decode a batch

{% endraw %} {% raw %}
{% endraw %} {% raw %}
oh_encoder = OneHot()
y_cat = ToNumpyCategory()(y)
oht = oh_encoder(y_cat)
oht[:10]
array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])
{% endraw %} {% raw %}
n_classes = 10
n_samples = 100

t = torch.randint(0, n_classes, (n_samples,))
oh_encoder = OneHot()
oht = oh_encoder(t)
test_eq(oht.shape, (n_samples, n_classes))
test_eq(torch.argmax(oht, dim=-1), t)
test_eq(oh_encoder.decode(oht), t)
{% endraw %} {% raw %}
n_classes = 10
n_samples = 100

a = np.random.randint(0, n_classes, (n_samples,))
oh_encoder = OneHot()
oha = oh_encoder(a)
test_eq(oha.shape, (n_samples, n_classes))
test_eq(np.argmax(oha, axis=-1), a)
test_eq(oh_encoder.decode(oha), a)
{% endraw %} {% raw %}

class TSNan2Value[source]

TSNan2Value(value=0, median=False, by_sample_and_var=True, sel_vars=None) :: Transform

Replaces any nan values with a predefined value or the median

{% endraw %} {% raw %}
{% endraw %} {% raw %}
o = TSTensor(torch.randn(16, 10, 100))
o[0,0] = float('nan')
o[o > .9] = float('nan')
o[[0,1,5,8,14,15], :, -20:] = float('nan')
nan_vals1 = torch.isnan(o).sum()
o2 = Pipeline(TSNan2Value(), split_idx=0)(o.clone())
o3 = Pipeline(TSNan2Value(median=True, by_sample_and_var=True), split_idx=0)(o.clone())
o4 = Pipeline(TSNan2Value(median=True, by_sample_and_var=False), split_idx=0)(o.clone())
nan_vals2 = torch.isnan(o2).sum()
nan_vals3 = torch.isnan(o3).sum()
nan_vals4 = torch.isnan(o4).sum()
test_ne(nan_vals1, 0)
test_eq(nan_vals2, 0)
test_eq(nan_vals3, 0)
test_eq(nan_vals4, 0)
{% endraw %} {% raw %}
o = TSTensor(torch.randn(16, 10, 100))
o[o > .9] = float('nan')
o = TSNan2Value(median=True, sel_vars=[0,1,2,3,4])(o)
test_eq(torch.isnan(o[:, [0,1,2,3,4]]).sum().item(), 0)
{% endraw %} {% raw %}
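The value argument can also be used to fill nans with a fixed number instead of the median; a minimal sketch along the lines of the example above:

o = TSTensor(torch.randn(16, 10, 100))
o[o > .9] = float('nan')
# fill every nan with a fixed sentinel value rather than the median
o = TSNan2Value(value=-1)(o)
test_eq(torch.isnan(o).sum().item(), 0)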

class TSStandardize[source]

TSStandardize(mean=None, std=None, by_sample=False, by_var=False, by_step=False, eps=1e-08, use_single_batch=True, verbose=False, **kwargs) :: Transform

Standardizes batch of type TSTensor

Args:

- mean: you can pass a precalculated mean as a torch tensor, which will be used directly, or leave it as None, in which case
    it will be estimated using a batch.
- std: you can pass a precalculated std as a torch tensor, which will be used directly, or leave it as None, in which case
    it will be estimated using a batch. If both mean and std are passed when instantiating TSStandardize, the rest of the arguments won't be used.
- by_sample: if True, mean and std will be calculated for each individual sample. Otherwise they are based on the entire batch.
- by_var:
    * False: the same mean and std will be used for all variables.
    * True: a different mean and std will be used for each variable.
    * a list of ints (like [0,1,3]): a different mean and std will be set for each variable in the list. Variables not included in the list
    won't be standardized.
    * a list that contains lists (like [0, [1,3]]): a different mean and std will be set for each element of the list. If multiple variables are
    included in a sublist, those variables will share the same mean and std (in the example, one mean and std is determined for
    variable 0, and another, shared one for variables 1 & 3). Variables not included in the list won't be standardized. See the sketch below.
- by_step: if True, mean and std will be calculated for each time step. Otherwise they are shared across all steps.
- eps: a small value added to the std to avoid dividing by zero.
- use_single_batch: if True, a single training batch will be used to calculate mean & std. Otherwise the entire training set will be used.
{% endraw %} {% raw %}
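The grouped by_var behavior can be sketched as follows, reusing the NATOPS dsets created at the top of this page and mirroring the TSNormalize example further below (the variable indices are arbitrary, and the checks are approximate because the stats are estimated from a single batch):

batch_tfms = [TSStandardize(by_var=[0, [1, 2]])]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
# variables 0, 1 and 2 are standardized (1 & 2 share one mean/std); the rest are left unchanged
test_close(xb[:, [0, 1, 2]].mean(), 0, eps=1e-1)
test_close(xb[:, [0, 1, 2]].std(), 1, eps=1e-1)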
{% endraw %} {% raw %}
batch_tfms=[TSStandardize(by_sample=True, by_var=False, verbose=True)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, batch_tfms=batch_tfms)
xb, yb = next(iter(dls.train))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
{% endraw %} {% raw %}
from tsai.data.validation import TimeSplitter
X_nan = np.random.rand(100, 5, 10)
idxs = np.random.choice(len(X_nan), int(len(X_nan)*.5), False)
X_nan[idxs, 0] = float('nan')
idxs = np.random.choice(len(X_nan), int(len(X_nan)*.5), False)
X_nan[idxs, 1, -10:] = float('nan')
batch_tfms = TSStandardize(by_var=True)
dls = get_ts_dls(X_nan, batch_tfms=batch_tfms, splits=TimeSplitter(show_plot=False)(range_of(X_nan)))
test_eq(torch.isnan(dls.after_batch[0].mean).sum(), 0)
test_eq(torch.isnan(dls.after_batch[0].std).sum(), 0)
xb = first(dls.train)[0]
test_ne(torch.isnan(xb).sum(), 0)
test_ne(torch.isnan(xb).sum(), torch.isnan(xb).numel())
batch_tfms = [TSStandardize(by_var=True), TSNan2Value()]
dls = get_ts_dls(X_nan, batch_tfms=batch_tfms, splits=TimeSplitter(show_plot=False)(range_of(X_nan)))
xb = first(dls.train)[0]
test_eq(torch.isnan(xb).sum(), 0)
{% endraw %} {% raw %}
batch_tfms=[TSStandardize(by_sample=True, by_var=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = next(iter(dls.valid))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
{% endraw %} {% raw %}
tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True)
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128], inplace=True)
xb, yb = dls.train.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = dls.valid.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
{% endraw %} {% raw %}
tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True, by_var=False, verbose=False)
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128], inplace=False)
xb, yb = dls.train.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = dls.valid.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
{% endraw %} {% raw %}

Tensor.mul_min[source]

Tensor.mul_min(x:(Tensor, TSTensor, NumpyTensor), axes=(), keepdim=False)

{% endraw %} {% raw %}

TSTensor.mul_min[source]

TSTensor.mul_min(x:(Tensor, TSTensor, NumpyTensor), axes=(), keepdim=False)

{% endraw %} {% raw %}

NumpyTensor.mul_min[source]

NumpyTensor.mul_min(x:(Tensor, TSTensor, NumpyTensor), axes=(), keepdim=False)

{% endraw %} {% raw %}

Tensor.mul_max[source]

Tensor.mul_max(x:(Tensor, TSTensor, NumpyTensor), axes=(), keepdim=False)

{% endraw %} {% raw %}

TSTensor.mul_max[source]

TSTensor.mul_max(x:(Tensor, TSTensor, NumpyTensor), axes=(), keepdim=False)

{% endraw %} {% raw %}

NumpyTensor.mul_max[source]

NumpyTensor.mul_max(x:(Tensor, TSTensor, NumpyTensor), axes=(), keepdim=False)

{% endraw %} {% raw %}
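mul_min and mul_max are not exercised elsewhere on this page. The short sketch below assumes, based on their signatures, that they reduce a tensor over several axes in a single call (a multi-axis min/max, which is what TSNormalize needs to compute per-variable ranges):

t = TSTensor(torch.randn(8, 3, 50))
# assumed behavior: min/max reduction over multiple axes at once
test_eq(t.mul_min(axes=(0, 2), keepdim=True).shape, (1, 3, 1))
test_eq(t.mul_max(axes=(0, 2)).shape, (3,))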

class TSNormalize[source]

TSNormalize(min=None, max=None, range=(-1, 1), by_sample=False, by_var=False, by_step=False, clip_values=True, use_single_batch=True, verbose=False, **kwargs) :: Transform

Normalizes batch of type TSTensor

{% endraw %} {% raw %}
{% endraw %} {% raw %}
batch_tfms = [TSNormalize()]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
{% endraw %} {% raw %}
batch_tfms=[TSNormalize(by_sample=True, by_var=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
{% endraw %} {% raw %}
batch_tfms = [TSNormalize(by_var=[0, [1, 2]], use_single_batch=False, clip_values=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb[:, [0, 1, 2]].max() <= 1
assert xb[:, [0, 1, 2]].min() >= -1
{% endraw %} {% raw %}

class TSClipOutliers[source]

TSClipOutliers(min=None, max=None, by_sample=False, by_var=False, use_single_batch=False, verbose=False, **kwargs) :: Transform

Clips outliers in a batch of type TSTensor based on the IQR

{% endraw %} {% raw %}
{% endraw %} {% raw %}
batch_tfms=[TSClipOutliers(-1, 1, verbose=True)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
test_close(xb.min(), -1, eps=1e-1)
test_close(xb.max(), 1, eps=1e-1)
xb, yb = next(iter(dls.valid))
test_close(xb.min(), -1, eps=1e-1)
test_close(xb.max(), 1, eps=1e-1)
TSClipOutliers min=-1, max=1

{% endraw %} {% raw %}

class TSClip[source]

TSClip(min=-6, max=6, **kwargs) :: Transform

Clip batch of type TSTensor

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor(torch.randn(10, 20, 100)*10)
test_le(TSClip()(t).max().item(), 6)
test_ge(TSClip()(t).min().item(), -6)
{% endraw %} {% raw %}

class TSSelfMissingness[source]

TSSelfMissingness(sel_vars=None, **kwargs) :: Transform

Applies missingness from samples in a batch to random samples in the batch for selected variables

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor(torch.randn(10, 20, 100))
t[t>.8] = np.nan
t2 = TSSelfMissingness()(t.clone())
t3 = TSSelfMissingness(sel_vars=[0,3,5,7])(t.clone())
assert (torch.isnan(t).sum() < torch.isnan(t2).sum()) and (torch.isnan(t2).sum() >  torch.isnan(t3).sum())
{% endraw %} {% raw %}

class TSRobustScale[source]

TSRobustScale(median=None, iqr=None, quantile_range=(25.0, 75.0), use_single_batch=True, eps=1e-08, verbose=False, **kwargs) :: Transform

This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
batch_tfms = TSRobustScale(verbose=True, use_single_batch=False)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, batch_tfms=batch_tfms, num_workers=0)
xb, yb = next(iter(dls.train))
xb.min()
TSRobustScale median=torch.Size([1, 24, 1]) iqr=torch.Size([1, 24, 1])
TSTensor([-2.420729875564575], device=cpu)
{% endraw %} {% raw %}

get_stats_with_uncertainty[source]

get_stats_with_uncertainty(o, sel_vars=None, sel_vars_zero_mean_unit_var=False, bs=64, n_trials=None, axis=(0, 2))
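
Estimates the per-variable mean and std over multiple random batches of size bs and returns the expected value and the uncertainty (standard deviation) of each estimate: E_mean, S_mean, E_std, S_std (see the example below).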

{% endraw %} {% raw %}

get_random_stats[source]

get_random_stats(E_mean, S_mean, E_std, S_std)
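
Draws a random mean and std from the Gaussian distributions defined by the expected values and uncertainties returned by get_stats_with_uncertainty, so each call produces a slightly different set of stats.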

{% endraw %} {% raw %}

class TSGaussianStandardize[source]

TSGaussianStandardize(E_mean:ndarray, S_mean:ndarray, E_std:ndarray, S_std:ndarray, eps=1e-08, split_idx=0, **kwargs) :: Transform

Scales each batch using modeled mean and std based on UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958

| | Type | Default | Details |
|---|---|---|---|
| E_mean | ndarray | | Mean expected value |
| S_mean | ndarray | | Uncertainty (standard deviation) of the mean |
| E_std | ndarray | | Standard deviation expected value |
| S_std | ndarray | | Uncertainty (standard deviation) of the standard deviation |
| eps | float | 1e-08 | (epsilon) small amount added to the standard deviation to avoid dividing by zero |
| split_idx | int | 0 | Flag to indicate to which set this transform is applied. 0: training, 1: validation, None: both |
| kwargs | | | |
{% endraw %} {% raw %}
{% endraw %} {% raw %}
arr = np.random.rand(1000, 2, 50)
E_mean, S_mean, E_std, S_std = get_stats_with_uncertainty(arr, sel_vars=None, bs=64, n_trials=None, axis=(0,2))
new_mean, new_std = get_random_stats(E_mean, S_mean, E_std, S_std)
new_mean2, new_std2 = get_random_stats(E_mean, S_mean, E_std, S_std)
test_ne(new_mean, new_mean2)
test_ne(new_std, new_std2)
test_eq(new_mean.shape, (1, 2, 1))
test_eq(new_std.shape, (1, 2, 1))
new_mean, new_std
(array([[[0.49325317],
         [0.49157721]]]),
 array([[[0.29111346],
         [0.2892515 ]]]))
{% endraw %}

TSGaussianStandardize can be used jointly with TSStandardize in the following way:

X, y, splits = get_UCR_data('LSST', split_data=False)
tfms = [None, TSClassification()]
E_mean, S_mean, E_std, S_std = get_stats_with_uncertainty(X, sel_vars=None, bs=64, n_trials=None, axis=(0,2))
batch_tfms = [TSGaussianStandardize(E_mean, S_mean, E_std, S_std, split_idx=0), TSStandardize(E_mean, E_std, split_idx=1)]
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[32, 64])
learn = ts_learner(dls, InceptionTimePlus, metrics=accuracy, cbs=[ShowGraph()])
learn.fit_one_cycle(1, 1e-2)

In this way, the train batches are scaled using means and standard deviations sampled from the modeled distributions, while the valid batches are scaled with fixed mean and standard deviation values.

The intent is to improve out-of-distribution performance. This method is inspired by UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958.

{% raw %}

class TSDiff[source]

TSDiff(lag=1, pad=True, **kwargs) :: Transform

Differences batch of type TSTensor

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor(torch.arange(24).reshape(2,3,4))
test_eq(TSDiff()(t)[..., 1:].float().mean(), 1)
test_eq(TSDiff(lag=2, pad=False)(t).float().mean(), 2)
{% endraw %} {% raw %}

class TSLog[source]

TSLog(ex=None, **kwargs) :: Transform

Log transforms a batch of type TSTensor using a sign-preserving log1p (log(1 + |x|)). Accepts positive and negative numbers

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor(torch.rand(2,3,4)) * 2 - 1 
tfm = TSLog()
enc_t = tfm(t)
test_ne(enc_t, t)
test_close(tfm.decodes(enc_t).data, t.data)
{% endraw %} {% raw %}

class TSCyclicalPosition[source]

TSCyclicalPosition(magnitude=None, **kwargs) :: Transform

Concatenates the position along the sequence as 2 additional variables (sine and cosine)

Args: magnitude: added for compatibility. It's not used.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSCyclicalPosition()(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 2
plt.plot(enc_t[0, -2:].cpu().numpy().T)
plt.show()
{% endraw %} {% raw %}

class TSLinearPosition[source]

TSLinearPosition(magnitude=None, lin_range=(-1, 1), **kwargs) :: Transform

Concatenates the position along the sequence as 1 additional variable

Args: magnitude: added for compatibility. It's not used.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSLinearPosition()(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 1
plt.plot(enc_t[0, -1].cpu().numpy().T)
plt.show()
{% endraw %} {% raw %}

class TSPosition[source]

TSPosition(cyclical=True, linear=True, magnitude=None, lin_range=(-1, 1), **kwargs) :: Transform

Concatenates linear and/or cyclical positions along the sequence as additional variables

{% endraw %} {% raw %}
{% endraw %} {% raw %}
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSPosition(cyclical=True, linear=True)(t)
test_eq(enc_t.shape[1], 6)
plt.plot(enc_t[0, 3:].T);
{% endraw %} {% raw %}

class TSMissingness[source]

TSMissingness(feature_idxs=None, magnitude=None, **kwargs) :: Transform

Concatenates data missingness for selected features along the sequence as additional variables

{% endraw %} {% raw %}
{% endraw %} {% raw %}
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t>.5] = np.nan
enc_t = TSMissingness(feature_idxs=[0,2])(t)
test_eq(enc_t.shape[1], 5)
test_eq(enc_t[:, 3:], torch.isnan(t[:, [0,2]]).float())
{% endraw %} {% raw %}

class TSPositionGaps[source]

TSPositionGaps(feature_idxs=None, magnitude=None, forward=True, backward=False, nearest=False, normalize=True, **kwargs) :: Transform

Concatenates gaps for selected features along the sequence as additional variables

{% endraw %} {% raw %}
{% endraw %} {% raw %}
bs, c_in, seq_len = 1,3,8
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t>.5] = np.nan
enc_t = TSPositionGaps(feature_idxs=[0,2], forward=True, backward=True, nearest=True, normalize=False)(t)
test_eq(enc_t.shape[1], 9)
enc_t.data
tensor([[[       nan, 3.4281e-01,        nan,        nan,        nan,
          3.8138e-01, 4.1530e-03,        nan],
         [9.8590e-02,        nan, 2.0313e-03, 5.3328e-02,        nan,
                 nan,        nan,        nan],
         [8.0353e-03, 1.3747e-02, 1.5980e-01, 3.5356e-01, 3.9624e-01,
                 nan, 1.4024e-01,        nan],
         [1.0000e+00, 2.0000e+00, 1.0000e+00, 2.0000e+00, 3.0000e+00,
          4.0000e+00, 1.0000e+00, 1.0000e+00],
         [1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
          1.0000e+00, 2.0000e+00, 1.0000e+00],
         [1.0000e+00, 4.0000e+00, 3.0000e+00, 2.0000e+00, 1.0000e+00,
          1.0000e+00, 2.0000e+00, 1.0000e+00],
         [1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 2.0000e+00,
          1.0000e+00, 2.0000e+00, 1.0000e+00],
         [1.0000e+00, 2.0000e+00, 1.0000e+00, 2.0000e+00, 1.0000e+00,
          1.0000e+00, 1.0000e+00, 1.0000e+00],
         [1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
          1.0000e+00, 2.0000e+00, 1.0000e+00]]])
{% endraw %} {% raw %}

class TSRollingMean[source]

TSRollingMean(feature_idxs=None, magnitude=None, window=2, replace=False, **kwargs) :: Transform

Calculates the rolling mean for all or selected features along the sequence

It either replaces the original values or adds additional variables (default). If nan values are found, they will be filled forward and backward.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
bs, c_in, seq_len = 1,3,8
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t > .6] = np.nan
print(t.data)
enc_t = TSRollingMean(feature_idxs=[0,2], window=3)(t)
test_eq(enc_t.shape[1], 5)
print(enc_t.data)
enc_t = TSRollingMean(window=3, replace=True)(t)
test_eq(enc_t.shape[1], 3)
print(enc_t.data)
tensor([[[   nan,    nan,    nan, 0.4031, 0.4682, 0.4074, 0.1519,    nan],
         [   nan, 0.2511,    nan,    nan, 0.2277, 0.0215, 0.3229, 0.4810],
         [0.5031, 0.4936,    nan, 0.5097, 0.0483,    nan, 0.5657, 0.2998]]])
tensor([[[0.4031, 0.4031, 0.4031, 0.4031, 0.4682, 0.4074, 0.1519, 0.1519],
         [   nan, 0.2511,    nan,    nan, 0.2277, 0.0215, 0.3229, 0.4810],
         [0.5031, 0.4936, 0.4936, 0.5097, 0.0483, 0.0483, 0.5657, 0.2998],
         [0.4031, 0.4031, 0.4031, 0.4031, 0.4248, 0.4262, 0.3425, 0.2371],
         [0.5031, 0.4983, 0.4968, 0.4990, 0.3505, 0.2021, 0.2207, 0.3046]]])
tensor([[[0.4031, 0.4031, 0.4031, 0.4031, 0.4248, 0.4262, 0.3425, 0.2371],
         [0.2511, 0.2511, 0.2511, 0.2511, 0.2433, 0.1667, 0.1907, 0.2751],
         [0.5031, 0.4983, 0.4968, 0.4990, 0.3505, 0.2021, 0.2207, 0.3046]]])
{% endraw %} {% raw %}

class TSLogReturn[source]

TSLogReturn(lag=1, pad=True, **kwargs) :: Transform

Calculates the log-return of a batch of type TSTensor. For positive values only

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor([1,2,4,8,16,32,64,128,256]).float()
test_eq(TSLogReturn(pad=False)(t).std(), 0)
{% endraw %} {% raw %}
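Assuming pad=True (the default) pads the front of the sequence as TSDiff does, the output keeps the original length; a quick sketch:

t = TSTensor([1,2,4,8,16,32,64,128,256]).float()
test_eq(TSLogReturn()(t).shape, t.shape)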

class TSAdd[source]

TSAdd(add, **kwargs) :: Transform

Add a defined amount to each batch of type TSTensor.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor([1,2,3]).float()
test_eq(TSAdd(1)(t), TSTensor([2,3,4]).float())
{% endraw %} {% raw %}

class TSClipByVar[source]

TSClipByVar(var_min_max, **kwargs) :: Transform

Clip batch of type TSTensor by variable

Args: var_min_max: list of tuples containing variable index, min value (or None) and max value (or None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
t = TSTensor(torch.rand(16, 3, 10) * tensor([1,10,100]).reshape(1,-1,1))
max_values = t.max(0).values.max(-1).values.data
max_values2 = TSClipByVar([(1,None,5), (2,10,50)])(t).max(0).values.max(-1).values.data
test_le(max_values2[1], 5)
test_ge(max_values2[2], 10)
test_le(max_values2[2], 50)
{% endraw %}

sklearn API transforms

{% raw %}

class TSShrinkDataFrame[source]

TSShrinkDataFrame(columns=None, skip=[], obj2cat=True, int2uint=False, verbose=True) :: BaseEstimator

Shrinks a DataFrame's memory usage by downcasting each numeric column to the smallest dtype that can hold its values. obj2cat converts object columns to category, and int2uint allows non-negative integer columns to be stored as unsigned ints.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
df = pd.DataFrame()
df["ints64"] = np.random.randint(0,3,10)
df['floats64'] = np.random.rand(10)
tfm = TSShrinkDataFrame()
tfm.fit(df)
df = tfm.transform(df)
test_eq(df["ints64"].dtype, "int8")
test_eq(df["floats64"].dtype, "float32")
Memory usage of dataframe is 0.000274658203125 MB
Memory usage of dataframe after reduction 0.0001697540283203125 MB
Reduced by 38.19444444444444 % 
{% endraw %} {% raw %}
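A sketch of the int2uint option; the expected uint8 dtype is an assumption based on the parameter name and the value range used:

df = pd.DataFrame()
df["ints64"] = np.random.randint(0, 200, 10)  # non-negative values that fit in 8 bits
tfm = TSShrinkDataFrame(int2uint=True, verbose=False)
tfm.fit(df)
df = tfm.transform(df)
test_eq(df["ints64"].dtype, "uint8")  # assumed: non-negative ints are downcast to unsigned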

class TSOneHotEncoder[source]

TSOneHotEncoder(columns=None, drop=True, add_na=True, dtype=int64) :: BaseEstimator

One-hot encodes the selected columns of a DataFrame, adding one binary column per unique value. add_na also reserves a column for missing values, and drop controls whether the original columns are removed.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
df = pd.DataFrame()
df["a"] = np.random.randint(0,2,10)
df["b"] = np.random.randint(0,3,10)
unique_cols = len(df["a"].unique()) + len(df["b"].unique())
tfm = TSOneHotEncoder()
tfm.fit(df)
df = tfm.transform(df)
test_eq(df.shape[1], unique_cols)
{% endraw %} {% raw %}

class TSCategoricalEncoder[source]

TSCategoricalEncoder(columns=None, add_na=True) :: BaseEstimator

Encodes the selected columns of a DataFrame as integer categories. add_na reserves a code for missing or unseen values.

{% endraw %} {% raw %}
{% endraw %}

Stateful transforms like TSCategoricalEncoder can easily be serialized.

{% raw %}
import joblib
df = pd.DataFrame()
df["a"] = alphabet[np.random.randint(0,2,100)]
df["b"] = ALPHABET[np.random.randint(0,3,100)]
a_unique = len(df["a"].unique())
b_unique = len(df["b"].unique())
tfm = TSCategoricalEncoder()
tfm.fit(df)
joblib.dump(tfm, "data/TSCategoricalEncoder.joblib")
tfm = joblib.load("data/TSCategoricalEncoder.joblib")
df = tfm.transform(df)
test_eq(df['a'].max(), a_unique)
test_eq(df['b'].max(), b_unique)
{% endraw %} {% raw %}

class TSDateTimeEncoder[source]

TSDateTimeEncoder(datetime_columns=None, prefix=None, drop=True, time=False, attr=['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']) :: BaseEstimator

Expands each datetime column of a DataFrame into the date attributes listed in attr (plus time attributes if time=True), optionally dropping the original column.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import datetime
df = pd.DataFrame()
df.loc[0, "date"] = datetime.datetime.now()
df.loc[1, "date"] = datetime.datetime.now() + pd.Timedelta(1, unit="D")
tfm = TSDateTimeEncoder()
joblib.dump(tfm, "data/TSDateTimeEncoder.joblib")
tfm = joblib.load("data/TSDateTimeEncoder.joblib")
tfm.fit_transform(df)
| | _Year | _Month | _Week | _Day | _Dayofweek | _Dayofyear | _Is_month_end | _Is_month_start | _Is_quarter_end | _Is_quarter_start | _Is_year_end | _Is_year_start |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 3 | 13 | 28 | 0 | 87 | False | False | False | False | False | False |
| 1 | 2022 | 3 | 13 | 29 | 1 | 88 | False | False | False | False | False | False |
{% endraw %} {% raw %}

class TSMissingnessEncoder[source]

TSMissingnessEncoder(columns=None) :: BaseEstimator

Adds a binary *_missing indicator column for each selected column of a DataFrame, flagging nan values.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
data = np.random.rand(10,3)
data[data > .8] = np.nan
df = pd.DataFrame(data, columns=["a", "b", "c"])
tfm = TSMissingnessEncoder()
tfm.fit(df)
joblib.dump(tfm, "data/TSMissingnessEncoder.joblib")
tfm = joblib.load("data/TSMissingnessEncoder.joblib")
df = tfm.transform(df)
df
| | a | b | c | a_missing | b_missing | c_missing |
|---|---|---|---|---|---|---|
| 0 | NaN | 0.083395 | 0.022296 | 1 | 0 | 0 |
| 1 | 0.562314 | NaN | NaN | 0 | 1 | 1 |
| 2 | 0.542282 | 0.143784 | 0.278463 | 0 | 0 | 0 |
| 3 | 0.674480 | 0.503308 | 0.505716 | 0 | 0 | 0 |
| 4 | 0.717242 | 0.418776 | 0.432351 | 0 | 0 | 0 |
| 5 | 0.398410 | 0.493584 | 0.251381 | 0 | 0 | 0 |
| 6 | NaN | 0.409473 | 0.421612 | 1 | 0 | 0 |
| 7 | NaN | 0.489890 | NaN | 1 | 0 | 1 |
| 8 | 0.708219 | 0.792705 | 0.030193 | 0 | 0 | 0 |
| 9 | 0.613305 | 0.716313 | 0.518017 | 0 | 0 | 0 |
{% endraw %}

y transforms

{% raw %}

class Preprocessor[source]

Preprocessor(preprocessor, **kwargs)
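
Preprocessor wraps an sklearn-style preprocessor class so that it can be fit on the training split of a 1d target and then applied (and inverted) on the full array, as the examples below show with StandardScaler, RobustScaler, Normalizer, BoxCox, YeoJohnshon and Quantile.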

{% endraw %} {% raw %}
{% endraw %} {% raw %}
from tsai.data.validation import TimeSplitter
y = random_shuffle(np.random.randn(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(StandardScaler)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
{% endraw %} {% raw %}
y = random_shuffle(np.random.randn(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(RobustScaler)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
{% endraw %} {% raw %}
y = random_shuffle(np.random.rand(1000) * 3 + .5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(Normalizer)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
{% endraw %} {% raw %}
y = random_shuffle(np.random.rand(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(BoxCox)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
{% endraw %} {% raw %}
y = random_shuffle(np.random.randn(1000) * 10 + 5)
y = np.random.beta(.5, .5, size=1000)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(YeoJohnshon)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
{% endraw %} {% raw %}
y = - np.random.beta(1, .5, 10000) * 10
splits = TimeSplitter()(y)
preprocessor = Preprocessor(Quantile)
preprocessor.fit(y[splits[0]])
plt.hist(y, 50, label='ori',)
y_tfm = preprocessor.transform(y)
plt.legend(loc='best')
plt.show()
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
test_close(preprocessor.inverse_transform(y_tfm), y, 1e-1)
{% endraw %} {% raw %}

ReLabeler[source]

ReLabeler(cm)

Changes the labels in a dataset based on a dictionary (class mapping).

Args: cm = class mapping dictionary

{% endraw %} {% raw %}
{% endraw %} {% raw %}
vals = {0:'a', 1:'b', 2:'c', 3:'d', 4:'e'}
y = np.array([vals[i] for i in np.random.randint(0, 5, 20)])
labeler = ReLabeler(dict(a='x', b='x', c='y', d='z', e='z'))
y_new = labeler(y)
test_eq(y.shape, y_new.shape)
y, y_new
(array(['d', 'e', 'b', 'b', 'd', 'a', 'd', 'd', 'c', 'a', 'e', 'a', 'b',
        'e', 'd', 'b', 'c', 'b', 'a', 'b'], dtype='<U1'),
 array(['z', 'z', 'x', 'x', 'z', 'x', 'z', 'z', 'y', 'x', 'z', 'x', 'x',
        'z', 'z', 'x', 'y', 'x', 'x', 'x'], dtype='<U1'))
{% endraw %}