---
title: TSiT & InceptionTSiT
keywords: fastai
sidebar: home_sidebar
summary: "These are PyTorch implementations created by Ignacio Oguiza (timeseriesAI@gmail.com) based on ViT (Vision Transformer)"
description: "These are PyTorch implementations created by Ignacio Oguiza (timeseriesAI@gmail.com) based on ViT (Vision Transformer)"
nb_path: "nbs/124_models.TSiTPlus.ipynb"
---
class TSiTPlus(nn.Sequential):
"""Time series transformer model based on ViT (Vision Transformer):
Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., ... & Houlsby, N. (2020).
An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929.
This implementation is a modified version of the Vision Transformer that is part of the great timm library
(https://github.com/rwightman/pytorch-image-models/blob/72b227dcf57c0c62291673b96bdc06576bb90457/timm/models/vision_transformer.py)
"""
def __init__(self, c_in:int, c_out:int, seq_len:int, n_layers:int=6, d_model:int=128, n_heads:int=16, d_head:Optional[int]=None, act:str='reglu',
d_ff:int=256, attn_dropout:float=0, dropout:float=0., drop_path_rate:float=0., mlp_ratio:int=1,
qkv_bias:bool=True, pre_norm:bool=False, use_token:bool=True, fc_dropout:float=0., bn:bool=False, y_range:Optional[tuple]=None,
ks:Optional[int]=None, maxpool:bool=True, feature_extractor:Optional[Callable]=None, custom_head:Optional[Callable]=None, verbose:bool=False):
"""
Args:
=====
c_in: the number of features (aka variables, dimensions, channels) in the time series dataset.
c_out: the number of target classes.
seq_len: number of time steps in the time series.
n_layers: number of layers (or blocks) in the encoder. Default: 6.
d_model: total dimension of the model (number of features created by the model). Default: 128 (range(64-512))
n_heads: parallel attention heads. Default: 16 (range(8-16)).
d_head: size of the learned linear projection of queries, keys and values in the MHA. Usual values: 16-512.
Default: None -> (d_model/n_heads) = 8.
act: the activation function of the intermediate layer: relu, gelu, geglu or reglu. Default: reglu.
d_ff: the dimension of the feedforward network model. Default: 256 (range(256-512))
dropout: dropout applied to the embedded sequence steps after position embeddings have been added and
to the mlp sublayer in the encoder.
attn_dropout: dropout rate applied to the attention sublayer.
drop_path_rate: stochastic depth rate.
mlp_ratio: ratio of mlp hidden dim to embedding dim.
qkv_bias: determines whether bias is applied to the Linear projections of queries, keys and values in the MultiheadAttention
pre_norm: if True normalization will be applied as the first step in the sublayers. Defaults to False.
use_token: if True, the output will come from the transformed token. Otherwise a pooling layer will be applied.
fc_dropout: dropout applied to the final fully connected layer.
bn: flag that indicates if batchnorm will be applied to the head.
y_range: range of possible y values (used in regression tasks).
ks: (Optional) kernel sizes that will be applied to a hybrid embedding.
maxpool: If True and kernel sizes are passed, maxpool will also be added to the hybrid embedding.
feature_extractor: an optional callable (nn.Conv1d with dilation > 1 or stride > 1 for example) that will be used to preprocess the time series before
the embedding step. It is useful to extract features or resample the time series.
custom_head: custom head that will be applied to the network. It must contain all kwargs (pass a partial function)
Input shape:
x: bs (batch size) x nvars (aka features, variables, dimensions, channels) x seq_len (aka time steps)
"""
backbone = _TSiTBackbone(c_in, seq_len, n_layers=n_layers, d_model=d_model, n_heads=n_heads, d_head=d_head, act=act, d_ff=d_ff,
attn_dropout=attn_dropout, dropout=dropout, drop_path_rate=drop_path_rate, pre_norm=pre_norm, mlp_ratio=mlp_ratio,
use_token=use_token, ks=ks, maxpool=maxpool, feature_extractor=feature_extractor, verbose=verbose)
self.head_nf = d_model
self.c_out = c_out
self.seq_len = seq_len
if custom_head:
head = custom_head(self.head_nf, c_out, self.seq_len) # custom head passed as a partial func with all its kwargs
else:
layers = [TokenLayer(token=use_token)]
layers += [LinBnDrop(d_model, c_out, bn=bn, p=fc_dropout)]
if y_range: layers += [SigmoidRange(*y_range)]
head = nn.Sequential(*layers)
super().__init__(OrderedDict([('backbone', backbone), ('head', head)]))
TSiT = TSiTPlus
InceptionTSiTPlus = named_partial("InceptionTSiTPlus", TSiTPlus, feature_extractor=partial(InceptionBlockPlus, ks=[3,5,7]))
InceptionTSiT = named_partial("InceptionTSiT", TSiTPlus, feature_extractor=partial(InceptionBlockPlus, ks=[3,5,7]))
ConvTSiT = named_partial("ConvTSiT", TSiTPlus, ks=[1,3,5,7])
ConvTSiTPlus = named_partial("ConvTSiTPlus", TSiTPlus, ks=[1,3,5,7])
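As the docstring explains, a custom_head is called with (head_nf, c_out, seq_len), so any additional kwargs must be bound with a partial. Below is a minimal, hypothetical sketch of such a head (the lin_head name and its fc_dropout kwarg are made up; it simply mirrors the default TokenLayer + linear head):
def lin_head(nf, c_out, seq_len, fc_dropout=0.):  # hypothetical helper, not part of the library
    return nn.Sequential(TokenLayer(token=True), nn.Dropout(fc_dropout), nn.Linear(nf, c_out))

bs, nvars, seq_len, c_out = 16, 4, 50, 2
xb = torch.rand(bs, nvars, seq_len)
model = TSiTPlus(nvars, c_out, seq_len, custom_head=partial(lin_head, fc_dropout=.1))
test_eq(model(xb).shape, (bs, c_out))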
bs = 16
nvars = 4
seq_len = 50
c_out = 2
xb = torch.rand(bs, nvars, seq_len)
model = TSiTPlus(nvars, c_out, seq_len, attn_dropout=.1, dropout=.1)
test_eq(model(xb).shape, (bs, c_out))
model
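The y_range argument described in the docstring adds a SigmoidRange to the default head, which bounds the output in regression tasks. A quick sketch (not in the original notebook), reusing the batch defined above with a single regression target bounded to (0, 5):
reg_model = TSiTPlus(nvars, 1, seq_len, y_range=(0, 5))
out = reg_model(xb)
test_eq(out.shape, (bs, 1))
assert (out >= 0).all() and (out <= 5).all()  # outputs are squashed into the given range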
bs = 16
nvars = 4
seq_len = 50
c_out = 2
xb = torch.rand(bs, nvars, seq_len)
model = InceptionTSiTPlus(nvars, c_out, seq_len)
test_eq(model(xb).shape, (bs, c_out))
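ConvTSiTPlus (defined above with ks=[1,3,5,7]) is used in the same way; the ks argument adds a hybrid (convolutional) embedding instead of a separate feature extractor. A quick sketch, reusing the batch from the previous cell:
model = ConvTSiTPlus(nvars, c_out, seq_len)
test_eq(model(xb).shape, (bs, c_out))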
from tsai.data.validation import get_splits
from tsai.data.core import get_ts_dls
X = np.zeros((10, 3, 5000))
y = np.random.randint(0,2,X.shape[0])
splits = get_splits(y)
dls = get_ts_dls(X, y, splits=splits)
xb, yb = dls.train.one_batch()
xb
If you try to use TSiTPlus on sequences this long (5,000 steps), it's likely you'll get an 'out-of-memory' error.
To avoid this you can subsample the sequence to reduce the input length. This can be done in multiple ways. Here are a few examples:
# downsample with a depthwise (grouped) convolution, keeping the same number of channels
feature_extractor = Conv1d(xb.shape[1], xb.shape[1], ks=100, stride=50, padding=0, groups=xb.shape[1]).to(default_device())
feature_extractor(xb).shape
# downsample with a standard convolution, reducing the number of channels to 2 as well
feature_extractor = Conv1d(xb.shape[1], 2, ks=100, stride=50, padding=0).to(default_device())
feature_extractor(xb).shape
# downsample with max pooling (right padding so the last window is complete)
feature_extractor = nn.Sequential(Pad1d((0, 50), 0), nn.MaxPool1d(kernel_size=100, stride=50)).to(default_device())
feature_extractor(xb).shape
# downsample with average pooling (same padding scheme)
feature_extractor = nn.Sequential(Pad1d((0, 50), 0), nn.AvgPool1d(kernel_size=100, stride=50)).to(default_device())
feature_extractor(xb).shape
Once you decide what type of transform you want to apply, you just need to pass the layer (or a partial that builds it) as the feature_extractor argument:
bs = 16
nvars = 4
seq_len = 1000
c_out = 2
d_model = 128
xb = torch.rand(bs, nvars, seq_len)
feature_extractor = partial(Conv1d, ks=5, stride=3, padding=0, groups=xb.shape[1])
model = TSiTPlus(nvars, c_out, seq_len, d_model=d_model, feature_extractor=feature_extractor)
test_eq(model(xb).shape, (bs, c_out))
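As an illustrative check (not part of the original notebook), building a Conv1d with the same kernel size and stride shows how much the sequence is shortened before it reaches the transformer encoder:
# ks=5, stride=3, no padding: 1000 steps -> (1000 - 5) // 3 + 1 = 332 steps
fe = Conv1d(nvars, d_model, ks=5, stride=3, padding=0, groups=nvars)
test_eq(fe(xb).shape, (bs, d_model, (seq_len - 5) // 3 + 1))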