--- title: Introduction keywords: fastai sidebar: home_sidebar nb_path: "examples/Getting_Started_with_Auto_Arima_and_ETS.ipynb" ---
{% raw %}
{% endraw %}

Open In Colab

1. Installing StatsForecast Library

{% raw %}
%%capture
!pip install -U numba
!pip install -U statsmodels
!pip install statsforecast
{% endraw %} {% raw %}
import numpy as np
import pandas as pd
from IPython.display import display, Markdown

import matplotlib.pyplot as plt
from statsforecast import StatsForecast
from statsforecast.models import auto_arima, ets
from statsforecast.utils import AirPassengers
{% endraw %} {% raw %}
#from statsforecast.models import __all__
#__all__
{% endraw %}

2. Loading AirPassengers Example Data

{% raw %}
# We use the index functionality to make the training a lot faster.
Y_df = pd.DataFrame({'unique_id': np.ones(len(AirPassengers)),
                     'ds': pd.date_range(start='1949-01-01', 
                                         periods=len(AirPassengers), freq='M'),
                     'y': AirPassengers})
Y_df.tail(13)
unique_id ds y
131 1.0 1959-12-31 405.0
132 1.0 1960-01-31 417.0
133 1.0 1960-02-29 391.0
134 1.0 1960-03-31 419.0
135 1.0 1960-04-30 461.0
136 1.0 1960-05-31 472.0
137 1.0 1960-06-30 535.0
138 1.0 1960-07-31 622.0
139 1.0 1960-08-31 606.0
140 1.0 1960-09-30 508.0
141 1.0 1960-10-31 461.0
142 1.0 1960-11-30 390.0
143 1.0 1960-12-31 432.0
{% endraw %}

3. Fit AutoArima and AutoETS

ETS: The exponential smoothing (ETS) algorithm is especially suited for data with seasonality and trend. ETS computes a weighted average over all observations in the input time series dataset as its prediction. In contrast to moving average methods with constant weights, ETS weights exponentially decrease over time, capturing long term dependencies while prioritizing new observations.

AutoARIMA: The autoregressive integrated moving average (ARIMA), combines differencing steps, lag regression and moving averages into a single method capable of modeling non-stationary time series. This method complements on ETS and it is based on the description of data's autocorrelations.

{% raw %}
Y_train_df = Y_df[Y_df.ds<='1959-12-31']
Y_test_df = Y_df[Y_df.ds>'1959-12-31']
print('len(Y_train_df)', len(Y_train_df))
print('len(Y_test_df)', len(Y_test_df))
len(Y_train_df) 132
len(Y_test_df) 12
{% endraw %} {% raw %}
# Note: For all models the following parameters are passed automaticly and 
# don't need to be declared: (X, h, future_xreg).
# for ets we pass a ZMZ, model, which stands for error selected optimally,
season_length = 12
horizon = len(Y_test_df)
model = StatsForecast(Y_train_df.set_index('unique_id'), 
                      models=[(auto_arima, season_length),
                              (ets, season_length, 'ZMZ')],
                      freq='M', n_jobs=-1)

# In this step, you could include further models like: ses, adida, historic_average, 
# croston_classic, croston_sba, croston_optimized, seasonal_window_average, seasonal_naive, 
# imapa naive, random_walk_with_drift, window_average, seasonal_exponential_smoothing and tsb.

# For some models like ARIMA, include confidence intervals
# For the moment confidence intervals for ETS are unavailable
Y_hat_df = model.forecast(horizon).reset_index()
Y_hat_df.head()
unique_id ds auto_arima_season_length-12 ets_season_length-12_model-ZMZ
0 1.0 1960-01-31 424.160156 419.163574
1 1.0 1960-02-29 407.081696 416.904449
2 1.0 1960-03-31 470.860535 480.243378
3 1.0 1960-04-30 460.913605 461.996887
4 1.0 1960-05-31 484.900879 463.853241
{% endraw %}

4. Plot and Evaluate Predictions

{% raw %}
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = pd.concat([Y_train_df, Y_hat_df]).set_index('ds')

plot_df[['y',
         'auto_arima_season_length-12',
         'ets_season_length-12_model-ZMZ']].plot(ax=ax, linewidth=2)

ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
{% endraw %}

Finally, we evaluate the predictions accuracy using the Mean Absolute Error:

$\qquad MAE = \frac{1}{Horizon} \sum_{\tau} |y_{\tau} - \hat{y}_{\tau}|\qquad$

{% raw %}
def mae(y_hat, y_true):
    return np.mean(np.abs(y_hat-y_true))

y_true = Y_test_df['y'].values
ets_preds = Y_hat_df['ets_season_length-12_model-ZMZ'].values
arima_preds = Y_hat_df['auto_arima_season_length-12'].values

print('ETS   MAE: %0.3f' % mae(ets_preds, y_true))
print('ARIMA MAE: %0.3f' % mae(arima_preds, y_true))
ETS   MAE: 16.222
ARIMA MAE: 18.551
{% endraw %}

5. Add Confidence Intervals to ARIMA

{% raw %}
# as folloows
Y_hat_df_intervals = model.forecast(h=12, level=(80, 95))
{% endraw %} {% raw %}
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
df_plot = pd.concat([Y_train_df, Y_hat_df_intervals]).set_index('ds')
df_plot[['y', 'auto_arima_season_length-12_mean','ets_season_length-12_model-ZMZ']].plot(ax=ax, linewidth=2)
ax.fill_between(df_plot.index, 
                df_plot['auto_arima_season_length-12_lo-80'], 
                df_plot['auto_arima_season_length-12_hi-80'],
                alpha=.35,
                color='orange',
                label='auto_arima_level_80')
ax.fill_between(df_plot.index, 
                df_plot['auto_arima_season_length-12_lo-95'], 
                df_plot['auto_arima_season_length-12_hi-95'],
                alpha=.2,
                color='orange',
                label='auto_arima_level_95')
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(20)
{% endraw %}

6. Add external regressors to ARIMA

{% raw %}
Y_train_df['trend'] = np.arange(1, len(Y_train_df) + 1)
Y_train_df['intercept'] = np.ones(len(Y_train_df))
Y_train_df['month'] = Y_train_df['ds'].dt.month
Y_train_df = pd.get_dummies(Y_train_df, columns=['month'], drop_first=True)
Y_train_df
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
unique_id ds y trend intercept month_2 month_3 month_4 month_5 month_6 month_7 month_8 month_9 month_10 month_11 month_12
0 1.0 1949-01-31 112.0 1 1.0 0 0 0 0 0 0 0 0 0 0 0
1 1.0 1949-02-28 118.0 2 1.0 1 0 0 0 0 0 0 0 0 0 0
2 1.0 1949-03-31 132.0 3 1.0 0 1 0 0 0 0 0 0 0 0 0
3 1.0 1949-04-30 129.0 4 1.0 0 0 1 0 0 0 0 0 0 0 0
4 1.0 1949-05-31 121.0 5 1.0 0 0 0 1 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
127 1.0 1959-08-31 559.0 128 1.0 0 0 0 0 0 0 1 0 0 0 0
128 1.0 1959-09-30 463.0 129 1.0 0 0 0 0 0 0 0 1 0 0 0
129 1.0 1959-10-31 407.0 130 1.0 0 0 0 0 0 0 0 0 1 0 0
130 1.0 1959-11-30 362.0 131 1.0 0 0 0 0 0 0 0 0 0 1 0
131 1.0 1959-12-31 405.0 132 1.0 0 0 0 0 0 0 0 0 0 0 1

132 rows × 16 columns

{% endraw %} {% raw %}
xreg_test = pd.DataFrame({
  'unique_id': 1,
  'ds': pd.date_range(start='1960-01-01', periods=len(Y_hat_df), freq='M')
})
# We construct xreg for test. The train series ends at the 133th step. 
xreg_test['trend'] = np.arange(133, len(Y_hat_df) + 133)
xreg_test['intercept'] = np.ones(len(Y_hat_df))
xreg_test['month'] = xreg_test['ds'].dt.month
xreg_test = pd.get_dummies(xreg_test, columns=['month'], drop_first=True)
xreg_test
unique_id ds trend intercept month_2 month_3 month_4 month_5 month_6 month_7 month_8 month_9 month_10 month_11 month_12
0 1 1960-01-31 133 1.0 0 0 0 0 0 0 0 0 0 0 0
1 1 1960-02-29 134 1.0 1 0 0 0 0 0 0 0 0 0 0
2 1 1960-03-31 135 1.0 0 1 0 0 0 0 0 0 0 0 0
3 1 1960-04-30 136 1.0 0 0 1 0 0 0 0 0 0 0 0
4 1 1960-05-31 137 1.0 0 0 0 1 0 0 0 0 0 0 0
5 1 1960-06-30 138 1.0 0 0 0 0 1 0 0 0 0 0 0
6 1 1960-07-31 139 1.0 0 0 0 0 0 1 0 0 0 0 0
7 1 1960-08-31 140 1.0 0 0 0 0 0 0 1 0 0 0 0
8 1 1960-09-30 141 1.0 0 0 0 0 0 0 0 1 0 0 0
9 1 1960-10-31 142 1.0 0 0 0 0 0 0 0 0 1 0 0
10 1 1960-11-30 143 1.0 0 0 0 0 0 0 0 0 0 1 0
11 1 1960-12-31 144 1.0 0 0 0 0 0 0 0 0 0 0 1
{% endraw %} {% raw %}
season_length = 12

# Note: For all models the following parameters are passed automaticly and don't need to be declared: (X, h, future_xreg)

model = StatsForecast(
    Y_train_df.set_index('unique_id'), 
    models=[(auto_arima, season_length), (ets, season_length, 'ZMZ')], 
    freq='M', 
    n_jobs=-1
)

Y_hat_df_xreg = model.forecast(horizon, xreg=xreg_test.set_index('unique_id'))
Y_hat_df_xreg = Y_hat_df_xreg.reset_index()
{% endraw %} {% raw %}
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
df_plot = pd.concat([Y_train_df, Y_hat_df_xreg]).set_index('ds')
df_plot[['y', 'auto_arima_season_length-12','ets_season_length-12_model-ZMZ']].plot(ax=ax, linewidth=2)
ax.set_title('AirPassengers Forecast (with AutoArima external regressors)', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(20)
{% endraw %}

7. Include other Benchmark models

{% raw %}
from statsforecast.models import seasonal_naive, naive

#Define the parameters that you want to use in your models. 
season_length = 12

# Note: For all models the following parameters are passed automaticly and don't need to be declared: (X, h, future_xreg)

model = StatsForecast(
    Y_train_df.set_index('unique_id'), 
    models=[(auto_arima, season_length), (ets, season_length, 'ZMZ'), 
            (seasonal_naive, season_length), naive], 
    freq='M', 
    n_jobs=-1
)

Y_hat_df_bench = model.forecast(horizon, xreg=xreg_test.set_index('unique_id'))
Y_hat_df_bench = Y_hat_df_bench.reset_index()
{% endraw %} {% raw %}
Y_hat_df_bench
unique_id ds auto_arima_season_length-12 ets_season_length-12_model-ZMZ seasonal_naive_season_length-12 naive
0 1.0 1960-01-31 393.107025 419.163574 360.0 405.0
1 1.0 1960-02-29 357.360138 416.904449 342.0 405.0
2 1.0 1960-03-31 430.633820 480.243378 406.0 405.0
3 1.0 1960-04-30 433.796173 461.996887 396.0 405.0
4 1.0 1960-05-31 462.941559 463.853241 420.0 405.0
5 1.0 1960-06-30 496.972107 527.359131 472.0 405.0
6 1.0 1960-07-31 576.668518 586.639160 548.0 405.0
7 1.0 1960-08-31 596.789368 585.093384 559.0 405.0
8 1.0 1960-09-30 483.331268 512.351868 463.0 405.0
9 1.0 1960-10-31 414.599670 446.725525 407.0 405.0
10 1.0 1960-11-30 383.801086 391.588348 362.0 405.0
11 1.0 1960-12-31 386.914398 444.086792 405.0 405.0
{% endraw %} {% raw %}
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
df_plot = pd.concat([Y_train_df, Y_hat_df_bench]).set_index('ds')
df_plot[['y', 'auto_arima_season_length-12','ets_season_length-12_model-ZMZ', 'seasonal_naive_season_length-12', 'naive']].plot(ax=ax, linewidth=2)
ax.set_title('AirPassengers Forecast (with AutoArima external regressors)', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(20)
{% endraw %}