---
title: External data
keywords: fastai
sidebar: home_sidebar
summary: "Helper functions used to download and extract common time series datasets."
description: "Helper functions used to download and extract common time series datasets."
nb_path: "nbs/012_data.external.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

decompress_from_url[source]

decompress_from_url(url, target_dir=None, verbose=False)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

download_data[source]

download_data(url, fname=None, c_key='archive', force_download=False, timeout=4, verbose=False)

Download `url` to `fname`.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_UCR_univariate_list[source]

get_UCR_univariate_list()

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_UCR_multivariate_list[source]

get_UCR_multivariate_list()

{% endraw %} {% raw %}
158
{% endraw %} {% raw %}

get_UCR_data[source]

get_UCR_data(dsid, path='.', parent_dir='data/UCR', on_disk=True, mode='c', Xdtype='float32', ydtype=None, return_split=True, split_data=True, force_download=False, verbose=False)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
from fastai.data.transforms import get_files
# End-to-end check of get_UCR_data on one univariate and one multivariate dataset:
# fresh load, reload from the files already on disk, and in-memory load.
PATH = Path('.')
dsids = ['ECGFiveDays', 'AtrialFibrillation'] # univariate and multivariate
for dsid in dsids:
    print(dsid)
    tgt_dir = PATH/f'data/UCR/{dsid}'
    # Remove any previous copy so the first call below starts from an empty directory.
    if os.path.isdir(tgt_dir): shutil.rmtree(tgt_dir)
    test_eq(len(get_files(tgt_dir)), 0) # no files left
    X_train, y_train, X_valid, y_valid = get_UCR_data(dsid)
    # Exactly 6 .npy files are expected, and nothing else (no leftover files or dirs).
    test_eq(len(get_files(tgt_dir, '.npy')), 6)
    test_eq(len(get_files(tgt_dir, '.npy')), len(get_files(tgt_dir)))
    del X_train, y_train, X_valid, y_valid
    # Second call: the .npy files already exist, so it is expected to return in under a second.
    # perf_counter is monotonic, so the measurement is not affected by system clock adjustments.
    start = time.perf_counter()
    X_train, y_train, X_valid, y_valid = get_UCR_data(dsid)
    elapsed = time.perf_counter() - start
    test_eq(elapsed < 1, True)
    # Reloading must not create or leave behind any extra files.
    test_eq(len(get_files(tgt_dir, '.npy')), 6)
    test_eq(len(get_files(tgt_dir, '.npy')), len(get_files(tgt_dir)))
    # X is 3D and y is 1D for both datasets.
    test_eq(X_train.ndim, 3)
    test_eq(y_train.ndim, 1)
    test_eq(X_valid.ndim, 3)
    test_eq(y_valid.ndim, 1)
    # With the default arguments (on_disk=True, Xdtype='float32') X is a float32 memmap.
    test_eq(X_train.dtype, np.float32)
    test_eq(X_train.__class__.__name__, 'memmap')
    del X_train, y_train, X_valid, y_valid
    # on_disk=False returns a regular ndarray instead.
    X_train, y_train, X_valid, y_valid = get_UCR_data(dsid, on_disk=False)
    test_eq(X_train.__class__.__name__, 'ndarray')
    del X_train, y_train, X_valid, y_valid
ECGFiveDays
AtrialFibrillation
{% endraw %} {% raw %}
# Load a dataset with the default arguments (split_data=True, return_split=True),
# which return the four pre-split arrays.
X_train, y_train, X_valid, y_valid = get_UCR_data('natops')
{% endraw %} {% raw %}
# Check that the unsplit return (X, y, splits) is consistent with the pre-split return.
dsid = 'natops'
X_train, y_train, X_valid, y_valid = get_UCR_data(dsid, verbose=True)
X, y, splits = get_UCR_data(dsid, split_data=False)
# Indexing the unsplit arrays with each split must reproduce the pre-split arrays:
# splits[0] selects the training samples and splits[1] the validation samples.
test_eq(X[splits[0]], X_train)
test_eq(y[splits[0]], y_train)
test_eq(X[splits[1]], X_valid)
test_eq(y[splits[1]], y_valid)
# Both return forms must use the same array types.
test_type(X, X_train)
test_type(y, y_train)
Dataset: NATOPS
X_train: (180, 24, 51)
y_train: (180,)
X_valid: (180, 24, 51)
y_valid: (180,) 

{% endraw %} {% raw %}

check_data[source]

check_data(X, y=None, splits=None, show_plot=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Walk check_data through several input variants; each call prints one X / y / splits report.
dsid = 'ECGFiveDays'
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
# 3D X with string labels: the report lists samples x features x timesteps and the class balance.
check_data(X, y, splits)
# 2D X (X[:, 0] drops the feature axis): the shape is reported as a plain tuple.
check_data(X[:, 0], y, splits)
# Float labels: the y line reports a nan count instead of class information.
y = y.astype(np.float32)
check_data(X, y, splits)
# Missing float labels: reported as "isnan: 10" together with a "y contains nan values" warning.
y[:10] = np.nan
check_data(X[:, 0], y, splits)
# Reload the data and replace the predefined splits with 3 splits built by get_splits.
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
splits = get_splits(y, 3)
check_data(X, y, splits)
check_data(X[:, 0], y, splits)
# NOTE(review): y is a '<U1' string array at this point, so np.nan is stored as the truncated
# string 'n' and is reported as a third class (['1', '2', 'n']) rather than as a missing value.
y[:5]= np.nan
check_data(X[:, 0], y, splits)
# Reload once more — presumably to discard the in-place edits made to y above.
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 2 shape: [23, 861]  overlap: False
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 2 shape: [23, 861]  overlap: False
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:float32  isnan: 0
splits - n_splits: 2 shape: [23, 861]  overlap: False
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:float32  isnan: 10
splits - n_splits: 2 shape: [23, 861]  overlap: False
/Users/nacho/opt/anaconda3/envs/py37torch110/lib/python3.7/site-packages/ipykernel_launcher.py:23: UserWarning: y contains nan values
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 3 (294 samples per class) ['1', '2', 'n']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]
{% endraw %} {% raw %}

get_Monash_regression_list[source]

get_Monash_regression_list()

{% endraw %} {% raw %}
15
{% endraw %} {% raw %}

get_Monash_regression_data[source]

get_Monash_regression_data(dsid, path='./data/Monash', on_disk=True, mode='c', Xdtype='float32', ydtype=None, split_data=True, force_download=False, verbose=False, timeout=4)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Load the same Monash regression dataset in both return forms and check the expected shapes.
dsid = "Covid3Month"
# split_data=True: four pre-split arrays.
X_train, y_train, X_valid, y_valid = get_Monash_regression_data(dsid, on_disk=False, split_data=True, force_download=False)
# split_data=False: the full X and y plus the split indices; verbose=True prints their shapes.
X, y, splits = get_Monash_regression_data(dsid, on_disk=True, split_data=False, force_download=False, verbose=True)
# The None guards suggest the loader can return None (e.g. if the download fails) — TODO confirm.
if X_train is not None: 
    test_eq(X_train.shape, (140, 1, 84))
if X is not None: 
    test_eq(X.shape, (201, 1, 84))
Dataset: Covid3Month
X      : (201, 1, 84)
y      : (201,)
splits : (#140) [0,1,2,3,4,5,6,7,8,9...] (#61) [140,141,142,143,144,145,146,147,148,149...] 

{% endraw %} {% raw %}

get_forecasting_list[source]

get_forecasting_list()

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_forecasting_time_series[source]

get_forecasting_time_series(dsid, path='./data/forecasting/', force_download=False, verbose=True, **kwargs)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Load the sunspots series; the lower-case id is reported as "Sunspots" in the printed output.
ts = get_forecasting_time_series("sunspots", force_download=False)
test_eq(len(ts), 3235)
# Bare expression as the last line of the cell so the notebook renders the result as a table.
ts
Dataset: Sunspots
downloading data...
...data downloaded. Path = data/forecasting/Sunspots.csv
Monthly Mean Total Sunspot Number
Date
1749-01-31 96.7
1749-02-28 104.3
1749-03-31 116.7
1749-04-30 92.8
1749-05-31 141.7
... ...
2018-03-31 2.5
2018-04-30 8.9
2018-05-31 13.2
2018-06-30 15.9
2018-07-31 1.6

3235 rows × 1 columns

{% endraw %} {% raw %}
ts = get_forecasting_time_series("weather", force_download=False)
# NOTE(review): in the recorded run the load failed with
# "module 'datetime' has no attribute 'timestamp'" and only a "Cannot download Weather dataset"
# warning was emitted; ts was apparently None, so the checks below did not run.
if ts is not None: 
    test_eq(len(ts), 70091)
    print(ts)
Dataset: Weather
downloading data...
...data downloaded. Path = data/forecasting/Weather.csv.zip
module 'datetime' has no attribute 'timestamp'
/Users/nacho/opt/anaconda3/envs/py37torch110/lib/python3.7/site-packages/ipykernel_launcher.py:69: UserWarning: Cannot download Weather dataset
{% endraw %} {% raw %}
{% endraw %} {% raw %}

convert_tsf_to_dataframe[source]

convert_tsf_to_dataframe(full_file_path_and_name, replace_missing_vals_with='NaN', value_column_name='series_value')

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_Monash_forecasting_data[source]

get_Monash_forecasting_data(dsid, path='./data/forecasting/', force_download=False, remove_from_disk=False, verbose=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Load a Monash forecasting dataset as a single array and check its expected shape.
dsid = 'm1_yearly_dataset'
X = get_Monash_forecasting_data(dsid, force_download=False)
# The None guard suggests the loader can return None when the data is unavailable — TODO confirm.
if X is not None: 
    test_eq(X.shape, (181, 1, 58))
Dataset: m1_yearly_dataset
converting dataframe to numpy array...
...dataframe converted to numpy array

X.shape: (181, 1, 58)
freq: yearly
forecast_horizon: 6
contain_missing_values: False
contain_equal_length: False
{% endraw %}