--- title: TourismLarge dataset. keywords: fastai sidebar: home_sidebar nb_path: "nbs/data_datasets__tourismL.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

1. Auxiliary Functions

{% raw %}

class CodeTimer[source]

CodeTimer(name=None, verbose=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

one_hot_encoding[source]

one_hot_encoding(df, index_col)

{% endraw %} {% raw %}

nonzero_indexes_by_row[source]

nonzero_indexes_by_row(M)

{% endraw %} {% raw %}
{% endraw %}

2. TourismL Class

{% raw %}

class TourismL[source]

TourismL(S:DataFrame, X:DataFrame, Y:DataFrame, idx_categorical_static:Optional[List[T]]=None, group:Union[str, List[str]]=None) :: TimeSeriesDataclass

TourismL(S: pandas.core.frame.DataFrame, X: pandas.core.frame.DataFrame, Y: pandas.core.frame.DataFrame, idx_categorical_static: Union[List, NoneType] = None, group: Union[str, List[str]] = None)

{% endraw %} {% raw %}
{% endraw %}

3. TourismL Validation

3.1 Load and Process data

{% raw %}
# Folder holding the TourismL files used by this notebook.
directory = './data/hierarchical/TourismL'

# Preprocess first, then load; `data` is the dictionary that every
# validation cell below indexes by key.
TourismL.preprocess_data(directory=directory, verbose=False)
data = TourismL.load_process(directory=directory, verbose=True)
{% endraw %}

3.2 Validate data with plots.

Checking Hierarchical aggregation matrix

{% raw %}
# Sparsity pattern of the hierarchical aggregation matrix data['H'].
plt.figure(num=1, figsize=(6, 15), dpi=80, facecolor='w')
plt.spy(data['H'])
plt.show()
plt.close()
{% endraw %}

Checking pre-estimated levels and seasonalities in S_bottom

{% raw %}
# Visual check of the bottom-level arrays: target Y, static features S and
# temporal features X for a single bottom series.
dates = data['dates']
scols_bottom = data['scols']

# n_groups,n_series,n_features,n_time
# Flatten the leading axes so that each row is one bottom series.
# NOTE(review): 228 and 241 are hard-coded time lengths — confirm they match
# the last axis of data['Y'] and data['X'].
Y_bottom = data['Y'].reshape(-1, 228)
X_bottom = data['X']
X_bottom = X_bottom.reshape(-1, X_bottom.shape[2], 241)
# NOTE(review): this uses shape[1] as the number of static columns; if S is
# laid out as (n_groups, n_series, n_static), shape[-1] may be intended — confirm.
S_bottom = data['S'].reshape(-1, data['S'].shape[1])

print("data['scols']:=", data['scols'])
print("data['xcols']:=", data['xcols'])

# Plot 1: target against the static level and two level*seasonality products.
b_idx = 0
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates[:228],Y_bottom[b_idx,:], label='signal')
plt.axhline(y=S_bottom[b_idx,0], 
            label='level', color='black')
plt.axhline(y=S_bottom[b_idx,0]*S_bottom[b_idx,2], 
            label='level_[december]', color='green')
plt.axhline(y=S_bottom[b_idx,0]*S_bottom[b_idx,3],
            label='level_[january]', color='red')
plt.title('{} Visits'.format(data["unique_ids"][b_idx]))
plt.ylabel('Tourism Visits')
plt.xlabel('Date')
plt.legend(bbox_to_anchor=(1, 1), 
           loc='upper left', ncol=1)
plt.show()
plt.close()

# Plot 2: three rows of X over the full date range; the final time step of X
# is dropped (:-1) before plotting against `dates`.
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates,X_bottom[b_idx,-1,:-1], label='signal') # last feature row, labelled as the signal
plt.plot(dates,X_bottom[b_idx,4,:-1], label='y_[lag12]') # 'y_[lag12]'
plt.plot(dates,X_bottom[b_idx,0,:-1]*2000, label='month_[11]') # 'month_[11]', multiplied by 2000 (presumably to be visible at the signal's scale)
plt.title('{} Visits'.format(data["unique_ids"][b_idx]))
plt.ylabel('Tourism Visits')
plt.xlabel('Date')
plt.legend(bbox_to_anchor=(1, 1), 
           loc='upper left', ncol=1)
plt.show()
plt.close()
# Disabled debugging stop:
# assert 1<0
{% endraw %} {% raw %}
# Visual check of the aggregate-level temporal features X_agg for one entry.
print("data['scols_agg']:=", data['scols_agg'][:4])
print("data['xcols_agg']:=", data['xcols_agg'])

X_agg = data['X_agg']

a_idx = 0
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
# Feature rows 1-4 are drawn first, in the same order as before, so the
# colour cycle assigns each curve the colour it had previously.
agg_feature_labels = ['y_[total]', 'y_[state]', 'y_[zone]', 'y_[region]']
for feature_idx, feature_label in enumerate(agg_feature_labels, start=1):
    plt.plot(dates, X_agg[a_idx, feature_idx, :-1], label=feature_label)
# Feature row 0 is multiplied by 20_000 (presumably to be visible at the
# scale of the aggregate series).
plt.plot(dates, X_agg[a_idx, 0, :-1]*20_000, label='distance_month')
# A single legend call: the bare plt.legend() that used to precede this one
# was replaced by it immediately and had no visible effect.
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()
{% endraw %} {% raw %}
# Sparsity pattern of the transposed aggregate static matrix data['S_agg'].
plt.figure(num=1, figsize=(6, 10), dpi=80, facecolor='w')
plt.spy(data['S_agg'].T)
plt.show()
plt.close()
{% endraw %}

Checking Hierarchically linked data

{% raw %}
def plot_hierarchically_linked_data(data, b_idx, n_years=18, plot_file=None):
    """Plot one bottom series below the aggregate series it is linked to.

    One panel is drawn for each of the first four entries of
    ``data['hier_linked_idxs'][b_idx]`` ([Total, State, Zone, Region]) and a
    final panel overlays the bottom series as read from ``data['Y_hier']``
    and from ``data['Y']``. The two curves coincide when both arrays are
    correctly linked.

    Parameters
    ----------
    data : dict
        Must provide the keys 'hier_linked_idxs', 'Y_hier', 'hier_labels',
        'Y' and 'dates'.
    b_idx : int
        Index of the bottom series, ``0 <= b_idx < 304``.
    n_years : int
        Number of trailing 12-step years to display.
    plot_file : str, optional
        When given, the figure is also saved to this path.

    Raises
    ------
    ValueError
        If ``b_idx`` is outside the valid range.
    """
    # Bottom series are stored in Y_hier/hier_labels after the aggregate ones.
    n_agg = 251
    n_bottom = 304

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not 0 <= b_idx < n_bottom:
        raise ValueError(f'b_idx must be in [0, {n_bottom}), got {b_idx}')

    # Hierarchically linked target data and labels
    # [Total, State, Zone, Region, TotalP, StateP, ZoneP] [0, 1, 2, 3, 4, 5, 6]
    # [Total, State, Zone, Region] <--> [0, 1, 2, 3]
    linked_idxs = data['hier_linked_idxs'][b_idx][:4]
    Y_hier = data['Y_hier']
    hier_labels = data['hier_labels']

    # Base series and its label, b_idx \in {0,...,303}.
    # data['Y'] is indexed as [group, series within group, time] with four
    # series per group.
    Y_base = Y_hier[b_idx + n_agg]
    Y_base2 = data['Y'][b_idx // 4, b_idx % 4, :]
    label = hier_labels[b_idx + n_agg]

    x_plot = data['dates'][:-12]  # Skip future dates
    window = slice(-12 * n_years, None)

    fig, axs = plt.subplots(len(linked_idxs) + 1, 1, figsize=(12, 9))
    for ax, linked_idx in zip(axs, linked_idxs):
        # NOTE(review): the /1000 together with the "[In millions]" label
        # assumes the stored values are in thousands — confirm.
        ax.plot(x_plot[window], Y_hier[linked_idx, window] / 1000,
                color='black', linewidth=1.5)
        ax.set_ylabel(f'{hier_labels[linked_idx]} \nTourist Visits\n [In millions]',
                      fontsize=13)
        ax.set_xticks([])
        ax.tick_params(labelsize=13)

    # Checking that Y_bottom and Y_hier match. The last panel is addressed as
    # axs[-1] so the function still works with fewer than four linked series.
    base_ax = axs[-1]
    base_ax.plot(x_plot[window], Y_base[window] / 1000,
                 color='blue', linewidth=1.5, label='base (Y_hier)')
    base_ax.plot(x_plot[window], Y_base2[window] / 1000,
                 color='black', linewidth=1.5, label='base (Y)')

    base_ax.set_xlabel('Month', fontsize=13)
    base_ax.set_ylabel(f'{label} \nTourist Visits\n [In millions]', fontsize=13)
    base_ax.tick_params(labelsize=13)

    plt.suptitle(f'Hierarchically Linked Series for {label}', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    base_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
    if plot_file is not None:
        plt.savefig(plot_file, bbox_inches='tight')
    plt.show()
    plt.close(fig)
    
# Show the last five years of the first bottom series and its linked aggregates.
b_idx = 0
plot_hierarchically_linked_data(data=data, b_idx=b_idx, n_years=5, plot_file=None)
{% endraw %}

Checking Geographically linked data

{% raw %}
def plot_region_data(data, region_idx, plot_file=None, n_years=5):
    """Plot the four purpose-level series stored for one region.

    Draws one panel per travel purpose ('Hol', 'Vis', 'Bus', 'Oth') from
    ``data['Y'][region_idx]``, restricted to the trailing ``n_years`` years.

    Parameters
    ----------
    data : dict
        Must provide the keys 'dates', 'Y', 'hier_linked_idxs' and
        'hier_labels'.
    region_idx : int
        Index along the first axis of ``data['Y']``.
    plot_file : str, optional
        When given, the figure is also saved to this path.
    n_years : int
        Number of trailing 12-step years to display (default 5, the value
        that was previously hard-coded).
    """
    purposes = ['Hol', 'Vis', 'Bus', 'Oth']

    x_plot = data['dates'][:-12]
    Y_base = data['Y'][region_idx]

    # NOTE(review): 'hier_linked_idxs' is indexed by region_idx here, while
    # plot_hierarchically_linked_data indexes it by bottom-series index.
    # The two agree for index 0 only — confirm which is intended.
    hier_idxs = data['hier_linked_idxs'][region_idx]
    Y_agg_labels = data['hier_labels'][hier_idxs]
    region = Y_agg_labels[3]  # [country,state,zone,region,country_p,state_p,...]

    window = slice(-12 * n_years, None)
    fig, axs = plt.subplots(len(purposes), 1, figsize=(12, 9))

    for idx, purpose in enumerate(purposes):
        ax = axs[idx]
        ax.plot(x_plot[window], Y_base[idx, window] / 1000,
                color='black', linewidth=1.5, label=f'{idx}')
        ax.set_ylabel(f'{region}{purpose} \n Tourist Visits\n (In millions)', fontsize=13)
        # Only the bottom panel keeps its x ticks.
        if idx < len(purposes) - 1:
            ax.set_xticks([])
        ax.tick_params(labelsize=13)

    # The last series used to be plotted a second time here through the
    # leaked loop variable; that duplicate line has been dropped.
    axs[-1].set_xlabel('Month', fontsize=13)

    plt.suptitle(f'Purpose Grouped Series for {region}', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    if plot_file is not None:
        plt.savefig(plot_file, bbox_inches='tight')
    plt.show()
    # Release the figure, as plot_hierarchically_linked_data does.
    plt.close(fig)
    
# Show the purpose-level series of the first region.
plot_region_data(data, region_idx=0)
{% endraw %}

4. HierTimeseriesDataset Class

{% raw %}
{% endraw %} {% raw %}

class HierTimeseriesDataset[source]

HierTimeseriesDataset(*args, **kwds) :: Dataset

An abstract class representing a :class:`Dataset`.

All datasets that represent a map from keys to data samples should subclass it. All subclasses should overwrite :meth:`__getitem__`, supporting fetching a data sample for a given key. Subclasses could also optionally overwrite :meth:`__len__`, which is expected to return the size of the dataset by many :class:`~torch.utils.data.Sampler` implementations and the default options of :class:`~torch.utils.data.DataLoader`.

.. note:: :class:`~torch.utils.data.DataLoader` by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.

{% endraw %} {% raw %}
{% endraw %}

5. HierTimeseriesDataset Validation

{% raw %}
# directory = './data/hierarchical/TourismL'
# TourismL.preprocess_data(directory=directory, verbose=False)
# data = TourismL.load_process(directory=directory, verbose=False)
{% endraw %} {% raw %}
# Window parameters handed to both datasets.
# NOTE(review): T presumably is the 228 observed steps minus two 12-step
# hold-out blocks — confirm.
T = 228 - 12 - 12
H = 12

# Arguments that the train and validation datasets have in common.
shared_dataset_kwargs = dict(
    # Bottom data
    X=data['X'],
    S=data['S'],
    Y=data['Y'],
    xcols=data['xcols'],
    xcols_hist=data['xcols_hist'],
    xcols_futr=data['xcols_futr'],
    xcols_sample_mask=data['xcols_sample_mask'],
    xcols_available_mask=data['xcols_available_mask'],
    dates=data['dates'],
    # Aggregated data
    X_agg=data['X_agg'],
    S_agg=data['S_agg'],
    Y_agg=data['Y_agg'],
    xcols_agg=data['xcols_agg'],
    xcols_hist_agg=data['xcols_hist_agg'],
    xcols_futr_agg=data['xcols_futr_agg'],
)

# Generator parameters: the two datasets differ only in the window start T0
# and in the last-window mask requested for validation.
train_dataset = HierTimeseriesDataset(**shared_dataset_kwargs,
                                      T0=0, T=T, H=H)
valid_dataset = HierTimeseriesDataset(**shared_dataset_kwargs,
                                      T0=24, T=T, H=H,
                                      lastwindow_mask=True)

train_dataloader = DataLoader(train_dataset, batch_size=4,
                              shuffle=True, num_workers=0)
valid_dataloader = DataLoader(valid_dataset, batch_size=4,
                              shuffle=False, num_workers=0)

# Pull a single batch from each loader for the inspections below.
train_batch = next(iter(train_dataloader))
valid_batch = next(iter(valid_dataloader))

# Report the shape of every entry in the training batch.
for key, batch_entry in train_batch.items():
    print(f'{key}.shape', batch_entry.shape)
{% endraw %}

5.1 Examine HierTimeSeriesDataset aggregate batch

  1. Examine the hierarchically linked Y_agg series.
  2. Examine the alignment between Y_agg and X_agg_futr.
  3. Validate the S_agg dummies generated from H constraints matrix.
{% raw %}
# Inspect the aggregate-level entries of the first sample in the batch.
batch = valid_batch
S_agg, Y_agg, X_agg, F_agg = (
    batch[entry][0] for entry in ('S_agg', 'Y_agg', 'X_agg', 'F_agg')
)

#---------------- Hierarchical Y_agg ----------------#
# Rows 0-3 of Y_agg, one curve per row.
for level in range(4):
    plt.plot(Y_agg[level, :])
plt.title('Y_agg')
plt.show()
plt.close()

#---------------- X_agg_futr ----------------#
# First future feature, multiplied by 80000 and clipped below at zero, drawn
# over the last 36 steps of the first Y_agg row.
scaled_futr = F_agg[0, :] * 80000
F_agg = np.maximum(scaled_futr, np.zeros(scaled_futr.shape))
plt.plot(F_agg[-36:])
plt.plot(Y_agg[0, -36:])
plt.title('F_agg')
plt.show()
plt.close()

#---------------- S_agg ----------------#
# A leading axis is added so that spy receives a 2-D input.
plt.figure(num=1, figsize=(5, 5), dpi=80, facecolor='w')
plt.spy(S_agg[None, :])
plt.title('S_agg')
plt.show()
plt.close()
{% endraw %}

5.2 Examine HierTimeSeriesDataset bottom batch

  1. Examine the alignment between Y_bottom, S_bottom and X_bottom_futr.
  2. Validate batch Seq2Seq structure between X_bottom vs Y_bottom.
{% raw %}
# Inspect the bottom-level entries of the first sample in a batch.
# Swap in the two commented lines to inspect the training batch instead.
# batch = train_batch
# dates = train_dataset.dates
batch = valid_batch
dates = valid_dataset.dates
S_bottom = batch['S'][0]
Y_bottom = batch['Y'][0]
X_bottom = batch['X'][0]
F_bottom = batch['F'][0]
sample_mask = batch['sample_mask'][0]

#---------------- Y/S/X_bottom alignment ----------------#
# One panel per purpose series: target, static level lines, and two future
# dummy features multiplied by the corresponding level*seasonality product.
# NOTE(review): the date axis starts at index 1 while the series start at
# index 0 — presumably a deliberate one-step shift; confirm.
n_years = 3
fig, axs = plt.subplots(4,1, figsize=(12, 9))

for idx, purpose in enumerate(['Hol', 'Vis', 'Bus', 'Oth']):    
    axs[idx].plot(dates[1:n_years*12+1],Y_bottom[idx,:n_years*12], label='y')
    axs[idx].axhline(y=S_bottom[idx,0],
                label='level', color='black')
    axs[idx].axhline(y=S_bottom[idx,0]*S_bottom[idx,2], 
                label='level_[december]', color='green')
    axs[idx].axhline(y=S_bottom[idx,0]*S_bottom[idx,3],
                label='level_[january]', color='red')
    
    axs[idx].plot(dates[1:n_years*12+1],
                  F_bottom[idx,0,:n_years*12]*S_bottom[idx,0]*S_bottom[idx,2],
                 label='dummy_[december]')
    axs[idx].plot(dates[1:n_years*12+1],
                  F_bottom[idx,1,:n_years*12]*S_bottom[idx,0]*S_bottom[idx,3],
                 label='dummy_[january]')
    axs[idx].set_ylabel(purpose)

plt.suptitle('Y/S/X/F_bottom alignment ', y=0.91)
# The legend is attached to the current axes only.
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()

#---------------- Y/X_bottom Seq2Seq structure ----------------#
# Overlay the target, its lag features and the sample mask on a fixed window
# of the first series.
idx = 0
start = 155
end = 203
print('\n')
plt.figure(figsize=(12, 1.8))
# The target is shifted up by 100 (presumably so that it does not hide the
# lag-1 curve).
plt.plot(dates[start:end],     Y_bottom[idx,start:end]+100, label='y')
# The lag-1 feature is drawn against dates shifted back by one step.
plt.plot(dates[start-1:end-1], X_bottom[idx,0,start:end], label='y_[lag1]')
# The future feature extends 12 steps past `end`.
plt.plot(dates[start:end+12],  F_bottom[idx,2,start:end+12], label='y_[lag12]')
# The mask is multiplied by the last target value to bring it to the data scale.
plt.plot(dates[start:end],     sample_mask[idx,start:end]*Y_bottom[idx,-1], label='sample_mask')
# NOTE(review): the label says "In millions" but the values are not rescaled
# in this cell — confirm the units.
plt.ylabel('Tourist Visits\n [In millions]')
plt.title('Y/X_bottom (Seq2Seq)')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()
{% endraw %}