---
title: TourismLarge dataset
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/data_datasets__tourismL.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

1. Auxiliary Functions

{% raw %}

class CodeTimer[source]

CodeTimer(name=None, verbose=True)
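
A minimal usage sketch, assuming CodeTimer is a context manager that prints the elapsed wall-clock time of its block on exit (consistent with the "Code block '...' took:" lines in the validation output below); the timed block here is illustrative only:

# Illustrative only: time an arbitrary block of code.
with CodeTimer('Example block', verbose=True):
    total = sum(range(1_000_000))
# Expected to print something like: Code block 'Example block' took: 0.0xxxx seconds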

{% endraw %} {% raw %}
{% endraw %} {% raw %}

one_hot_encoding[source]

one_hot_encoding(df, index_col)
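
A sketch of the intended use, assuming one_hot_encoding expands the categorical columns of df into dummy indicator columns while keeping index_col untouched; the toy frame and column names are illustrative and do not reflect the real TourismL schema:

import pandas as pd

# Illustrative toy frame; 'region' plays the role of the identifier column.
toy = pd.DataFrame({'region': ['A', 'B', 'C'],
                    'purpose': ['Hol', 'Vis', 'Bus']})

# Assumed to return 'region' plus dummy columns for 'purpose'
# (e.g. purpose_[Hol], purpose_[Vis], purpose_[Bus]; exact naming may differ).
encoded = one_hot_encoding(toy, index_col='region')
print(encoded.columns)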

{% endraw %} {% raw %}

nonzero_indexes_by_row[source]

nonzero_indexes_by_row(M)
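
A small sketch of the expected behavior, assuming the function returns, for each row of M, the column indices of its nonzero entries (useful for reading sparse constraint matrices such as the hierarchical aggregation matrix H):

import numpy as np

M = np.array([[1, 0, 1],
              [0, 0, 1]])

# Assumed to return one index collection per row,
# e.g. something equivalent to [[0, 2], [2]].
print(nonzero_indexes_by_row(M))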

{% endraw %} {% raw %}
{% endraw %}

2. TourismL Class

{% raw %}

class TourismL[source]

TourismL(S:DataFrame, X:DataFrame, Y:DataFrame, idx_categorical_static:Optional[List[T]]=None, group:Union[str, List[str]]=None) :: TimeSeriesDataclass

TourismL(S: pandas.core.frame.DataFrame, X: pandas.core.frame.DataFrame, Y: pandas.core.frame.DataFrame, idx_categorical_static: Union[List, NoneType] = None, group: Union[str, List[str]] = None)

{% endraw %} {% raw %}
{% endraw %}

3. TourismL Validation

3.1 Load and Process data

{% raw %}
directory = './data/hierarchical/TourismL'

TourismL.preprocess_data(directory=directory, verbose=False)
data = TourismL.load_process(directory=directory, verbose=True)
Code block 'Reading data   ' took:	0.26691 seconds
Code block 'Process temporal_agg' took:	0.01913 seconds
Code block 'Process static_agg' took:	0.00484 seconds
Code block 'Process temporal_bottom' took:	0.00901 seconds
Code block 'Process static_bottom' took:	0.00104 seconds
Code block 'Hier constraints' took:	2.09411 seconds


Total days 240, train 216 validation 12, test 12
Whole dates: 		 [1998-01-01 00:00:00, 2017-12-01 00:00:00]
Validation dates: 	 [2016-01-01 00:00:00,2016-12-01 00:00:00]
Test dates: 		 [2017-01-01 00:00:00, 2017-12-01 00:00:00]


                                   BOTTOM
S.shape (n_regions,n_purpose,n_features):        	(76, 4, 4)
X.shape (n_regions,n_purpose,n_features,n_time): 	(76, 4, 6, 241)
Y.shape (n_regions,n_purpose,n_time):            	(76, 4, 228)
scols (4) Index(['level', 'spread', 'level_[december]', 'level_[january]'], dtype='object')
xcols (6) Index(['month_[11]', 'month_[12]', 'available_mask', 'sample_mask',
       'y_[lag12]', 'y'],
      dtype='object')
xcols_hist (1) ['y']
xcols_futr (3) ['month_[11]', 'month_[12]', 'y_[lag12]']
                                   AGGREGATE
S_agg.shape (n_regions,n_features):           			(76, 35)
X_agg.shape (n_regions,n_features,n_time):    			(76, 5, 241)
Y_agg.shape (n_regions,n_features,n_time):    			(76, 4, 228)
scols_agg (35) Index(['country_[Australia]', 'state_[A]', 'state_[B]', 'state_[C]'], dtype='object')
xcols_agg (5) Index(['distance_month', 'y_[total]', 'y_[state]', 'y_[zone]', 'y_[region]'], dtype='object')
xcols_hist_agg (4) ['y_[total]', 'y_[state]', 'y_[zone]', 'y_[region]']
xcols_futr_agg (1) ['distance_month']
                                   HIERARCHICAL
Y_hier.shape (n_regions,n_time):   				(555, 228)
overall.shape 		 (555, 228)
country.shape 		 (1, 228)
states.shape  		 (7, 228)
zones.shape   		 (27, 228)
regions.shape 		 (76, 228)
country_purpose.shape 	 (4, 228)
states_purpose.shape  	 (28, 228)
zones_purpose.shape   	 (108, 228)
regions_purpose.shape 	 (304, 228)
Code block 'Final processing' took:	0.01266 seconds
{% endraw %}

3.2 Validate data with plots.

Checking the hierarchical aggregation matrix

{% raw %}
plt.figure(num=1, figsize=(6, 15), dpi=80, facecolor='w')
plt.spy(data['H'])
plt.show()
plt.close()
{% endraw %}
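
Beyond inspecting the sparsity pattern, a quick numeric sanity check can be added here. Assuming data['H'] is the (555, 304) summing matrix that maps the stacked bottom-level series to the hierarchical series (an assumption consistent with the shapes reported above), the product H @ Y_bottom should reproduce Y_hier:

{% raw %}
# Hedged sanity check: if data['H'] is the (555, 304) aggregation matrix,
# multiplying it by the stacked bottom series should recover Y_hier.
Y_bottom_flat = data['Y'].reshape(-1, data['Y'].shape[-1])  # (304, 228)
Y_hier_check  = data['H'] @ Y_bottom_flat                   # (555, 228) if the assumption holds
print('max abs difference:', np.abs(Y_hier_check - data['Y_hier']).max())
{% endraw %}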

Checking the pre-estimated levels and seasonal factors in S_bottom

{% raw %}
dates = data['dates']
scols_bottom = data['scols']

# n_groups,n_series,n_features,n_time
Y_bottom = data['Y'].reshape(-1, 228)
X_bottom = data['X']
X_bottom = X_bottom.reshape(-1, X_bottom.shape[2], 241)
S_bottom = data['S'].reshape(-1, data['S'].shape[1])

print("data['scols']:=", data['scols'])
print("data['xcols']:=", data['xcols'])

b_idx = 0
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates[:228],Y_bottom[b_idx,:], label='signal')
plt.axhline(y=S_bottom[b_idx,0], 
            label='level', color='black')
plt.axhline(y=S_bottom[b_idx,0]*S_bottom[b_idx,2], 
            label='level_[december]', color='green')
plt.axhline(y=S_bottom[b_idx,0]*S_bottom[b_idx,3],
            label='level_[january]', color='red')
plt.title('{} Visits'.format(data["unique_ids"][b_idx]))
plt.ylabel('Tourism Visits')
plt.xlabel('Date')
plt.legend(bbox_to_anchor=(1, 1), 
           loc='upper left', ncol=1)
plt.show()
plt.close()

plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates,X_bottom[b_idx,-1,:-1], label='signal') # 'y'
plt.plot(dates,X_bottom[b_idx,4,:-1], label='y_[lag12]') # 'y_[lag12]'
plt.plot(dates,X_bottom[b_idx,0,:-1]*2000, label='month_[11]') # 'month_[11]'
plt.title('{} Visits'.format(data["unique_ids"][b_idx]))
plt.ylabel('Tourism Visits')
plt.xlabel('Date')
plt.legend(bbox_to_anchor=(1, 1), 
           loc='upper left', ncol=1)
plt.show()
plt.close()
data['scols']:= Index(['level', 'spread', 'level_[december]', 'level_[january]'], dtype='object')
data['xcols']:= Index(['month_[11]', 'month_[12]', 'available_mask', 'sample_mask',
       'y_[lag12]', 'y'],
      dtype='object')
{% endraw %} {% raw %}
print("data['scols_agg']:=", data['scols_agg'][:4])
print("data['xcols_agg']:=", data['xcols_agg'])

X_agg = data['X_agg']

a_idx = 0
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates,X_agg[a_idx,1,:-1], label='y_[total]')
plt.plot(dates,X_agg[a_idx,2,:-1], label='y_[state]')
plt.plot(dates,X_agg[a_idx,3,:-1], label='y_[zone]')
plt.plot(dates,X_agg[a_idx,4,:-1], label='y_[region]')
plt.plot(dates,X_agg[a_idx,0,:-1]*20_000, label='distance_month')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()
data['scols_agg']:= Index(['country_[Australia]', 'state_[A]', 'state_[B]', 'state_[C]'], dtype='object')
data['xcols_agg']:= Index(['distance_month', 'y_[total]', 'y_[state]', 'y_[zone]', 'y_[region]'], dtype='object')
{% endraw %} {% raw %}
plt.figure(num=1, figsize=(6, 10), dpi=80, facecolor='w')
plt.spy(data['S_agg'].T)
plt.show()
plt.close()
{% endraw %}

Checking Hierarchically linked data

{% raw %}
def plot_hierarchically_linked_data(data, b_idx, n_years=18, plot_file=None):
    
    # These plots validate S/X/Y_bottom
    # We check the synchronization of features with Y_bottom
    # We do the same for Y_agg, to check correct linkage
    assert b_idx < 304 and b_idx>=0
    
    # Hierarchically linked target data and labels
    # [Total, State, Zone, Region, TotalP, StateP, ZoneP] [0, 1, 2, 3, 4, 5, 6]
    # [Total, State, Zone, Region] <--> [0, 1, 2, 3]
    linked_idxs = data['hier_linked_idxs'][b_idx][:4]
    Y_hier = data['Y_hier']
    hier_labels = data['hier_labels']
    
    # Base series and its label; n_agg=251, b_idx in {0,...,303}
    Y_base  = data['Y_hier'][b_idx + 251]
    Y_base2 = data['Y'][b_idx//4,b_idx % 4,:]
    label   = data['hier_labels'][b_idx + 251]
    
    x_plot = data['dates'][:-12] # Skip future dates
        
    fig, axs = plt.subplots(len(linked_idxs)+1, 1, figsize=(12, 9))
    for idx, linked_idx in enumerate(linked_idxs):
        axs[idx].plot(x_plot[-12*n_years:], Y_hier[linked_idx,-12*n_years:]/1000,
                      color = 'black', linewidth=1.5)
        axs[idx].set_ylabel(f'{hier_labels[linked_idx]} \nTourist Visits\n [In millions]', 
                            fontsize=13)
        axs[idx].set_xticks([])
        axs[idx].tick_params(labelsize=13)
    
    # Checking that Y_bottom and Y_hier match
    axs[4].plot(x_plot[-12*n_years:], Y_base[-12*n_years:]/1000,
                color = 'blue', linewidth=1.5, label='base (Y_hier)')
    axs[4].plot(x_plot[-12*n_years:], Y_base2[-12*n_years:]/1000,
                color = 'black', linewidth=1.5, label='base (Y_bottom)')
    
    axs[4].set_xlabel('Month', fontsize=13)
    axs[4].set_ylabel(f'{label} \nTourist Visits\n [In millions]', fontsize=13)
    axs[4].tick_params(labelsize=13)
    
    plt.suptitle(f'Hierarchically Linked Series for {label}', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
    if plot_file is not None:
        plt.savefig(plot_file, bbox_inches='tight')
    plt.show()
    plt.close()
    
b_idx = 0
plot_hierarchically_linked_data(data=data, b_idx=b_idx, n_years=5, plot_file=None)
{% endraw %}

Checking Geographically linked data

{% raw %}
def plot_region_data(data, region_idx, plot_file=None):
    x_plot = data['dates'][:-12]
    Y_agg  = data['Y_agg'][region_idx] # [country,state,zone,region]
    Y_base = data['Y'][region_idx]
    
    hier_idxs = data['hier_linked_idxs'][region_idx]
    Y_agg_labels = data['hier_labels'][hier_idxs]
    region = Y_agg_labels[3] #[country,state,zone,region,country_p,state_p,...]
    
    n_years = 5
    fig, axs = plt.subplots(4,1, figsize=(12, 9))

    for idx, purpose in enumerate(['Hol', 'Vis', 'Bus', 'Oth']):
        axs[idx].plot(x_plot[-12*n_years:], Y_base[idx,-12*n_years:]/1000, 
                      color = 'black', linewidth=1.5, label=f'{idx}')
        #axs[idx].set_xlabel('Month', fontsize=13)
        axs[idx].set_ylabel(f'{region}{purpose} \n Tourist Visits\n (In millions)', fontsize=13)
        if idx<3:
            axs[idx].set_xticks([])
        axs[idx].tick_params(labelsize=13)  
    
    axs[3].set_xlabel('Month', fontsize=13)
    axs[3].tick_params(labelsize=13)
    
    plt.suptitle(f'Purpose Grouped Series for {region}', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    if plot_file is not None:
        plt.savefig(plot_file, bbox_inches='tight')
    plt.show()
    
plot_region_data(data, region_idx=0)
{% endraw %}

4. HierTimeseriesDataset Class

{% raw %}
{% endraw %} {% raw %}

class HierTimeseriesDataset[source]

HierTimeseriesDataset(*args, **kwds) :: Dataset

An abstract class representing a Dataset.

All datasets that represent a map from keys to data samples should subclass it. All subclasses should overwrite __getitem__, supporting fetching a data sample for a given key. Subclasses could also optionally overwrite __len__, which is expected to return the size of the dataset by many torch.utils.data.Sampler implementations and the default options of torch.utils.data.DataLoader.

Note: torch.utils.data.DataLoader by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.

{% endraw %} {% raw %}
{% endraw %}
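
As a reminder of the map-style protocol described above, a minimal generic Dataset subclass (not TourismL-specific, shown only to illustrate the __getitem__/__len__ contract) looks like this:

{% raw %}
import torch
from torch.utils.data import Dataset, DataLoader

class ToyMapDataset(Dataset):
    """Minimal map-style dataset: a fixed tensor indexed by integer keys."""
    def __init__(self, n=8):
        self.data = torch.arange(n, dtype=torch.float32)

    def __getitem__(self, idx):
        # Fetch one sample for an integer key.
        return self.data[idx]

    def __len__(self):
        # Used by samplers and the default DataLoader options.
        return len(self.data)

loader = DataLoader(ToyMapDataset(), batch_size=4, shuffle=False)
print(next(iter(loader)))  # tensor([0., 1., 2., 3.])
{% endraw %}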

5. HierTimeseriesDataset Validation

{% raw %}
# directory = './data/hierarchical/TourismL'
# TourismL.preprocess_data(directory=directory, verbose=False)
# data = TourismL.load_process(directory=directory, verbose=False)
{% endraw %} {% raw %}
T = 228-12-12
H = 12

train_dataset = HierTimeseriesDataset(# Bottom data
                                      X=data['X'], 
                                      S=data['S'], 
                                      Y=data['Y'],
                                      xcols=data['xcols'],
                                      xcols_hist=data['xcols_hist'],
                                      xcols_futr=data['xcols_futr'],
                                      xcols_sample_mask=data['xcols_sample_mask'],
                                      xcols_available_mask=data['xcols_available_mask'],
                                      dates = data['dates'],
                                      # Aggregated data
                                      X_agg=data['X_agg'], 
                                      S_agg=data['S_agg'], 
                                      Y_agg=data['Y_agg'],
                                      xcols_agg=data['xcols_agg'],
                                      xcols_hist_agg=data['xcols_hist_agg'],
                                      xcols_futr_agg=data['xcols_futr_agg'],
                                      # Generator parameters
                                      T0=0,T=T,H=H)

valid_dataset = HierTimeseriesDataset(# Bottom data
                                      X=data['X'], 
                                      S=data['S'], 
                                      Y=data['Y'],
                                      xcols=data['xcols'],
                                      xcols_hist=data['xcols_hist'],
                                      xcols_futr=data['xcols_futr'],
                                      xcols_sample_mask=data['xcols_sample_mask'],
                                      xcols_available_mask=data['xcols_available_mask'], 
                                      dates = data['dates'],
                                      # Aggregated data
                                      X_agg=data['X_agg'], 
                                      S_agg=data['S_agg'], 
                                      Y_agg=data['Y_agg'],
                                      xcols_agg=data['xcols_agg'],
                                      xcols_hist_agg=data['xcols_hist_agg'],
                                      xcols_futr_agg=data['xcols_futr_agg'],
                                      # Generator parameters
                                      T0=24,T=T,H=H,
                                      lastwindow_mask=True)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=1)
valid_dataloader = DataLoader(valid_dataset, batch_size=4, shuffle=False, num_workers=1)

train_batch = next(iter(train_dataloader))
valid_batch = next(iter(valid_dataloader))

for key in train_batch.keys():
    print(f'{key}.shape', train_batch[key].shape)
Y.shape torch.Size([4, 4, 203])
S.shape torch.Size([4, 4, 4])
X.shape torch.Size([4, 4, 1, 204])
F.shape torch.Size([4, 4, 3, 216])
sample_mask.shape torch.Size([4, 4, 203])
available_mask.shape torch.Size([4, 4, 203])
Y_agg.shape torch.Size([4, 4, 203])
S_agg.shape torch.Size([4, 35])
X_agg.shape torch.Size([4, 4, 204])
F_agg.shape torch.Size([4, 1, 216])
{% endraw %}

5.1 Examine HierTimeseriesDataset aggregate batch

  1. Examine the hierarchically linked Y_agg series.
  2. Examine the alignment between Y_agg and X_agg_futr.
  3. Validate the S_agg dummies generated from the H constraints matrix.
{% raw %}
batch = valid_batch
S_agg = batch['S_agg'][0]
Y_agg = batch['Y_agg'][0]
X_agg = batch['X_agg'][0]
F_agg = batch['F_agg'][0]

#---------------- Hierarchical Y_agg ----------------#
plt.plot(Y_agg[0,:])
plt.plot(Y_agg[1,:])
plt.plot(Y_agg[2,:])
plt.plot(Y_agg[3,:])
plt.title('Y_agg')
plt.show()
plt.close()

#---------------- X_agg_futr ----------------#
F_agg = np.maximum(F_agg[0,:]*80000,
                   np.zeros(F_agg[0,:].shape))
plt.plot(F_agg[-36:])
plt.plot(Y_agg[0,-36:])
plt.title('F_agg')
plt.show()
plt.close()

#---------------- S_agg ----------------#
plt.figure(num=1, figsize=(5, 5), dpi=80, facecolor='w')
plt.spy(S_agg[None,:])
plt.title('S_agg')
plt.show()
plt.close()
{% endraw %}

5.2 Examine HierTimeseriesDataset bottom batch

  1. Examine the alignment between Y_bottom, S_bottom and X_bottom_futr.
  2. Validate the batch Seq2Seq structure between X_bottom and Y_bottom.
{% raw %}
# batch = train_batch
# dates = train_dataset.dates
batch = valid_batch
dates = valid_dataset.dates
S_bottom = batch['S'][0]
Y_bottom = batch['Y'][0]
X_bottom = batch['X'][0]
F_bottom = batch['F'][0]
sample_mask = batch['sample_mask'][0]

#---------------- Y/S/X_bottom alignment ----------------#
n_years = 3
fig, axs = plt.subplots(4,1, figsize=(12, 9))

for idx, purpose in enumerate(['Hol', 'Vis', 'Bus', 'Oth']):    
    axs[idx].plot(dates[1:n_years*12+1],Y_bottom[idx,:n_years*12], label='y')
    axs[idx].axhline(y=S_bottom[idx,0],
                label='level', color='black')
    axs[idx].axhline(y=S_bottom[idx,0]*S_bottom[idx,2], 
                label='level_[december]', color='green')
    axs[idx].axhline(y=S_bottom[idx,0]*S_bottom[idx,3],
                label='level_[january]', color='red')
    
    axs[idx].plot(dates[1:n_years*12+1],
                  F_bottom[idx,0,:n_years*12]*S_bottom[idx,0]*S_bottom[idx,2],
                 label='dummy_[december]')
    axs[idx].plot(dates[1:n_years*12+1],
                  F_bottom[idx,1,:n_years*12]*S_bottom[idx,0]*S_bottom[idx,3],
                 label='dummy_[january]')
    axs[idx].set_ylabel(purpose)

plt.suptitle('Y/S/X/F_bottom alignment ', y=0.91)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()

#---------------- Y/X_bottom Seq2Seq structure ----------------#
idx = 0
start = 155
end = 203
print('\n')
plt.figure(figsize=(12, 1.8))
plt.plot(dates[start:end],     Y_bottom[idx,start:end]+100, label='y')
plt.plot(dates[start-1:end-1], X_bottom[idx,0,start:end], label='y_[lag1]')
plt.plot(dates[start:end+12],  F_bottom[idx,2,start:end+12], label='y_[lag12]')
plt.plot(dates[start:end],     sample_mask[idx,start:end]*Y_bottom[idx,-1], label='sample_mask')
plt.ylabel('Tourist Visits\n [In millions]')
plt.title('Y/X_bottom (Seq2Seq)')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()

{% endraw %}