---
title: TourismLarge dataset.
keywords: fastai
sidebar: home_sidebar
nb_path: "nbs/data_datasets__tourismL.ipynb"
---
# Run the TourismL preprocessing on `directory`, then load the processed
# dataset into `data`, which the rest of this file indexes by string key.
directory = './data/hierarchical/TourismL'
TourismL.preprocess_data(directory=directory, verbose=False)
data = TourismL.load_process(directory=directory, verbose=True)

# Sparsity pattern of the matrix stored under data['H'].
plt.figure(
    num=1,
    figsize=(6, 15),
    dpi=80,
    facecolor='w',
)
plt.spy(data['H'])
plt.show()
plt.close()
# Flatten the grouped bottom-level arrays so that each row is one series.
dates = data['dates']
scols_bottom = data['scols']

# Assumed layouts (TODO confirm against TourismL.load_process):
#   data['Y']: [n_groups, n_series, n_time]
#   data['X']: [n_groups, n_series, n_features, n_time]
#   data['S']: [n_groups, n_series, n_static]
# The trailing dimension of every array is read from the array itself
# instead of being hard-coded, so the reshape cannot silently scramble
# rows when the number of time steps or static columns changes.
Y_bottom = data['Y'].reshape(-1, data['Y'].shape[-1])
X_bottom = data['X']
X_bottom = X_bottom.reshape(-1, X_bottom.shape[2], X_bottom.shape[-1])
# The static columns live on the LAST axis; using axis 1 only works when
# the number of series per group happens to equal the number of columns.
S_bottom = data['S'].reshape(-1, data['S'].shape[-1])

print("data['scols']:=", data['scols'])
print("data['xcols']:=", data['xcols'])
# Plot one flattened bottom-level series together with horizontal reference
# lines built from its static columns 0, 2 and 3.
b_idx = 0
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates[:228], Y_bottom[b_idx, :], label='signal')

plt.axhline(
    y=S_bottom[b_idx, 0],
    label='level', color='black',
)
plt.axhline(
    y=S_bottom[b_idx, 0] * S_bottom[b_idx, 2],
    label='level_[december]', color='green',
)
plt.axhline(
    y=S_bottom[b_idx, 0] * S_bottom[b_idx, 3],
    label='level_[january]', color='red',
)

plt.title(f'{data["unique_ids"][b_idx]} Visits')
plt.xlabel('Date')
plt.ylabel('Tourism Visits')
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()
# Plot three feature rows of the same bottom-level series against `dates`.
# The last time step of every feature row is dropped via `:-1`.
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
plt.plot(dates, X_bottom[b_idx, -1, :-1], label='signal')      # last feature row
plt.plot(dates, X_bottom[b_idx, 4, :-1], label='y_[lag12]')    # feature row 4
# Feature row 0 is scaled by 2000 so it is visible on the same axis.
plt.plot(dates, X_bottom[b_idx, 0, :-1] * 2000, label='month_[11]')

plt.title(f'{data["unique_ids"][b_idx]} Visits')
plt.xlabel('Date')
plt.ylabel('Tourism Visits')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()
# Inspect the aggregate-level column names and features.
print("data['scols_agg']:=", data['scols_agg'][:4])
print("data['xcols_agg']:=", data['xcols_agg'])

X_agg = data['X_agg']
a_idx = 0
plt.figure(num=1, figsize=(10, 6), dpi=80, facecolor='w')
# Feature rows 1..4 are plotted as-is; the last time step is dropped so the
# rows line up with `dates`.
for feature_idx, feature_label in enumerate(
        ['y_[total]', 'y_[state]', 'y_[zone]', 'y_[region]'], start=1):
    plt.plot(dates, X_agg[a_idx, feature_idx, :-1], label=feature_label)
# Feature row 0 is scaled by 20_000 so it is visible next to the others.
plt.plot(dates, X_agg[a_idx, 0, :-1] * 20_000, label='distance_month')
# A single legend call: a bare plt.legend() before this one would simply be
# replaced by it.
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()

# Sparsity pattern of the transposed matrix stored under data['S_agg'].
plt.figure(num=1, figsize=(6, 10), dpi=80, facecolor='w')
plt.spy(data['S_agg'].T)
plt.show()
plt.close()
def plot_hierarchically_linked_data(data, b_idx, n_years=18, plot_file=None):
    """Plot a bottom-level series under the aggregate series it is linked to.

    The figure is a visual check that the hierarchy linkage is correct and
    that the bottom row of ``data['Y_hier']`` matches the same series read
    from ``data['Y']``: the two curves in the last panel should overlap.

    Parameters
    ----------
    data : mapping
        Must provide the keys ``'hier_linked_idxs'``, ``'Y_hier'``,
        ``'hier_labels'``, ``'Y'`` and ``'dates'``.
    b_idx : int
        Index of the bottom-level series, ``0 <= b_idx < 304``.
    n_years : int, default 18
        Number of trailing years (12 steps each) to display.
    plot_file : str or None, default None
        When given, the figure is also saved to this path.
    """
    # NOTE(review): these sizes are hard-coded for this dataset; confirm
    # them against the loader before reusing the function elsewhere.
    n_agg = 251       # offset of the first bottom row inside Y_hier / hier_labels
    n_bottom = 304    # number of bottom-level series
    n_purposes = 4    # series per group along axis 1 of data['Y']
    assert 0 <= b_idx < n_bottom, \
        f'b_idx must be in [0, {n_bottom}), got {b_idx}'

    # Hierarchically linked target data and labels.
    # [Total, State, Zone, Region, TotalP, StateP, ZoneP] [0, 1, 2, 3, 4, 5, 6]
    # Only [Total, State, Zone, Region] <--> [0, 1, 2, 3] are plotted.
    linked_idxs = data['hier_linked_idxs'][b_idx][:4]
    Y_hier = data['Y_hier']
    hier_labels = data['hier_labels']

    # The same bottom series, read through two different containers.
    Y_base_hier = Y_hier[b_idx + n_agg]
    Y_base_bottom = data['Y'][b_idx // n_purposes, b_idx % n_purposes, :]
    label = hier_labels[b_idx + n_agg]

    x_plot = data['dates'][:-12]  # Skip future dates
    window = slice(-12 * n_years, None)

    # squeeze=False keeps `axs` an array even if there are no linked series.
    fig, axs = plt.subplots(len(linked_idxs) + 1, 1, figsize=(12, 9),
                            squeeze=False)
    axs = axs[:, 0]

    for ax, linked_idx in zip(axs, linked_idxs):
        ax.plot(x_plot[window], Y_hier[linked_idx, window] / 1000,
                color='black', linewidth=1.5)
        ax.set_ylabel(
            f'{hier_labels[linked_idx]} \nTourist Visits\n [In millions]',
            fontsize=13)
        ax.set_xticks([])
        ax.tick_params(labelsize=13)

    # Checking that Y_bottom and Y_hier match. The last panel is addressed
    # as axs[-1] so it follows the number of linked series actually plotted.
    base_ax = axs[-1]
    base_ax.plot(x_plot[window], Y_base_hier[window] / 1000,
                 color='blue', linewidth=1.5, label='Y_hier')
    base_ax.plot(x_plot[window], Y_base_bottom[window] / 1000,
                 color='black', linewidth=1.5, label='Y')
    base_ax.set_xlabel('Month', fontsize=13)
    base_ax.set_ylabel(f'{label} \nTourist Visits\n [In millions]',
                       fontsize=13)
    base_ax.tick_params(labelsize=13)

    plt.suptitle(f'Hierarchically Linked Series for {label}', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    # Distinct labels above let the legend tell the two sources apart.
    base_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)

    if plot_file is not None:
        fig.savefig(plot_file, bbox_inches='tight')
    plt.show()
    plt.close(fig)
# Check the hierarchy linkage for the first bottom-level series, showing
# only the last five years.
b_idx = 0
plot_hierarchically_linked_data(
    data=data,
    b_idx=b_idx,
    n_years=5,
    plot_file=None,
)
def plot_region_data(data, region_idx, plot_file=None, n_years=5):
    """Plot the four purpose series of one group of ``data['Y']``.

    Parameters
    ----------
    data : mapping
        Must provide the keys ``'dates'``, ``'Y'``, ``'hier_linked_idxs'``
        and ``'hier_labels'``.
    region_idx : int
        Index along the first axis of ``data['Y']``.
    plot_file : str or None, default None
        When given, the figure is also saved to this path.
    n_years : int, default 5
        Number of trailing years (12 steps each) to display.
    """
    purposes = ['Hol', 'Vis', 'Bus', 'Oth']

    x_plot = data['dates'][:-12]  # Skip future dates
    Y_base = data['Y'][region_idx]

    # NOTE(review): plot_hierarchically_linked_data indexes
    # data['hier_linked_idxs'] by bottom-series index, with bottom series
    # b_idx living at data['Y'][b_idx // 4, b_idx % 4]. The linkage row for
    # this group is therefore the one of its first bottom series, not row
    # `region_idx` itself. Confirm against the loader.
    first_bottom_idx = region_idx * len(purposes)
    hier_idxs = data['hier_linked_idxs'][first_bottom_idx]
    Y_agg_labels = data['hier_labels'][hier_idxs]
    region = Y_agg_labels[3]  # [country,state,zone,region,country_p,state_p,...]

    window = slice(-12 * n_years, None)
    fig, axs = plt.subplots(4, 1, figsize=(12, 9))
    for idx, purpose in enumerate(purposes):
        # Each series is drawn exactly once per panel.
        axs[idx].plot(x_plot[window], Y_base[idx, window] / 1000,
                      color='black', linewidth=1.5, label=f'{idx}')
        axs[idx].set_ylabel(
            f'{region}{purpose} \n Tourist Visits\n (In millions)',
            fontsize=13)
        axs[idx].tick_params(labelsize=13)
        if idx < 3:
            # Only the bottom panel keeps its x ticks.
            axs[idx].set_xticks([])
    axs[3].set_xlabel('Month', fontsize=13)

    plt.suptitle(f'Purpose Grouped Series for {region}', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)

    if plot_file is not None:
        fig.savefig(plot_file, bbox_inches='tight')
    plt.show()
    # Release the figure, as the sibling plotting function does.
    plt.close(fig)
# Purpose-grouped series for the first group of data['Y'].
plot_region_data(data=data, region_idx=0)
# This section reuses the `data` mapping loaded at the top of the file; run
# TourismL.preprocess_data / TourismL.load_process on
# './data/hierarchical/TourismL' first if executing it on its own.
T = 228 - 12 - 12
H = 12

# Arguments shared by the train and validation datasets.
shared_dataset_kwargs = dict(
    # Bottom data
    X=data['X'],
    S=data['S'],
    Y=data['Y'],
    xcols=data['xcols'],
    xcols_hist=data['xcols_hist'],
    xcols_futr=data['xcols_futr'],
    xcols_sample_mask=data['xcols_sample_mask'],
    xcols_available_mask=data['xcols_available_mask'],
    dates=data['dates'],
    # Aggregated data
    X_agg=data['X_agg'],
    S_agg=data['S_agg'],
    Y_agg=data['Y_agg'],
    xcols_agg=data['xcols_agg'],
    xcols_hist_agg=data['xcols_hist_agg'],
    xcols_futr_agg=data['xcols_futr_agg'],
)

# Generator parameters: the two datasets differ only in T0 and in the
# validation-only `lastwindow_mask` flag.
train_dataset = HierTimeseriesDataset(**shared_dataset_kwargs,
                                      T0=0, T=T, H=H)
valid_dataset = HierTimeseriesDataset(**shared_dataset_kwargs,
                                      T0=24, T=T, H=H,
                                      lastwindow_mask=True)
# Batches of four items; only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, num_workers=0)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=4, shuffle=False, num_workers=0)

# Pull one batch from each loader for the inspections below.
train_batch = next(iter(train_dataloader))
valid_batch = next(iter(valid_dataloader))

# Report the shape of every entry in the training batch.
for key, value in train_batch.items():
    print(f'{key}.shape', value.shape)
# Take the first item of the validation batch for every aggregate entry.
batch = valid_batch
S_agg, Y_agg, X_agg, F_agg = (
    batch[key][0] for key in ('S_agg', 'Y_agg', 'X_agg', 'F_agg')
)

#---------------- Hierarchical Y_agg ----------------#
# One curve per row 0..3 of Y_agg.
for row in range(4):
    plt.plot(Y_agg[row, :])
plt.title('Y_agg')
plt.show()
plt.close()

#---------------- X_agg_futr ----------------#
# Scale row 0 of F_agg by 80000 and clip it at zero, then compare its last
# 36 steps with the last 36 steps of row 0 of Y_agg.
futr_row = F_agg[0, :]
F_agg = np.maximum(futr_row * 80000, np.zeros(futr_row.shape))
plt.plot(F_agg[-36:])
plt.plot(Y_agg[0, -36:])
plt.title('F_agg')
plt.show()
plt.close()

#---------------- S_agg ----------------#
# S_agg gets a leading axis before being passed to the sparsity plot.
plt.figure(num=1, figsize=(5, 5), dpi=80, facecolor='w')
plt.spy(S_agg[None, :])
plt.title('S_agg')
plt.show()
plt.close()
# Inspect the validation split; swap in `train_batch` and
# `train_dataset.dates` here to look at the training split instead.
batch = valid_batch
dates = valid_dataset.dates

# First item of the batch for every bottom-level entry.
S_bottom, Y_bottom, X_bottom, F_bottom = (
    batch[key][0] for key in ('S', 'Y', 'X', 'F')
)
sample_mask = batch['sample_mask'][0]

#---------------- Y/S/X_bottom alignment ----------------#
n_years = 3
n_months = n_years * 12
# The x axis starts at dates[1], one step after the first date.
x_dates = dates[1:n_months + 1]

fig, axs = plt.subplots(4, 1, figsize=(12, 9))
for idx, purpose in enumerate(['Hol', 'Vis', 'Bus', 'Oth']):
    ax = axs[idx]
    level = S_bottom[idx, 0]

    ax.plot(x_dates, Y_bottom[idx, :n_months], label='y')
    # Horizontal reference lines from the static columns 0, 2 and 3.
    ax.axhline(y=level, label='level', color='black')
    ax.axhline(y=level * S_bottom[idx, 2],
               label='level_[december]', color='green')
    ax.axhline(y=level * S_bottom[idx, 3],
               label='level_[january]', color='red')
    # Rows 0 and 1 of F_bottom, scaled to the matching reference lines.
    ax.plot(x_dates,
            F_bottom[idx, 0, :n_months] * level * S_bottom[idx, 2],
            label='dummy_[december]')
    ax.plot(x_dates,
            F_bottom[idx, 1, :n_months] * level * S_bottom[idx, 3],
            label='dummy_[january]')
    ax.set_ylabel(purpose)

plt.suptitle('Y/S/X/F_bottom alignment ', y=0.91)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()
#---------------- Y/X_bottom Seq2Seq structure ----------------#
# Series 0 over the steps [start, end).
idx, start, end = 0, 155, 203
print('\n')

plt.figure(figsize=(12, 1.8))
# The target is shifted up by 100 before plotting.
plt.plot(dates[start:end],
         Y_bottom[idx, start:end] + 100,
         label='y')
# Row 0 of X_bottom is drawn against dates shifted back by one step.
plt.plot(dates[start - 1:end - 1],
         X_bottom[idx, 0, start:end],
         label='y_[lag1]')
# Row 2 of F_bottom extends 12 steps past `end`.
plt.plot(dates[start:end + 12],
         F_bottom[idx, 2, start:end + 12],
         label='y_[lag12]')
# The mask is scaled by the last row of Y_bottom.
plt.plot(dates[start:end],
         sample_mask[idx, start:end] * Y_bottom[idx, -1],
         label='sample_mask')

plt.title('Y/X_bottom (Seq2Seq)')
plt.ylabel('Tourist Visits\n [In millions]')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.show()
plt.close()