Data Description: The actual concrete compressive strength (MPa) for a given mixture at a specific age (days) was determined in the laboratory. The data is in raw form (not scaled). It has 8 quantitative input variables, 1 quantitative output variable, and 1030 instances (observations).
Domain: Material manufacturing
Context: Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. These ingredients include cement, blast furnace slag, fly ash, water, superplasticizer, coarse aggregate, and fine aggregate.
Attribute Information:
- cement: measured in kg in a m3 mixture
- slag: measured in kg in a m3 mixture
- ash: measured in kg in a m3 mixture
- water: measured in kg in a m3 mixture
- superplastic: measured in kg in a m3 mixture
- coarseagg: measured in kg in a m3 mixture
- fineagg: measured in kg in a m3 mixture
- age: day (1~365)
- strength: concrete compressive strength measured in MPa (target)
Learning Outcomes
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Setting the current working directory
import os; os.chdir('drive/My Drive/Great Learning')
!pip install catboost
!pip install eli5
# Imports
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from scipy import stats; from scipy.stats import zscore, norm, randint
import matplotlib.style as style; style.use('fivethirtyeight')
from collections import OrderedDict
%matplotlib inline
# Checking Leverage and Influence Points
from statsmodels.graphics.regressionplots import *
import statsmodels.stats.stattools as stools
import statsmodels.formula.api as smf
import statsmodels.stats as stats
import scipy.stats as scipystats
import statsmodels.api as sm
# Checking multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
# Cluster analysis
from sklearn.cluster import KMeans
# Feature importance
import eli5
from eli5.sklearn import PermutationImportance
# Modelling
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, learning_curve
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.svm import SVR
import xgboost as xgb
# Metrics
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
# Hyperparameter tuning
from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials, space_eval
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
# Display settings
pd.options.display.max_rows = 400
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
random_state = 2019
np.random.seed(random_state)
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')
# Checking if GPU is found
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
# Reading the data as dataframe and print the first five rows
concrete = pd.read_csv('concrete (1).csv')
concrete.head()
Performing exploratory data analysis on the cement dataset. Below are some of the steps performed:
print('Several helper functions that I created to help with EDA and modelling'); print('--'*60)
# Customized describe function
def custom_describe(df):
    results = []
    for col in df.select_dtypes(include = ['float64', 'int64']).columns.tolist():
        stats = OrderedDict({'': col, 'Count': df[col].count(), 'Type': df[col].dtype, 'Mean': round(df[col].mean(), 2), 'StandardDeviation': round(df[col].std(), 2),
                             'Variance': round(df[col].var(), 2), 'Minimum': round(df[col].min(), 2), 'Q1': round(df[col].quantile(0.25), 2),
                             'Median': round(df[col].median(), 2), 'Q3': round(df[col].quantile(0.75), 2), 'Maximum': round(df[col].max(), 2),
                             'Range': round(df[col].max(), 2) - round(df[col].min(), 2), 'IQR': round(df[col].quantile(0.75), 2) - round(df[col].quantile(0.25), 2),
                             'Kurtosis': round(df[col].kurt(), 2), 'Skewness': round(df[col].skew(), 2), 'MeanAbsoluteDeviation': round(df[col].mad(), 2)})
        # Skewness comment: degree taken from the skewness value, direction from mean vs median
        if df[col].skew() < -1 or df[col].skew() > 1:
            ske = 'Highly Skewed (Right)' if df[col].median() < df[col].mean() else 'Highly Skewed (Left)'
        elif (-1 <= df[col].skew() <= -0.5) or (0.5 < df[col].skew() <= 1):
            ske = 'Moderately Skewed (Right)' if df[col].median() < df[col].mean() else 'Moderately Skewed (Left)'
        elif -0.5 < df[col].skew() <= 0.5:
            ske = 'Fairly Symmetrical (Right)' if df[col].median() < df[col].mean() else 'Fairly Symmetrical (Left)'
        else:
            ske = 'Error'
        stats['SkewnessComment'] = ske
        # Outlier comment based on the 1.5 * IQR rule
        upper_lim, lower_lim = stats['Q3'] + (1.5 * stats['IQR']), stats['Q1'] - (1.5 * stats['IQR'])
        out = 'HasOutliers' if len([x for x in df[col] if x < lower_lim or x > upper_lim]) > 1 else 'NoOutliers'
        stats['OutliersComment'] = out
        results.append(stats)
    statistics = pd.DataFrame(results).set_index('')
    return display(statistics)
# Function that helps with EDA plots: boxplot plus distributions with and without outliers
def odp_plots(df, col):
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (15, 7.2))
    # Boxplot to check outliers
    sns.boxplot(x = col, data = df, ax = ax1, orient = 'v', color = 'darkslategrey')
    # Distribution plot with outliers
    sns.distplot(df[col], ax = ax2, color = 'teal', fit = norm, rug = True).set_title(f'{col} with outliers')
    ax2.axvline(df[col].mean(), color = 'r', linestyle = '--', label = 'Mean', linewidth = 1.2)
    ax2.axvline(df[col].median(), color = 'g', linestyle = '--', label = 'Median', linewidth = 1.2)
    ax2.axvline(df[col].mode()[0], color = 'b', linestyle = '--', label = 'Mode', linewidth = 1.2); ax2.legend(loc = 'best')
    # Clipping values to the 1st and 99th percentiles, but in a new dataframe
    lowerbound, upperbound = np.percentile(df[col], [1, 99])
    y = pd.DataFrame(np.clip(df[col], lowerbound, upperbound))
    # Distribution plot without outliers
    sns.distplot(y[col], ax = ax3, color = 'tab:orange', fit = norm, rug = True).set_title(f'{col} without outliers')
    ax3.axvline(y[col].mean(), color = 'r', linestyle = '--', label = 'Mean', linewidth = 1.2)
    ax3.axvline(y[col].median(), color = 'g', linestyle = '--', label = 'Median', linewidth = 1.2)
    ax3.axvline(y[col].mode()[0], color = 'b', linestyle = '--', label = 'Mode', linewidth = 1.2); ax3.legend(loc = 'best')
    kwargs = {'fontsize': 14, 'color': 'black'}
    ax1.set_title(col + ' Boxplot Analysis', **kwargs)
    ax1.set_xlabel('Box', **kwargs)
    ax1.set_ylabel(col + ' Values', **kwargs)
    return plt.show()
# Correlation matrix for all variables
def correlation_matrix(df, threshold = 0.8):
    corr = df.corr()
    mask = np.zeros_like(corr, dtype = bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize = (15, 7.2))
    cmap = sns.diverging_palette(220, 10, as_cmap = True)
    sns.heatmap(corr, mask = mask, cmap = cmap, square = True, linewidths = .5, cbar_kws = {'shrink': .5})
    ax.set_title('Correlation Matrix of Data')
    # Filter for absolute correlation values greater than the threshold
    sort = corr.abs().unstack()
    sort = sort.sort_values(kind = 'quicksort', ascending = False)
    display(sort[(sort > threshold) & (sort < 1)])
# Outlier treatment: replace outliers with the median or the mean
def outliers(df, col, method = 'quantile', strategy = 'median'):
    if method == 'quantile':
        Q3, Q2, Q1 = df[col].quantile([0.75, 0.50, 0.25])
        IQR = Q3 - Q1
        upper_lim = Q3 + (1.5 * IQR)
        lower_lim = Q1 - (1.5 * IQR)
        print(f'Outliers for {col} are: {sorted([x for x in df[col] if x < lower_lim or x > upper_lim])}\n')
        if strategy == 'median':
            df.loc[(df[col] < lower_lim) | (df[col] > upper_lim), col] = Q2
        else:
            df.loc[(df[col] < lower_lim) | (df[col] > upper_lim), col] = df[col].mean()
    elif method == 'stddev':
        col_mean, col_std, Q2 = df[col].mean(), df[col].std(), df[col].median()
        cut_off = col_std * 3
        lower_lim, upper_lim = col_mean - cut_off, col_mean + cut_off
        print(f'Outliers for {col} are: {sorted([x for x in df[col] if x < lower_lim or x > upper_lim])}\n')
        if strategy == 'median':
            df.loc[(df[col] < lower_lim) | (df[col] > upper_lim), col] = Q2
        else:
            df.loc[(df[col] < lower_lim) | (df[col] > upper_lim), col] = col_mean
    else:
        print('Please pass a valid method (quantile or stddev) and strategy (median or mean)')
# KMeans Plots
def kmeans_plots(df, compcol):
    # Scatter the comparison column against every other predictor, coloured by the global KMeans `labels`
    columns = list(set(df.columns) - set([compcol]))
    f, ax = plt.subplots(4, 2, figsize = (15, 15))
    for i, col in enumerate(columns):
        axis = ax[i // 2][i % 2]
        axis.scatter(df[compcol], df[col], c = labels, s = 25, cmap = 'viridis')
        axis.set_xlabel(compcol); axis.set_ylabel(col)
# For rmse scoring
def rmse_score(y, y_pred):
    return np.sqrt(np.mean((y_pred - y)**2))
# Function to get top results from grid search and randomized search
def report(results):
    df = pd.concat([pd.DataFrame(results.cv_results_['params']),
                    pd.DataFrame(results.cv_results_['mean_test_score'], columns = ['r2'])], axis = 1)
    return df
# Get info of the dataframe columns
concrete.info()
concrete.isnull().sum()
All features are numerical. strength is the continuous target variable; age is a discrete feature, while the rest are continuous.
### Five-point summary of the numerical attributes
print('Five point summary of the dataframe'); print('--'*60)
custom_describe(concrete)
- cement: data ranges from 102 to 540, with the 25th and 75th percentiles spread between 192.38 and 350. The median (272.90) is less than the mean (281.17), so cement is moderately skewed to the right. The column has no outliers.
- slag: data ranges from 0 to 359.40, with the 25th and 75th percentiles spread between 0 and 142.95. The median (22) is less than the mean (73.90), so slag is moderately skewed to the right. The column has outliers.
- ash: data ranges from 0 to 200.10, with the 25th and 75th percentiles spread between 0 and 118.30. The median (0) is less than the mean (54.19), so ash is moderately skewed to the right. The column has no outliers.
- water: data ranges from 121.80 to 247, with the 25th and 75th percentiles spread between 164.90 and 192. The median (185) is greater than the mean (181.57), so water is slightly skewed to the left (fairly symmetrical). The column has outliers.
- superplastic: data ranges from 0 to 32.20, with the 25th and 75th percentiles spread between 0 and 10.20. The median (6.40) is greater than the mean (6.20), so superplastic is moderately skewed to the left. The column has outliers.
- coarseagg: data ranges from 801 to 1145, with the 25th and 75th percentiles spread between 932 and 1029.40. The median (968) is less than the mean (972.92), so coarseagg is slightly skewed to the right (fairly symmetrical). The column has no outliers.
- fineagg: data ranges from 594 to 992.60, with the 25th and 75th percentiles spread between 730.95 and 824. The median (779.5) is greater than the mean (773.58), so fineagg is slightly skewed to the left (fairly symmetrical). The column has outliers.
- age: data ranges from 1 to 365, with the 25th and 75th percentiles spread between 7 and 56. The median (28) is less than the mean (45.66), so age is highly skewed to the right. The column has outliers.
- strength: data ranges from 2.33 to 82.60, with the 25th and 75th percentiles spread between 23.71 and 46.14. The median (34.45) is less than the mean (35.82), so strength is slightly skewed to the right (fairly symmetrical). The column has no outliers.
# A quick check to find columns that contain outliers
print('A quick check to find columns that contain outliers, graphical'); print('--'*60)
fig = plt.figure(figsize = (15, 7.2))
ax = sns.boxplot(data = concrete.iloc[:, 0:-1], orient = 'h')
# Outlier, distribution for columns with outliers
print('Box plot, distribution of columns with and without outliers'); print('--'*60)
boxplotcolumns = list(concrete.columns)[:-1]
for cols in boxplotcolumns:
    Q3 = concrete[cols].quantile(0.75)
    Q1 = concrete[cols].quantile(0.25)
    IQR = Q3 - Q1
    print(f'{cols.capitalize()} column', '--'*40)
    print(f'Number of rows with outliers: {len(concrete.loc[(concrete[cols] < (Q1 - 1.5 * IQR)) | (concrete[cols] > (Q3 + 1.5 * IQR))])}')
    display(concrete.loc[(concrete[cols] < (Q1 - 1.5 * IQR)) | (concrete[cols] > (Q3 + 1.5 * IQR))].head())
    odp_plots(concrete, cols)
del cols, IQR, boxplotcolumns
# Replacing outliers with mean values in these columns
print('Replacing outliers with mean values using quantile method'); print('--'*60)
concrete_im = concrete.copy(deep = True)
outliers_cols = ['slag', 'water', 'superplastic', 'fineagg', 'age']
for col in outliers_cols:
    outliers(concrete_im, col, method = 'quantile', strategy = 'mean')
print('\nColumns for which outliers were replaced with the mean using the quantile method: \n', outliers_cols)
print('Summary stats before outlier removal for columns with outliers'); print('--'*60); display(concrete[outliers_cols].describe().T)
print('\nSummary stats after outlier removal for columns with outliers'); print('--'*60); display(concrete_im[outliers_cols].describe().T)
A quick observation after replacing the outliers: the medians remain unchanged, while the means change only slightly. The type of skewness also remains unchanged.
# A quick check to find columns that contain outliers
fig = plt.figure(figsize = (15, 7.2))
ax = sns.boxplot(data = concrete_im.iloc[:, 0:-1], orient = 'h')
print('The cement and strength columns have a linear relationship'); print('--'*60)
sns.pairplot(concrete_im, diag_kind = 'kde')
for col in list(concrete_im.columns)[:-2]:
    fig, ax1 = plt.subplots(figsize = (15, 7.2), ncols = 1, sharex = False)
    sns.regplot(x = concrete_im[col], y = concrete_im['strength'], ax = ax1).set_title(f'Understanding the relation between {col} and strength')
Reference for carrying out this analysis
Leverage: An observation with an extreme value on a predictor variable is called a point with high leverage. Leverage is a measure of how far an observation deviates from the mean of that variable. These leverage points can have an effect on the estimate of regression coefficients.
Influence: An observation is said to be influential if removing the observation substantially changes the estimate of coefficients. Influence can be thought of as the product of leverage and outlierness.
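As a rough illustration (not part of the original notebook), leverage is the diagonal of the hat matrix H = X(X'X)^-1 X'. The sketch below, on a hypothetical toy design matrix (X_demo and y_demo are invented names), checks that this matches the hat_matrix_diag that statsmodels reports and which is extracted just below.
# Illustrative sketch on a made-up toy design matrix: leverage h_i is the i-th diagonal
# element of the hat matrix H = X (X'X)^-1 X'.
rng = np.random.RandomState(0)
X_demo = sm.add_constant(rng.normal(size = (20, 3)))                        # 20 rows: intercept + 3 predictors
y_demo = X_demo @ np.array([1.0, 0.5, -0.3, 2.0]) + rng.normal(scale = 0.5, size = 20)
hat_diag = np.diag(X_demo @ np.linalg.inv(X_demo.T @ X_demo) @ X_demo.T)    # leverage computed directly
ols_fit = sm.OLS(y_demo, X_demo).fit()
print(np.allclose(hat_diag, ols_fit.get_influence().hat_matrix_diag))       # True: same quantity as statsmodels reports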
lm = smf.ols(formula = 'strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age', data = concrete_im).fit()
print(lm.summary())
influence = lm.get_influence()
resid_student = influence.resid_studentized_external
(cooks, p) = influence.cooks_distance
(dffits, p) = influence.dffits
leverage = influence.hat_matrix_diag
print('\n')
print('Leverage v.s. Studentized Residuals')
fig = plt.figure(figsize = (15, 7.2))
sns.regplot(x = leverage, y = lm.resid_pearson, fit_reg = False)
concrete_im_res = pd.concat([pd.Series(cooks, name = 'cooks'), pd.Series(dffits, name = 'dffits'), pd.Series(leverage, name = 'leverage'), pd.Series(resid_student, name = 'resid_student')], axis = 1)
concrete_im_res = pd.concat([concrete_im, concrete_im_res], axis = 1)
concrete_im_res.head()
# Studentized Residual
print('Studentized residuals as a first means for identifying outliers'); print('--'*60)
r = concrete_im_res.resid_student
print('-'*30 + ' studentized residual ' + '-'*30)
display(r.describe())
print('\n')
r_sort = concrete_im_res.sort_values(by = 'resid_student', ascending = True)
print('-'*30 + ' top 5 most negative residuals ' + '-'*30)
display(r_sort.head())
print('\n')
r_sort = concrete_im_res.sort_values(by = 'resid_student', ascending = False)
print('-'*30 + ' top 5 most positive residuals ' + '-'*30)
display(r_sort.head())
We should pay attention to studentized residuals that exceed +2 or -2, be more concerned about residuals that exceed +2.5 or -2.5, and even more concerned about residuals that exceed +3 or -3.
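As a small aside (not in the original analysis), these bands can be tallied directly from the resid_student column computed above:
# Count observations in each studentized-residual band mentioned above
for cutoff in [2, 2.5, 3]:
    n_flagged = (concrete_im_res['resid_student'].abs() > cutoff).sum()
    print(f'|studentized residual| > {cutoff}: {n_flagged} observations')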
print('Printing indexes where studentized residual exceeds +2 or -2'); print('--'*60)
res_index = concrete_im_res[abs(r) > 2].index
print(res_index)
print('Let\'s look at leverage points to identify observations that will have potential great influence on reg coefficient estimates.'); print('--'*60)
print('A point with leverage greater than (2k+2)/n should be carefully examined, where k is the number of predictors and n is the number of observations. In our example this works out to (2*8+2)/1030 = .017476')
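The 0.017476 cut-off can also be derived from the fitted model rather than hardcoded; a small sketch:
# Derive the leverage cut-off (2k + 2) / n from the fitted OLS model instead of hardcoding it
k = lm.df_model                      # number of predictors (8)
n = lm.nobs                          # number of observations (1030)
print(f'Leverage cut-off: {(2 * k + 2) / n:.6f}')   # ~0.017476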
leverage = concrete_im_res.leverage
print('-'*30 + ' Leverage ' + '-'*30)
display(leverage.describe())
print('\n')
leverage_sort = concrete_im_res.sort_values(by = 'leverage', ascending = False)
print('-'*30 + ' top 5 highest leverage data points ' + '-'*30)
display(leverage_sort.head())
print('Printing indexes where leverage exceeds +0.017476 or -0.017476'); print('--'*60)
lev_index = concrete_im_res[abs(leverage) > 0.017476].index
print(lev_index)
print('Let\'s take a look at DFITS. The conventional cut-off point for DFITS is 2*sqrt(k/n).')
print('DFITS can be either positive or negative, with numbers close to zero corresponding to the points with small or zero influence.'); print('--'*60)
import math
dffits_index = concrete_im_res[abs(concrete_im_res['dffits']) > 2 * math.sqrt(8 / 1030)].index
print(dffits_index)
set(res_index).intersection(lev_index).intersection(dffits_index)
print('Let\'s run the regression again without rows 452 and 469'); print('--'*60)
concrete_im.drop([452, 469], axis = 0, inplace = True)
print(concrete_im.shape)
lm1 = smf.ols(formula = 'strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age', data = concrete_im).fit()
print(lm1.summary())
# Correlation matrix
correlation_matrix(concrete_im, 0.8)
# Absolute correlation of independent variables with the target variable
absCorrwithDep = []
allVars = concrete_im.drop('strength', axis = 1).columns
for var in allVars:
    absCorrwithDep.append(abs(concrete_im['strength'].corr(concrete_im[var])))
display(pd.DataFrame([allVars, absCorrwithDep], index = ['Variable', 'Correlation']).T.\
sort_values('Correlation', ascending = False))
age, cement and superplastic are some of the columns that have a strong influence on the target variable. Next, feature engineering is performed on the cement dataset; the objective is to create new features, first from cluster analysis and then from feature interactions.
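Before fixing the number of clusters at two for the labels below, a quick elbow check of the KMeans inertia is a reasonable sanity test (an optional aside, not part of the original workflow):
# Optional aside: elbow plot of KMeans inertia over k = 1..8 on the predictors
X_elbow = concrete_im.drop('strength', axis = 1)
inertias = [KMeans(n_clusters = k, random_state = random_state).fit(X_elbow).inertia_ for k in range(1, 9)]
plt.figure(figsize = (15, 7.2))
plt.plot(range(1, 9), inertias, marker = 'o')
plt.xlabel('Number of clusters (k)'); plt.ylabel('Inertia'); plt.title('Elbow check for KMeans'); plt.show()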
concrete_im.reset_index(inplace = True, drop = True)
X = concrete_im.drop('strength', axis = 1)
y = concrete_im['strength']
labels = KMeans(2, random_state = random_state).fit_predict(X)
print('Cement vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'cement')
print('Slag vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'slag')
print('Ash vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'ash')
print('Water vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'water')
print('Superplastic vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'superplastic')
print('Coarseagg vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'coarseagg')
print('Fineagg vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'fineagg')
print('Age vs other columns clusters'); print('--'*60)
kmeans_plots(X, 'age')
Let's add features based on cluster analysis we found for cement and other columns.
# Adding features based on cement clusters
print('Let\'s add features based on cluster analysis we found for cement and other columns'); print('--'*60)
concrete_im = concrete_im.join(pd.DataFrame(labels, columns = ['labels']), how = 'left')
cement_features = concrete_im.groupby('labels', as_index = False)['cement'].agg(['mean', 'median'])
concrete_im = concrete_im.merge(cement_features, on = 'labels', how = 'left')
concrete_im.rename(columns = {'mean': 'cement_labels_mean', 'median': 'cement_labels_median'}, inplace = True)
concrete_im.drop('labels', axis = 1, inplace = True)
custom_describe(concrete_im)
Check whether there are any important feature interactions that we can use to create new features.
# Splitting the dataset into train and test set for checking feature interaction
print('Checking whether there are any important feature interactions that can be used to create new features')
print('Making use of CatBoostRegressor\'s feature interaction importance'); print('--'*60)
X = concrete_im.drop('strength', axis = 1)
y = concrete_im['strength']
features_list = list(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
# Initialize CatBoostRegressor
reg = CatBoostRegressor(iterations = None, eval_metric = 'RMSE', random_state = random_state, task_type = 'GPU', od_type = 'Iter', od_wait = 5)
reg.fit(X_train, y_train, early_stopping_rounds = 5, verbose = False, eval_set = [(X_test, y_test)], use_best_model = True)
# Get feature importance -- Type = Interaction
print('Feature Importance plot for catboostregressor using type = Interaction');
print('Interactions of cement with age and of water with age look useful for new features'); print('--'*60)
FI = reg.get_feature_importance(Pool(X_test, label = y_test), type = 'Interaction')
FI_new = []
for k, item in enumerate(FI):
    first = X_test.dtypes.index[FI[k][0]]
    second = X_test.dtypes.index[FI[k][1]]
    if first != second:
        FI_new.append([first + '_' + second, FI[k][2]])
feature_score = pd.DataFrame(FI_new, columns = ['FeaturePair', 'Score'])
feature_score = feature_score.sort_values(by = 'Score', ascending = True)
ax = feature_score.plot('FeaturePair', 'Score', kind = 'barh', figsize = (15, 10))
ax.set_title('Pairwise Feature Importance', fontsize = 14)
ax.set_xlabel('Score')
plt.show()
Let's add features based on the cement-age and water-age interactions.
# Adding features based on 'feature interaction' we got from above catboostregressor
print('Adding features based on feature interaction we got from catboostregressor\'s feature importance'); print('--'*60)
cement_age = concrete_im.groupby('age', as_index = False)['cement'].agg(['mean', 'median'])
concrete_im = concrete_im.merge(cement_age, on = 'age', how = 'left')
concrete_im.rename(columns = {'mean': 'cement_age_mean', 'median': 'cement_age_median'}, inplace = True)
water_age = concrete_im.groupby('age')['water'].agg(['mean', 'median']); concrete_im = concrete_im.merge(water_age, on = 'age', how = 'left')
concrete_im.rename(columns = {'mean': 'water_age_mean', 'median': 'water_age_median'}, inplace = True)
concrete_im.describe()
Let's use model based feature importance, eli5, correlation matrix and absolute correlation to understand important features.
X = concrete_im.drop('strength', axis = 1)
y = concrete_im['strength']
features_list = list(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
reg = CatBoostRegressor(iterations = None, eval_metric = 'RMSE', random_state = random_state, task_type = 'GPU', od_type = 'Iter', od_wait = 5)
reg.fit(X_train, y_train, early_stopping_rounds = 5, verbose = False, eval_set = [(X_test, y_test)], use_best_model = True)
# Get feature importance -- eli5
perm = PermutationImportance(reg, random_state = random_state).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
# Get feature importance -- model based
print('Feature Importance plot for catboostregressor using type = PredictionValuesChange');
print('Age, cement and water are top 3 importance features'); print('--'*60)
FI = reg.get_feature_importance(Pool(X_test, label = y_test), type = 'PredictionValuesChange')
feature_score = pd.DataFrame(list(zip(X_test.dtypes.index, FI)), columns = ['Feature', 'Score'])
feature_score = feature_score.sort_values(by = 'Score', ascending = True)
ax = feature_score.plot('Feature', 'Score', kind = 'barh', figsize = (15, 10))
ax.set_title('Feature Importance', fontsize = 14)
ax.set_xlabel('Score')
plt.show()
# Correlation matrix
correlation_matrix(concrete_im, 0.9)
# Absolute correlation of independent variables with the target variable
absCorrwithDep = []
allVars = concrete_im.drop('strength', axis = 1).columns
for var in allVars:
    absCorrwithDep.append(abs(concrete_im['strength'].corr(concrete_im[var])))
display(pd.DataFrame([allVars, absCorrwithDep], index = ['Variable', 'Correlation']).T.\
sort_values('Correlation', ascending = False))
print('Checking if multicollinearity exists')
print('A VIF between 5 and 10 indicates high correlation that may be problematic. And if the VIF goes above 10, you can assume that the regression coefficients are poorly estimated due to multicollinearity.')
print('--'*60)
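To make the VIF concrete: VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing predictor j on all the other predictors. The sketch below (an illustration, not part of the original notebook) verifies this for one column; 'cement' is just an arbitrary choice.
# Illustration of the VIF definition for a single, arbitrarily chosen predictor
demo_col = 'cement'
other_cols = [c for c in concrete_im.columns if c not in ('strength', demo_col)]
r2_j = LinearRegression().fit(concrete_im[other_cols], concrete_im[demo_col]).score(concrete_im[other_cols], concrete_im[demo_col])
print(f'VIF({demo_col}) = {1 / (1 - r2_j):.2f}')    # should be close to the statsmodels value computed below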
y, X = dmatrices('strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age + cement_labels_mean + cement_labels_median + cement_age_mean + cement_age_median + water_age_mean + water_age_median',
concrete_im, return_type = 'dataframe')
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['Features'] = X.columns
display(vif.round(1).sort_values(by = 'VIF Factor', ascending = False))
age, cement, water and slag are some of the most important features based on eli5 and model-based feature importance. Dropping all of the newly added features since they introduced multicollinearity:
concrete_im.drop(['water_age_mean', 'water_age_median', 'cement_age_mean', 'cement_age_median', 'cement_labels_mean', 'cement_labels_median'], axis = 1, inplace = True)
concrete_im.shape, concrete_im.columns
Decide on the complexity of the model: would a simple linear model (in terms of parameters) suffice, or would a quadratic or higher-degree model help?
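One quick, hedged way to probe this (an aside on the engineered dataframe, not the notebook's final approach) is to compare cross-validated R2 for a plain linear model against one with degree-2 polynomial features:
# Quick complexity check: degree-1 vs degree-2 features with a linear model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
X_cmp = concrete_im.drop('strength', axis = 1); y_cmp = concrete_im['strength']
for label, model in [('Linear (degree 1)', make_pipeline(StandardScaler(), LinearRegression())),
                     ('Quadratic (degree 2)', make_pipeline(PolynomialFeatures(degree = 2, include_bias = False), StandardScaler(), LinearRegression()))]:
    cv_r2 = cross_val_score(model, X_cmp, y_cmp, cv = 5, scoring = 'r2')
    print(f'{label}: mean R2 = {cv_r2.mean():.4f} (+/- {cv_r2.std():.4f})')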
print('Split into training (~70%), validation (~10%) and test (20%) sets, both with and without EDA & FE.')
print('--'*60)
# Training, validation and test sets with outliers
X = concrete.drop('strength', axis = 1); y = concrete['strength']; features_list = list(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.12, random_state = random_state)
print(f'Shape of train, valid and test datasets without EDA, FE: {(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)}')
print(f'Proportion in the splits for train, valid, test datasets without EDA, FE: {round(len(X_train)/len(X), 2), round(len(X_val)/len(X), 2), round(len(X_test)/len(X), 2)}')
# Training, validation and test sets without outliers
X = concrete_im.drop('strength', axis = 1); y = concrete_im['strength']; features_list = list(X.columns)
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X, y, test_size = 0.2, random_state = random_state)
X_train_fe, X_val_fe, y_train_fe, y_val_fe = train_test_split(X_train_fe, y_train_fe, test_size = 0.12, random_state = random_state)
print(f'\nShape of train, valid and test datasets with EDA, FE: {(X_train_fe.shape, y_train_fe.shape, X_val_fe.shape, y_val_fe.shape, X_test_fe.shape, y_test_fe.shape)}')
print(f'Proportion in the splits for train, valid, test datasets with EDA, FE: {round(len(X_train_fe)/len(X), 2), round(len(X_val_fe)/len(X), 2), round(len(X_test_fe)/len(X), 2)}')
training_test_sets = {'withoutedafe': (X_train, y_train, X_val, y_val), 'withedafe': (X_train_fe, y_train_fe, X_val_fe, y_val_fe)}
print('Let\'s check cross validated scores on linear models and tree-based models on training and validation sets with and without EDA & FE')
print('--'*60)
models = []
models.append(('Linear', LinearRegression()))
models.append(('Lasso', Lasso(random_state = random_state)))
models.append(('Ridge', Ridge(random_state = random_state)))
models.append(('SVR', SVR()))
models.append(('DecisionTree', DecisionTreeRegressor(random_state = random_state)))
models.append(('GradientBoost', GradientBoostingRegressor(random_state = random_state)))
models.append(('AdaBoost', AdaBoostRegressor(random_state = random_state)))
models.append(('ExtraTrees', ExtraTreesRegressor(random_state = random_state)))
models.append(('RandomForest', RandomForestRegressor(random_state = random_state)))
models.append(('Bagging', BaggingRegressor(DecisionTreeRegressor(random_state = random_state), random_state = random_state)))
models.append(('CatBoost', CatBoostRegressor(random_state = random_state, silent = True)))
scoring = 'r2'; results = {}; score = {}
for encoding_label, (_X_train, _y_train, _X_val, _y_val) in training_test_sets.items():
    scores = []; result_cv = []; names = []
    for name, model in models:
        kf = KFold(n_splits = 10, shuffle = True, random_state = random_state)
        cv_results = cross_val_score(model, _X_train, _y_train, cv = kf, scoring = scoring)
        result_cv.append(cv_results); names.append(name)
        scores.append([name, cv_results.mean().round(4), cv_results.std().round(4)])
    score[encoding_label] = scores
    results[encoding_label] = [names, result_cv]
print('Let\'s check the cv scores (r2) for sets without EDA and FE')
display(score['withoutedafe'])
print('\nLet\'s check the cv scores (r2) for sets with EDA and FE')
display(score['withedafe'])
pd.options.display.float_format = "{:.4f}".format
scores_df = pd.concat([pd.DataFrame(score['withoutedafe'], columns = ['Model', 'R2 (Mean) Without', 'R2 (Std) Without']).set_index('Model'),
pd.DataFrame(score['withedafe'], columns = ['Model', 'R2 (Mean) With', 'R2 (Std) With']).set_index('Model')], axis = 1)
scores_df['Improvement?'] = scores_df['R2 (Mean) With'] - scores_df['R2 (Mean) Without']
display(scores_df)
print('A significant improvement in r2 scores after EDA & FE for linear algorithms, whereas scores remain almost the same for tree-based algorithms.'); print('--'*60)
fig,(ax1, ax2) = plt.subplots(1, 2, figsize = (20, 7.2))
ax1.boxplot(results['withoutedafe'][1]); ax1.set_xticklabels(results['withoutedafe'][0], rotation = 90); ax1.set_title('CV Score - without EDA and FE')
ax2.boxplot(results['withedafe'][1]); ax2.set_xticklabels(results['withedafe'][0], rotation = 90); ax2.set_title('CV Score - with EDA and FE')
plt.show()
scalers = {'notscaled': None, 'standardscaling': StandardScaler(), 'robustscaling': RobustScaler()}
training_test_sets = {'validation_sets': (X_train_fe, y_train_fe, X_val_fe, y_val_fe),
'test_sets': (X_train_fe, y_train_fe, X_test_fe, y_test_fe)}
# initialize model
cat_reg = CatBoostRegressor(iterations = None, eval_metric = 'RMSE', random_state = random_state, od_type = 'Iter', od_wait = 5)
# iterate over all possible scaler / dataset combinations and get the errors
errors = {}
for encoding_label, (_X_train, _y_train, _X_val, _y_val) in training_test_sets.items():
    for scaler_label, scaler in scalers.items():
        trainingset = _X_train.copy()
        testset = _X_val.copy()
        if scaler is not None:
            trainingset = scaler.fit_transform(trainingset)
            testset = scaler.transform(testset)
        cat_reg.fit(trainingset, _y_train, early_stopping_rounds = 5, verbose = False, plot = False,
                    eval_set = [(testset, _y_val)], use_best_model = True)
        pred = cat_reg.predict(testset)
        rmse = rmse_score(_y_val, pred)
        r2 = r2_score(_y_val, pred)
        errors[encoding_label + ' - ' + scaler_label] = [rmse, r2]
print('RMSE is lowest when robust scaling is used, while R2 remains almost the same as with un-scaled data.');
print('Scaling would help us use the training and validation sets effectively across algorithms.'); print('--'*60)
display(errors)
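A minimal sketch of how to bake this in without leakage (an aside, not what the notebook does next): wrap the scaler and an estimator in a scikit-learn Pipeline so the scaler is fit on the training folds only.
# Minimal Pipeline sketch: scaling happens inside cross-validation, so validation folds never leak into the scaler
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', RobustScaler()), ('model', Ridge(random_state = random_state))])
print(f"Pipeline CV R2: {cross_val_score(pipe, X_train_fe, y_train_fe, cv = 5, scoring = 'r2').mean():.4f}")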
## Helper function to train, validate and predict
def train_val_predict(basemodel, train_X, train_y, test_X, test_y, name, model):
    folds = list(KFold(n_splits = 5, random_state = random_state, shuffle = True).split(train_X, train_y))
    r2_scores_train = []; r2_scores_val = []; r2_scores_test = []
    for j, (train_index, val_index) in enumerate(folds):
        X_train = train_X.iloc[train_index]
        y_train = train_y.iloc[train_index]
        X_val = train_X.iloc[val_index]
        y_val = train_y.iloc[val_index]
        if model == 'CatBoost':
            basemodel.fit(X_train, y_train, early_stopping_rounds = 5, verbose = 300, eval_set = [(X_val, y_val)], use_best_model = True)
        else:
            basemodel.fit(X_train, y_train)
        pred = basemodel.predict(X_train)
        r2 = r2_score(y_train, pred); r2_scores_train.append(r2)
        pred = basemodel.predict(X_val)
        r2 = r2_score(y_val, pred); r2_scores_val.append(r2)
        # Score on the held-out test set passed in as test_X / test_y
        pred = basemodel.predict(test_X)
        r2 = r2_score(test_y, pred); r2_scores_test.append(r2)
    df = pd.DataFrame([np.mean(r2_scores_train), np.mean(r2_scores_val), np.mean(r2_scores_test)],
                      index = ['r2 Scores Train', 'r2 Scores Val', 'r2 Scores Test'],
                      columns = [name]).T
    return df
print('Separating the dependents and independents + Scaling the data'); print('--'*60)
features_list = list(concrete_im.columns)
concrete_im = concrete_im.apply(zscore); concrete_im = pd.DataFrame(concrete_im , columns = features_list)
display(concrete_im.describe())
X = concrete_im.drop('strength', axis = 1); y = concrete_im['strength'];
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X, y, test_size = 0.2, random_state = random_state)
X_train_fe.shape, X_test_fe.shape, y_train_fe.shape, y_test_fe.shape
print('Using the 5-Fold Linear Regression to train, validate and predict'); print('--'*60)
lr_reg = LinearRegression()
df_lr = train_val_predict(lr_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold LinearRegression', model = 'LR')
%%time
print('Using the 5-Fold Lasso Regression to train, validate and predict'); print('--'*60)
lasso_reg = Lasso(alpha = 0.01)
df_lasso = train_val_predict(lasso_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold LassoRegression', model = 'Lasso')
df = df_lr.append(df_lasso)
%%time
print('Using the 5-Fold Ridge Regression to train, validate and predict'); print('--'*60)
ridge_reg = Ridge(alpha = 0.01)
df_ridge = train_val_predict(ridge_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold RidgeRegression', model = 'Ridge')
df = df.append(df_ridge)
display(df)
%%time
print('Finding out the hyperparameters for Decision Tree and Random Forest with GridSearchCV'); print('--'*60)
best_params_grid = {}
# Decision Tree and Random Forest Regressor Hyperparameters Grid
param_grid = {'DecisionTree': {'criterion': ['mse', 'mae'], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, None]},
'RandomForest': {'bootstrap': [True, False], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, None],
'max_features': ['auto', 'sqrt'], 'n_estimators': [200, 400, 600, 800]}}
# Decision Tree Regressor
dt_reg = DecisionTreeRegressor(random_state = random_state)
dt_reg_grid = GridSearchCV(dt_reg, param_grid['DecisionTree'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
dt_reg_grid.fit(X_train_fe, y_train_fe)
best_params_grid['DecisionTree'] = dt_reg_grid.best_params_
# Random Forest Regressor
rf_reg = RandomForestRegressor(random_state = random_state)
rf_reg_grid = GridSearchCV(rf_reg, param_grid['RandomForest'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
rf_reg_grid.fit(X_train_fe, y_train_fe)
best_params_grid['RandomForest'] = rf_reg_grid.best_params_
print(f'Best parameters for Decision Tree and Random Forest using GridSearchCV: {best_params_grid}')
%%time
print('Finding out the hyperparameters for Decision Tree and Random Forest with RandomizedSearchCV'); print('--'*60)
best_params_random = {}
# Decision Tree and Random Forest Regressor Hyperparameters Grid
param_grid = {'DecisionTree': {'criterion': ['mse', 'mae'], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, None]},
'RandomForest': {'bootstrap': [True, False], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, None],
'max_features': ['auto', 'sqrt'], 'n_estimators': [200, 400, 600, 800]}}
# Decision Tree Regressor
dt_reg = DecisionTreeRegressor(random_state = random_state)
dt_reg_grid = RandomizedSearchCV(dt_reg, param_grid['DecisionTree'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
dt_reg_grid.fit(X_train_fe, y_train_fe)
best_params_random['DecisionTree'] = dt_reg_grid.best_params_
# Random Forest Regressor
rf_reg = RandomForestRegressor(random_state = random_state)
rf_reg_grid = RandomizedSearchCV(rf_reg, param_grid['RandomForest'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
rf_reg_grid.fit(X_train_fe, y_train_fe)
best_params_random['RandomForest'] = rf_reg_grid.best_params_
print(f'Best parameters for Decision Tree and Random Forest using RandomizedSearchCV: {best_params_random}')
%%time
print('Using the 5-Fold Decision Tree Regressor to train, validate and predict'); print('--'*60)
dt_reg = DecisionTreeRegressor(random_state = random_state)
df_reg = train_val_predict(dt_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold DecisionTree', model = 'DT')
df = df.append(df_reg)
%%time
print('Using the 5-Fold Decision Tree Regressor to train, validate and predict using GridSearchCV'); print('--'*60)
dt_reg_grid = DecisionTreeRegressor(random_state = random_state, **best_params_grid['DecisionTree'])
df_reg_grid = train_val_predict(dt_reg_grid, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold DecisionTree GridSearchCV', model = 'DT')
df = df.append(df_reg_grid)
%%time
print('Using the 5-Fold Decision Tree Regressor to train, validate and predict using RandomizedSearchCV'); print('--'*60)
dt_reg_rand = DecisionTreeRegressor(random_state = random_state, **best_params_random['DecisionTree'])
df_reg_rand = train_val_predict(dt_reg_rand, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold DecisionTree RandomizedSearchCV', model = 'DT')
df = df.append(df_reg_rand)
display(df)
%%time
print('Using the 5-Fold Random Forest Regressor to train, validate and predict'); print('--'*60)
rf_reg = RandomForestRegressor(random_state = random_state)
df_reg = train_val_predict(rf_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold RandomForest', model = 'RF')
df = df.append(df_reg)
%%time
print('Using the 5-Fold Random Forest Regressor to train, validate and predict using GridSearchCV'); print('--'*60)
rf_reg_grid = RandomForestRegressor(random_state = random_state, **best_params_grid['RandomForest'])
df_reg_grid = train_val_predict(rf_reg_grid, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold RandomForest GridSearchCV', model = 'RF')
df = df.append(df_reg_grid)
%%time
print('Using the 5-Fold Random Forest Regressor to train, validate and predict using RandomizedSearchCV'); print('--'*60)
rf_reg_rand = RandomForestRegressor(random_state = random_state, **best_params_random['RandomForest'])
df_reg_rand = train_val_predict(rf_reg_rand, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold RandomForest RandomizedSearchCV', model = 'RF')
df = df.append(df_reg_rand)
display(df)
%%time
print('Using the 5-Fold Ada Boost Regressor to train, validate and predict'); print('--'*60)
ada_reg = AdaBoostRegressor(random_state = random_state)
df_reg = train_val_predict(ada_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold AdaBoost', model = 'Ada')
df = df.append(df_reg)
%%time
# AdaBoost Regressor Hyperparameters Grid
print('Finding out the hyperparameters for AdaBoostRegressor with GridSearchCV'); print('--'*60)
param_grid = {'AdaBoost': {'base_estimator': [DecisionTreeRegressor(random_state = random_state, **best_params_grid['DecisionTree']), None],
'n_estimators': [100, 150, 200], 'learning_rate': [0.01, 0.1, 1.0]}}
# AdaBoost Regressor
ada_reg = AdaBoostRegressor(random_state = random_state)
ada_reg_grid = GridSearchCV(ada_reg, param_grid['AdaBoost'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
ada_reg_grid.fit(X_train_fe, y_train_fe)
best_params_grid['AdaBoost'] = ada_reg_grid.best_params_
print('Best parameters for AdaBoost Regressor using GridSearchCV: {}'.format(best_params_grid['AdaBoost']))
%%time
# AdaBoost Regressor Hyperparameters Grid
print('Finding out the hyperparameters for AdaBoostRegressor with RandomizedSearchCV'); print('--'*60)
param_grid = {'AdaBoost': {'base_estimator': [DecisionTreeRegressor(random_state = random_state, **best_params_grid['DecisionTree']), None],
'n_estimators': [100, 150, 200], 'learning_rate': [0.01, 0.1, 1.0]}}
# AdaBoost Regressor
ada_reg = AdaBoostRegressor(random_state = random_state)
ada_reg_grid = RandomizedSearchCV(ada_reg, param_grid['AdaBoost'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
ada_reg_grid.fit(X_train_fe, y_train_fe)
best_params_random['AdaBoost'] = ada_reg_grid.best_params_
print('Best parameters for AdaBoost Regressor using RandomizedSearchCV: {}'.format(best_params_random['AdaBoost']))
%%time
print('Using the 5-Fold Ada Boost Regressor to train, validate and predict using GridSearchCV'); print('--'*60)
ada_reg_grid = AdaBoostRegressor(random_state = random_state, **best_params_grid['AdaBoost'])
df_reg_grid = train_val_predict(ada_reg_grid, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold AdaBoost using GridSearchCV', model = 'Ada')
df = df.append(df_reg_grid)
%%time
print('Using the 5-Fold Ada Boost Regressor to train, validate and predict using RandomizedSearchCV'); print('--'*60)
ada_reg_rand = AdaBoostRegressor(random_state = random_state, **best_params_random['AdaBoost'])
df_reg_rand = train_val_predict(ada_reg_rand, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold AdaBoost using RandomizedSearchCV', model = 'Ada')
df = df.append(df_reg_rand)
display(df)
%%time
# GradientBoostRegressor Hyperparameters Grid
print('Finding out the hyperparameters for GradientBoostRegressor with GridSearchCV'); print('--'*60)
param_grid = {'GradientBoost': {'max_depth': [5, 6, 7, 8, 9, 10, None], 'max_features': ['auto', 'sqrt'],
'n_estimators': [600, 800, 1000]}}
# GradientBoostRegressor
gb_reg = GradientBoostingRegressor(random_state = random_state)
gb_reg_grid = GridSearchCV(gb_reg, param_grid['GradientBoost'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
gb_reg_grid.fit(X_train_fe, y_train_fe)
best_params_grid['GradientBoost'] = gb_reg_grid.best_params_
print('Best parameters for Gradient Boost Regressor using GridSearchCV: {}'.format(best_params_grid['GradientBoost']))
%%time
# GradientBoostRegressor Hyperparameters Grid
print('Finding out the hyperparameters for GradientBoostRegressor with RandomizedSearchCV'); print('--'*60)
param_grid = {'GradientBoost': {'max_depth': [5, 6, 7, 8, 9, 10, None], 'max_features': ['auto', 'sqrt'],
'n_estimators': [600, 800, 1000]}}
# GradientBoostRegressor
gb_reg = GradientBoostingRegressor(random_state = random_state)
gb_reg_rand = RandomizedSearchCV(gb_reg, param_grid['GradientBoost'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
gb_reg_rand.fit(X_train_fe, y_train_fe)
best_params_random['GradientBoost'] = gb_reg_rand.best_params_
print('Best parameters for Gradient Boost Regressor using RandomizedSearchCV: {}'.format(best_params_random['GradientBoost']))
%%time
print('Using the 5-Fold Gradient Boost Regressor to train, validate and predict'); print('--'*60)
gb_reg = GradientBoostingRegressor(random_state = random_state)
df_reg = train_val_predict(gb_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold GradientBoost', model = 'GB')
df = df.append(df_reg)
%%time
print('Using the 5-Fold Gradient Boost Regressor to train, validate and predict using GridSearchCV'); print('--'*60)
gb_reg_grid = GradientBoostingRegressor(random_state = random_state, **best_params_grid['GradientBoost'])
df_reg_grid = train_val_predict(gb_reg_grid, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold GradientBoost using GridSearchCV', model = 'GB')
df = df.append(df_reg_grid)
%%time
print('Using the 5-Fold Gradient Boost Regressor to train, validate and predict using RandomizedSearchCV'); print('--'*60)
gb_reg_rand = GradientBoostingRegressor(random_state = random_state, **best_params_random['GradientBoost'])
df_reg_rand = train_val_predict(gb_reg_rand, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold GradientBoost using RandomizedSearchCV', model = 'GB')
df = df.append(df_reg_rand)
display(df)
%%time
# ExtraTreesRegressor Hyperparameters Grid
print('Finding out the hyperparameters for ExtraTreesRegressor with GridSearchCV'); print('--'*60)
param_grid = {'ExtraTrees': {'max_depth': [5, 6, 7, 8, 9, 10, None], 'max_features': ['auto', 'sqrt'],
'n_estimators': [100, 600, 800, 1000]}}
# ExtraTreesRegressor
et_reg = ExtraTreesRegressor(random_state = random_state)
et_reg_grid = GridSearchCV(et_reg, param_grid['ExtraTrees'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
et_reg_grid.fit(X_train_fe, y_train_fe)
best_params_grid['ExtraTrees'] = et_reg_grid.best_params_
print('Best parameters for Extra Trees Regressor using GridSearchCV: {}'.format(best_params_grid['ExtraTrees']))
%%time
# ExtraTreesRegressor Hyperparameters Grid
print('Finding out the hyperparameters for ExtraTreesRegressor with RandomizedSearchCV'); print('--'*60)
param_grid = {'ExtraTrees': {'max_depth': [5, 6, 7, 8, 9, 10, None], 'max_features': ['auto', 'sqrt'],
'n_estimators': [100, 600, 800, 1000]}}
# ExtraTreesRegressor
et_reg = ExtraTreesRegressor(random_state = random_state)
et_reg_rand = RandomizedSearchCV(et_reg, param_grid['ExtraTrees'], cv = 5, n_jobs = -1, verbose = False, scoring = 'r2')
et_reg_rand.fit(X_train_fe, y_train_fe)
best_params_random['ExtraTrees'] = et_reg_rand.best_params_
print('Best parameters for Extra Trees Regressor using RandomizedSearchCV: {}'.format(best_params_random['ExtraTrees']))
%%time
print('Using the 5-Fold Extra Trees Regressor to train, validate and predict'); print('--'*60)
et_reg = ExtraTreesRegressor(random_state = random_state)
df_reg = train_val_predict(et_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold ExtraTrees', model = 'ET')
df = df.append(df_reg)
%%time
print('Using the 5-Fold Extra Trees Regressor to train, validate and predict using GridSearchCV'); print('--'*60)
et_reg_grid = ExtraTreesRegressor(random_state = random_state, **best_params_grid['ExtraTrees'])
df_reg_grid = train_val_predict(et_reg_grid, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold ExtraTrees using GridSearchCV', model = 'ET')
df = df.append(df_reg_grid)
%%time
print('Using the 5-Fold Extra Trees Regressor to train, validate and predict using RandomizedSearchCV'); print('--'*60)
et_reg_rand = ExtraTreesRegressor(random_state = random_state, **best_params_random['ExtraTrees'])
df_reg_rand = train_val_predict(et_reg_rand, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold ExtraTrees using RandomizedSearchCV', model = 'ET')
df = df.append(df_reg_rand)
display(df)
%%time
print('Finding out the hyperparameters for CatBoost with GridSearch'); print('--'*60)
param_grid = {'CatBoost': {'learning_rate': np.arange(0.01, 0.31, 0.05), 'depth': [3, 4, 5, 6, 7, 8, 9, 10], 'l2_leaf_reg': np.arange(2, 10, 1)}}
# Cat Boost Regressor
cat_reg = CatBoostRegressor(iterations = None, random_state = random_state, od_type = 'Iter', od_wait = 5)
best_params = cat_reg.grid_search(param_grid['CatBoost'], X = X_train_fe, y = y_train_fe, cv = 3, verbose = 150)
best_params_grid['CatBoostGridSearch'] = best_params['params']
%%time
print('Finding out the hyperparameters for CatBoost with RandomSearch'); print('--'*60)
param_grid = {'CatBoost': {'learning_rate': np.arange(0.01, 0.31, 0.05), 'depth': [3, 4, 5, 6, 7, 8, 9, 10], 'l2_leaf_reg': np.arange(2, 10, 1)}}
# Cat Boost Regressor
cat_reg = CatBoostRegressor(iterations = None, random_state = random_state, od_type = 'Iter', od_wait = 5)
best_params = cat_reg.randomized_search(param_grid['CatBoost'], X = X_train_fe, y = y_train_fe, cv = 3, verbose = 150)
best_params_grid['CatBoostRandomSearch'] = best_params['params']
%%time
print('Using the 5-Fold CatBoost Regressor to train, validate and predict'); print('--'*60)
cb_reg = CatBoostRegressor(iterations = None, random_state = random_state, od_type = 'Iter', od_wait = 5)
df_reg = train_val_predict(cb_reg, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold CatBoost', model = 'CatBoost')
df = df.append(df_reg)
%%time
print('Using the 5-Fold CatBoost Regressor to train, validate and predict using GridSearch'); print('--'*60)
cb_reg_grid = CatBoostRegressor(iterations = None, random_state = random_state, od_type = 'Iter', od_wait = 5, **best_params_grid['CatBoostGridSearch'])
df_reg_grid = train_val_predict(cb_reg_grid, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold CatBoost GridSearch', model = 'CatBoost')
df = df.append(df_reg_grid)
%%time
print('Using the 5-Fold CatBoost Regressor to train, validate and predict using RandomSearch'); print('--'*60)
cb_reg_rand = CatBoostRegressor(iterations = None, random_state = random_state, od_type = 'Iter', od_wait = 5, **best_params_grid['CatBoostRandomSearch'], verbose = False)
df_reg_rand = train_val_predict(cb_reg_rand, X_train_fe, y_train_fe, X_test_fe, y_test_fe, '5-Fold CatBoost RandomSearch', model = 'CatBoost')
df = df.append(df_reg_rand)
display(df)
%%time
values = concrete_im.values
n_iterations = 500                      # number of bootstrap samples to create
n_size = int(len(concrete_im) * 1)      # size of each bootstrap sample (same as the dataset)
# run bootstrap
boot_scores = list()                    # holds the r2 score for each bootstrap iteration
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples = n_size)                                # sampling with replacement
    test = np.array([x for x in values if x.tolist() not in train.tolist()])    # rows not picked in the sample (out-of-bag)
    # fit model
    gb_reg_grid = GradientBoostingRegressor(random_state = random_state, **best_params_grid['GradientBoost'])
    gb_reg_grid.fit(train[:, :-1], train[:, -1])            # fit on the independent variables and the corresponding target values
    # evaluate model
    predictions = gb_reg_grid.predict(test[:, :-1])         # predict from the independent variables in the out-of-bag data
    boot_scores.append(r2_score(test[:, -1], predictions))
# plot scores
plt.figure(figsize = (15, 7.2))
plt.hist(boot_scores); plt.show()
# confidence intervals
alpha = 0.95                                                # for a 95% confidence interval
p = ((1.0 - alpha) / 2.0) * 100                             # 2.5% in the lower tail
lower = max(0.0, np.percentile(boot_scores, p))
p = (alpha + ((1.0 - alpha) / 2.0)) * 100                   # 97.5%, leaving 2.5% in the upper tail
upper = min(1.0, np.percentile(boot_scores, p))
print('%.1f%% confidence interval: %.1f%% to %.1f%%' % (alpha*100, lower*100, upper*100))
display(df)