import easeml as es ## import just one package
### fallback imports, in case plots do not render in your notebook
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import cufflinks
cufflinks.go_offline(connected=True)
from plotly.offline import iplot, init_notebook_mode
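If charts still fail to render inline, initialising plotly's offline notebook mode (already imported above) usually helps; this is an optional setup call, not something easeml requires:

init_notebook_mode(connected=True) ## set up inline plotly rendering; use connected=False to embed plotly.js and work fully offline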
# es.help() prints all available functions and their usage
df = es.importdata('adv-housing.csv')
es.quick_ml(df,'SalePrice','r')
Dataframe Imported Successfully

*** Dataset Information ***
##################################################
No of Rows    : 1460
No of columns : 81
No of Numerical columns  : 38
No of Categorical columns: 43
##################################################
Total Missing values : 6965
##################################################
Summary of DataFrame:
      Column Name  Nulls/NaN  outof  Unique Type of Columns
0              Id          0   1460    1460       Numerical
1      MSSubClass          0   1460      15       Numerical
2        MSZoning          0   1460       5     Categorical
3     LotFrontage        259   1460     110       Numerical
4         LotArea          0   1460    1073       Numerical
..            ...        ...    ...     ...             ...
76         MoSold          0   1460      12       Numerical
77         YrSold          0   1460       5       Numerical
78       SaleType          0   1460       9     Categorical
79  SaleCondition          0   1460       6     Categorical
80      SalePrice          0   1460     663       Numerical

[81 rows x 5 columns]

Null value summary:
              Total    Percent
PoolQC         1453  99.520548
MiscFeature    1406  96.301370
Alley          1369  93.767123
Fence          1179  80.753425
FireplaceQu     690  47.260274
LotFrontage     259  17.739726
GarageCond       81   5.547945
GarageType       81   5.547945
GarageYrBlt      81   5.547945
GarageFinish     81   5.547945
GarageQual       81   5.547945
BsmtExposure     38   2.602740
BsmtFinType2     38   2.602740
BsmtFinType1     37   2.534247
BsmtCond         37   2.534247
BsmtQual         37   2.534247
MasVnrArea        8   0.547945
MasVnrType        8   0.547945
Electrical        1   0.068493

columns dropped Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage'], dtype='object')
columns dropped ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage'] []

Dropped null values from columns: ['MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']

columns label encoded Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'], dtype='object')
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
R2 score   : 0.6273971463490331
MSE score  : 1607856695.0580575
RMSE score : 40098.088421495326
-----------------------------------------
Executing RFE Please wait.........
### RFE selected columns:
Index(['LotArea', 'OverallQual', 'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageArea'],
      dtype='object')

RFE Selected Features
Please wait Training-Testing with all models..
Done with LinearRegression
Done with RandomForestRegression
[15:36:16] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Done with XGBoostRegressor
Done with LGBoostRegressor
Done with AdaBoostRegressor
Done with SupportVectorMachine
Done with GradientBoostingRegression
|   | Model | R^2 score | Root Mean Squared Error | Root Mean Squared Log Error | Mean Squared Error |
|---|---|---|---|---|---|
| 1 | RandomForestRegression | 0.88781 | 16097.1 | 0.106443 | 2.59116e+08 |
| 2 | XGBoostRegressor | 0.886151 | 16215.7 | 0.0977401 | 2.62949e+08 |
| 6 | GradientBoostingRegression | 0.883387 | 16411.4 | 0.0990406 | 2.69333e+08 |
| 4 | AdaBoostRegressor | 0.837192 | 19391.4 | 0.108653 | 3.76025e+08 |
| 0 | LinearRegression | 0.759426 | 23571.9 | 0.191409 | 5.55637e+08 |
| 3 | LGBoostRegressor | 0.244415 | 41774.7 | 0.24101 | 1.74512e+09 |
| 5 | SupportVectorMachine | -0.0113428 | 48330.4 | 0.263462 | 2.33583e+09 |
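For reference, here is a rough sklearn sketch of the kind of train/score loop behind the table above. It is an assumption about what quick_ml does internally, not easeml's actual code, and it skips the cleaning, encoding and RFE steps shown in the log.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

df_num = df.select_dtypes('number').dropna()   ## crude stand-in for easeml's cleaning and encoding
X_all = df_num.drop('SalePrice', axis=1)
y_all = df_num['SalePrice']
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, train_size=0.7, random_state=101)

scores = []
for name, model in [('LinearRegression', LinearRegression()),
                    ('RandomForestRegression', RandomForestRegressor(n_estimators=100, random_state=101)),
                    ('GradientBoostingRegression', GradientBoostingRegressor(random_state=101))]:
    model.fit(X_tr, y_tr)                      ## train each candidate on the same split
    pred = model.predict(X_te)
    scores.append((name, r2_score(y_te, pred), mean_squared_error(y_te, pred) ** 0.5))
print(pd.DataFrame(scores, columns=['Model', 'R^2 score', 'RMSE']).sort_values('R^2 score', ascending=False))

quick_ml additionally drops high-null columns, label-encodes the categoricals and runs RFE before the comparison, as its log shows.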
df1 = es.importdata('train_split.csv')
df1 = es.dropcolumns(df1,'batch_enrolled','member_id')
## no hassle cleaning the data by hand: just pass everything to easeml's quick_ml function
## arguments: the dataframe, the target column and a flag ('r' = regression, 'c' = classification)
es.quick_ml(df1,'loan_status','c')
Dataframe Imported Successfully

*** Dataset Information ***
##################################################
No of Rows    : 63999
No of columns : 43
No of Numerical columns  : 26
No of Categorical columns: 17
##################################################
Total Missing values : 276727
##################################################
Summary of DataFrame:
                    Column Name  Nulls/NaN  outof  Unique Type of Columns
0                     loan_amnt          0  63999    1273       Numerical
1                   funded_amnt          0  63999    1275       Numerical
2               funded_amnt_inv          0  63999    1965       Numerical
3                          term          0  63999       2     Categorical
4                      int_rate          0  63999     459       Numerical
5                         grade          0  63999       7     Categorical
6                     sub_grade          0  63999      35     Categorical
7                     emp_title       3826  63999   31609     Categorical
8                    emp_length       3324  63999      11     Categorical
9                home_ownership          0  63999       6     Categorical
10                   annual_inc          0  63999    6368       Numerical
11          verification_status          0  63999       3     Categorical
12                   pymnt_plan          0  63999       1     Categorical
13                         desc      54849  63999    8757     Categorical
14                      purpose          0  63999      14     Categorical
15                        title         13  63999    6409     Categorical
16                     zip_code          0  63999     866     Categorical
17                   addr_state          0  63999      51     Categorical
18                          dti          0  63999    3943       Numerical
19                  delinq_2yrs          0  63999      19       Numerical
20               inq_last_6mths          0  63999      15       Numerical
21       mths_since_last_delinq      32831  63999     103       Numerical
22       mths_since_last_record      54349  63999     121       Numerical
23                     open_acc          0  63999      54       Numerical
24                      pub_rec          0  63999      14       Numerical
25                    revol_bal          0  63999   30608       Numerical
26                   revol_util         29  63999    1090       Numerical
27                    total_acc          0  63999     101       Numerical
28          initial_list_status          0  63999       2     Categorical
29                total_rec_int          0  63999   50744       Numerical
30           total_rec_late_fee          0  63999     600       Numerical
31                   recoveries          0  63999    1754       Numerical
32      collection_recovery_fee          0  63999    1653       Numerical
33   collections_12_mths_ex_med          8  63999       5       Numerical
34  mths_since_last_major_derog      48155  63999     148       Numerical
35             application_type          0  63999       2     Categorical
36    verification_status_joint      63971  63999       3     Categorical
37                last_week_pay          0  63999      87     Categorical
38               acc_now_delinq          0  63999       5       Numerical
39                 tot_coll_amt       5124  63999    2579       Numerical
40                  tot_cur_bal       5124  63999   52207       Numerical
41             total_rev_hi_lim       5124  63999    3617       Numerical
42                  loan_status          0  63999       2       Numerical

Null value summary:
                             Total    Percent
verification_status_joint    63971  99.956249
desc                         54849  85.702902
mths_since_last_record       54349  84.921639
mths_since_last_major_derog  48155  75.243363
mths_since_last_delinq       32831  51.299239
tot_cur_bal                   5124   8.006375
tot_coll_amt                  5124   8.006375
total_rev_hi_lim              5124   8.006375
emp_title                     3826   5.978218
emp_length                    3324   5.193831
revol_util                      29   0.045313
title                           13   0.020313
collections_12_mths_ex_med       8   0.012500

columns dropped Index(['verification_status_joint', 'desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq'], dtype='object')
columns dropped ['verification_status_joint', 'desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq'] []

Dropped null values from columns: ['emp_title', 'emp_length', 'title', 'revol_util', 'collections_12_mths_ex_med', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']

columns label encoded Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'initial_list_status', 'application_type', 'last_week_pay'], dtype='object')
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.7112312634261447
Report:
              precision    recall  f1-score   support

           0       0.89      0.68      0.77     13359
           1       0.34      0.67      0.45      3238

    accuracy                           0.68     16597
   macro avg       0.62      0.68      0.61     16597
weighted avg       0.79      0.68      0.71     16597

-----------------------------------------
Executing RFE Please wait.........
### RFE selected columns:
Index(['funded_amnt_inv', 'term', 'int_rate', 'sub_grade', 'title', 'dti',
       'initial_list_status', 'total_rec_int', 'recoveries', 'last_week_pay'],
      dtype='object')

RFE Selected Features
Please wait Training-Testing with all models..
Done with LogisticRegression
Done with RandomForestClassifier
Done with XGBoost Classifier
Done with LGBoost Classifier
Done with AdaBoost Classifier
Done with GradientBoostingClassifier
|   | Model | F1 score | AUC-ROC score | Accuracy | Confusion Matrix |
|---|---|---|---|---|---|
| 3 | LGBoost Classifier | 0.923457 | 0.75 | 0.933333 | [[26 0] [ 2 2]] |
| 0 | LogisticRegression | 0.872727 | 0.625 | 0.9 | [[26 0] [ 3 1]] |
| 1 | RandomForestClassifier | 0.872727 | 0.625 | 0.9 | [[26 0] [ 3 1]] |
| 2 | XGBoost Classifier | 0.804762 | 0.5 | 0.866667 | [[26 0] [ 4 0]] |
| 4 | AdaBoost Classifier | 0.804762 | 0.5 | 0.866667 | [[26 0] [ 4 0]] |
| 5 | GradientBoostingClassifier | 0.804762 | 0.5 | 0.866667 | [[26 0] [ 4 0]] |
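The "Executing RFE" step in the log maps onto scikit-learn's recursive feature elimination. A minimal sketch under the assumption that df1 has already been cleaned and encoded the way quick_ml does it; the estimator and settings below are illustrative, not easeml's exact choices.

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

num = df1.select_dtypes('number').dropna()     ## crude stand-in for the cleaned, encoded frame
X_rfe = num.drop('loan_status', axis=1)
y_rfe = num['loan_status']

selector = RFE(RandomForestClassifier(n_estimators=50, random_state=101), n_features_to_select=10)
selector.fit(X_rfe, y_rfe)                     ## repeatedly drops the least important feature
print(X_rfe.columns[selector.support_])        ## the 10 retained feature names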
df = es.importdata('Housing.csv')
es.quick_ml(df,'price','r')
Dataframe Imported Successfully

*** Dataset Information ***
##################################################
No of Rows    : 545
No of columns : 13
No of Numerical columns  : 6
No of Categorical columns: 7
##################################################
Total Missing values : 0
##################################################
Summary of DataFrame:
         Column Name  Nulls/NaN  outof  Unique Type of Columns
0              price          0    545     219       Numerical
1               area          0    545     284       Numerical
2           bedrooms          0    545       6       Numerical
3          bathrooms          0    545       4       Numerical
4            stories          0    545       4       Numerical
5           mainroad          0    545       2     Categorical
6          guestroom          0    545       2     Categorical
7           basement          0    545       2     Categorical
8    hotwaterheating          0    545       2     Categorical
9    airconditioning          0    545       2     Categorical
10           parking          0    545       4       Numerical
11          prefarea          0    545       2     Categorical
12  furnishingstatus          0    545       3     Categorical

Null value summary:
Empty DataFrame
Columns: [Total, Percent]
Index: []

columns dropped Index([], dtype='object')
columns dropped [] []

columns label encoded Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus'], dtype='object')
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
R2 score   : 0.5499416877756778
MSE score  : 1938127160645.6133
RMSE score : 1392166.3552340337
-----------------------------------------
Executing RFE Please wait.........
### RFE selected columns:
Index(['area', 'bedrooms', 'bathrooms', 'stories', 'basement',
       'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
       'furnishingstatus'],
      dtype='object')

RFE Selected Features
Please wait Training-Testing with all models..
Done with LinearRegression
Done with RandomForestRegression
[15:48:34] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Done with XGBoostRegressor
Done with LGBoostRegressor
Done with AdaBoostRegressor
Done with SupportVectorMachine
Done with GradientBoostingRegression
|   | Model | R^2 score | Root Mean Squared Error | Root Mean Squared Log Error | Mean Squared Error |
|---|---|---|---|---|---|
| 0 | LinearRegression | 0.536614 | 1.19452e+06 | 0.223221 | 1.42688e+12 |
| 2 | XGBoostRegressor | 0.498153 | 1.24311e+06 | 0.224521 | 1.54531e+12 |
| 6 | GradientBoostingRegression | 0.485068 | 1.25921e+06 | 0.229515 | 1.5856e+12 |
| 4 | AdaBoostRegressor | 0.4731 | 1.27376e+06 | 0.240908 | 1.62246e+12 |
| 1 | RandomForestRegression | 0.437476 | 1.31611e+06 | 0.246043 | 1.73215e+12 |
| 3 | LGBoostRegressor | -0.0563305 | 1.80353e+06 | 0.318165 | 3.2527e+12 |
| 5 | SupportVectorMachine | -0.230739 | 1.94673e+06 | 0.341145 | 3.78975e+12 |
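The "columns label encoded" line in the log corresponds to turning each categorical column into integer codes. A minimal sketch with sklearn's LabelEncoder; whether easeml uses LabelEncoder or pandas category codes internally is an assumption.

from sklearn.preprocessing import LabelEncoder

df_enc = df.copy()
for col in df_enc.select_dtypes('object').columns:   ## the categorical columns listed in the log
    df_enc[col] = LabelEncoder().fit_transform(df_enc[col].astype(str))
print(df_enc.dtypes.value_counts())                  ## every column is numeric after encoding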
df = es.importdata('TrainDataset.csv')
es.quick_ml(df,'popularity','c')
Dataframe Imported Successfully

*** Dataset Information ***
##################################################
No of Rows    : 1302
No of columns : 7
No of Numerical columns  : 7
No of Categorical columns: 0
##################################################
Total Missing values : 0
##################################################
Summary of DataFrame:
         Column Name  Nulls/NaN  outof  Unique Type of Columns
0       buying_price          0   1302       4       Numerical
1  maintainence_cost          0   1302       4       Numerical
2    number_of_doors          0   1302       4       Numerical
3    number_of_seats          0   1302       3       Numerical
4  luggage_boot_size          0   1302       3       Numerical
5      safety_rating          0   1302       3       Numerical
6         popularity          0   1302       4       Numerical

Null value summary:
Empty DataFrame
Columns: [Total, Percent]
Index: []

columns dropped Index([], dtype='object')
columns dropped [] []
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.9623893476106086
Report:
              precision    recall  f1-score   support

           1       1.00      0.96      0.98       277
           2       0.90      0.96      0.93        94
           3       0.86      1.00      0.92        12
           4       0.80      1.00      0.89         8

    accuracy                           0.96       391
   macro avg       0.89      0.98      0.93       391
weighted avg       0.96      0.96      0.96       391

-----------------------------------------
Executing RFE Please wait.........
### RFE selected columns:
Index(['buying_price', 'maintainence_cost', 'number_of_doors',
       'number_of_seats', 'luggage_boot_size', 'safety_rating'],
      dtype='object')

RFE Selected Features
Please wait Training-Testing with all models..
Done with LogisticRegression
Done with RandomForestClassifier
Done with XGBoost Classifier
Done with LGBoost Classifier
Done with AdaBoost Classifier
Done with GradientBoostingClassifier
|   | Model | F1 score | AUC-ROC score | Accuracy | Confusion Matrix |
|---|---|---|---|---|---|
| 1 | RandomForestClassifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 3 | LGBoost Classifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 2 | XGBoost Classifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 5 | GradientBoostingClassifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 4 | AdaBoost Classifier | 0.898222 | 0.865079 | 0.9 | [[20 1] [ 2 7]] |
| 0 | LogisticRegression | 0.822222 | 0.753968 | 0.833333 | [[20 1] [ 4 5]] |
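The columns of these result tables map directly onto sklearn.metrics. A small sketch with hypothetical arrays, purely for illustration; the averaging easeml uses for multi-class F1 is an assumption.

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, confusion_matrix

y_true  = [0, 0, 0, 1, 1]             ## hypothetical labels, for illustration only
y_hat   = [0, 0, 1, 1, 1]             ## hypothetical hard predictions
y_proba = [0.1, 0.2, 0.6, 0.7, 0.9]   ## hypothetical probability of class 1

print(f1_score(y_true, y_hat, average='weighted'))   ## "F1 score"
print(roc_auc_score(y_true, y_proba))                ## "AUC-ROC score"
print(accuracy_score(y_true, y_hat))                 ## "Accuracy"
print(confusion_matrix(y_true, y_hat))               ## "Confusion Matrix"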
df1 = es.importdata('train_split.csv')
es.info(df1)
es.box_hist_plot(df1,'emp_length')
df1 = es.drop_columns(df1,20,'member_id','pymnt_plan','batch_enrolled')
es.missingdata(df1)
es.extract_number(df1,'emp_length')
df1 = es.fillnulls(df1,'unknown','emp_title','title')
es.box_hist_plot(df1,'revol_util','tot_cur_bal','total_rev_hi_lim')
df1 = es.dropcolumns(df1,'tot_coll_amt')
df1 = es.fillnulls(df1,'mean','revol_util','tot_cur_bal','total_rev_hi_lim')
df2 = es.label_encode(df1)
es.corr_heatmap(df2,'interactive')
es.dropcolumns(df2,'funded_amnt_inv','funded_amnt','collections_12_mths_ex_med')
es.showbias(df1,'loan_status')
X = df2.drop('loan_status',axis=1)
y = df2['loan_status']
es.stat_models(y,X)
es.VIF(df2)
from sklearn.model_selection import train_test_split ## needed for the manual split below
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=.7,random_state=101)
print('\n *************** Random Forest ***************\n')
y_pred, rfc = es.randomforest_classifier(X_train,y_train,X_test,y_test)
es.classification_result(y_test,y_pred)
es.roc_curve_graph(y_test, y_pred)
Dataframe Imported Successfully

*** Dataset Information ***
##################################################
No of Rows    : 63999
No of columns : 45
No of Numerical columns  : 27
No of Categorical columns: 18
##################################################
Total Missing values : 286991
##################################################
Summary of DataFrame:
                    Column Name  Nulls/NaN  outof  Unique Type of Columns
0                     member_id          0  63999   63999       Numerical
1                     loan_amnt          0  63999    1273       Numerical
2                   funded_amnt          0  63999    1275       Numerical
3               funded_amnt_inv          0  63999    1965       Numerical
4                          term          0  63999       2     Categorical
5                batch_enrolled      10264  63999     102     Categorical
6                      int_rate          0  63999     459       Numerical
7                         grade          0  63999       7     Categorical
8                     sub_grade          0  63999      35     Categorical
9                     emp_title       3826  63999   31609     Categorical
10                   emp_length       3324  63999      11     Categorical
11               home_ownership          0  63999       6     Categorical
12                   annual_inc          0  63999    6368       Numerical
13          verification_status          0  63999       3     Categorical
14                   pymnt_plan          0  63999       1     Categorical
15                         desc      54849  63999    8757     Categorical
16                      purpose          0  63999      14     Categorical
17                        title         13  63999    6409     Categorical
18                     zip_code          0  63999     866     Categorical
19                   addr_state          0  63999      51     Categorical
20                          dti          0  63999    3943       Numerical
21                  delinq_2yrs          0  63999      19       Numerical
22               inq_last_6mths          0  63999      15       Numerical
23       mths_since_last_delinq      32831  63999     103       Numerical
24       mths_since_last_record      54349  63999     121       Numerical
25                     open_acc          0  63999      54       Numerical
26                      pub_rec          0  63999      14       Numerical
27                    revol_bal          0  63999   30608       Numerical
28                   revol_util         29  63999    1090       Numerical
29                    total_acc          0  63999     101       Numerical
30          initial_list_status          0  63999       2     Categorical
31                total_rec_int          0  63999   50744       Numerical
32           total_rec_late_fee          0  63999     600       Numerical
33                   recoveries          0  63999    1754       Numerical
34      collection_recovery_fee          0  63999    1653       Numerical
35   collections_12_mths_ex_med          8  63999       5       Numerical
36  mths_since_last_major_derog      48155  63999     148       Numerical
37             application_type          0  63999       2     Categorical
38    verification_status_joint      63971  63999       3     Categorical
39                last_week_pay          0  63999      87     Categorical
40               acc_now_delinq          0  63999       5       Numerical
41                 tot_coll_amt       5124  63999    2579       Numerical
42                  tot_cur_bal       5124  63999   52207       Numerical
43             total_rev_hi_lim       5124  63999    3617       Numerical
44                  loan_status          0  63999       2       Numerical

Null value summary:
                             Total    Percent
verification_status_joint    63971  99.956249
desc                         54849  85.702902
mths_since_last_record       54349  84.921639
mths_since_last_major_derog  48155  75.243363
mths_since_last_delinq       32831  51.299239
batch_enrolled               10264  16.037751
tot_cur_bal                   5124   8.006375
tot_coll_amt                  5124   8.006375
total_rev_hi_lim              5124   8.006375
emp_title                     3826   5.978218
emp_length                    3324   5.193831
revol_util                      29   0.045313
title                           13   0.020313
collections_12_mths_ex_med       8   0.012500

columns dropped Index(['verification_status_joint', 'desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq'], dtype='object')
columns dropped ['verification_status_joint', 'desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq'] ['member_id', 'pymnt_plan', 'batch_enrolled']
columns imputed with 'unknown' are ['emp_title', 'title']
unknown filled in place of NaNs
unknown filled in place of NaNs

columns imputed with 'mean' are ['revol_util', 'tot_cur_bal', 'total_rev_hi_lim']
revol_util filled with: 55.045863060809744
tot_cur_bal filled with: 139812.53492993632
total_rev_hi_lim filled with: 32223.922038216562

columns label encoded Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'title', 'zip_code', 'addr_state', 'initial_list_status', 'application_type', 'last_week_pay'], dtype='object')
************************************************************
                            OLS Regression Results
=======================================================================================
Dep. Variable:            loan_status   R-squared (uncentered):              0.357
Model:                            OLS   Adj. R-squared (uncentered):         0.357
Method:                 Least Squares   F-statistic:                         1110.
Date:                Tue, 17 Mar 2020   Prob (F-statistic):                   0.00
Time:                        15:52:24   Log-Likelihood:                    -30665.
No. Observations:               63999   AIC:                             6.139e+04
Df Residuals:                   63967   BIC:                             6.168e+04
Df Model:                          32
Covariance Type:            nonrobust
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
loan_amnt               -2.078e-06   2.67e-07     -7.784      0.000    -2.6e-06   -1.55e-06
term                       -0.0601      0.004    -14.037      0.000      -0.069      -0.052
int_rate                    0.0846      0.001     64.946      0.000       0.082       0.087
grade                      -0.0181      0.005     -3.321      0.001      -0.029      -0.007
sub_grade                  -0.0515      0.001    -34.544      0.000      -0.054      -0.049
emp_title               -2.238e-06   1.65e-07    -13.588      0.000   -2.56e-06   -1.92e-06
emp_length                 -0.0013      0.001     -2.548      0.011      -0.002      -0.000
home_ownership             -0.0027      0.001     -2.926      0.003      -0.004      -0.001
annual_inc              -1.405e-07   3.62e-08     -3.884      0.000   -2.11e-07   -6.96e-08
verification_status        -0.0059      0.002     -2.805      0.005      -0.010      -0.002
purpose                    -0.0040      0.001     -5.099      0.000      -0.005      -0.002
title                     3.48e-05   1.56e-06     22.313      0.000    3.17e-05    3.79e-05
zip_code                -3.354e-06   5.81e-06     -0.577      0.564    -1.47e-05    8.04e-06
addr_state                 -0.0005      0.000     -4.756      0.000      -0.001      -0.000
dti                        -0.0051      0.000    -23.481      0.000      -0.005      -0.005
delinq_2yrs                -0.0229      0.002    -12.517      0.000      -0.027      -0.019
inq_last_6mths              0.0232      0.002     14.117      0.000       0.020       0.026
open_acc                   -0.0076      0.000    -17.411      0.000      -0.008      -0.007
pub_rec                    -0.0394      0.003    -13.728      0.000      -0.045      -0.034
revol_bal               -2.196e-07   1.51e-07     -1.451      0.147   -5.16e-07    7.71e-08
revol_util                 -0.0009   8.26e-05    -10.311      0.000      -0.001      -0.001
total_acc                   0.0033      0.000     17.411      0.000       0.003       0.004
initial_list_status        -0.1331      0.003    -40.528      0.000      -0.140      -0.127
total_rec_int           -2.848e-06   1.04e-06     -2.740      0.006   -4.89e-06   -8.11e-07
total_rec_late_fee         -0.0006      0.000     -1.704      0.088      -0.001    9.44e-05
recoveries                 -0.0001   6.84e-06    -20.976      0.000      -0.000      -0.000
collection_recovery_fee     0.0004   4.18e-05      8.410      0.000       0.000       0.000
application_type           -0.0415      0.074     -0.561      0.575      -0.186       0.103
last_week_pay              -0.0004   6.09e-05     -7.363      0.000      -0.001      -0.000
acc_now_delinq             -0.0310      0.020     -1.588      0.112      -0.069       0.007
tot_cur_bal              1.139e-07   1.46e-08      7.819      0.000    8.54e-08    1.43e-07
total_rev_hi_lim         2.649e-07   1.11e-07      2.381      0.017    4.68e-08    4.83e-07
==============================================================================
Omnibus:                     7448.626   Durbin-Watson:                   2.001
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            10200.031
Skew:                           0.970   Prob(JB):                         0.00
Kurtosis:                       2.750   Cond. No.                     1.05e+07
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.05e+07. This might indicate that there are strong multicollinearity or other numerical problems.
******** VIF *********
                        feature     vif
4                     sub_grade  153.09
2                      int_rate  147.28
3                         grade   61.36
17                     open_acc   12.90
21                    total_acc   11.97
31             total_rev_hi_lim   10.39
20                   revol_util   10.31
0                     loan_amnt    8.65
14                          dti    7.85
19                    revol_bal    7.45
11                        title    5.76
28                last_week_pay    5.39
8                    annual_inc    4.71
5                     emp_title    4.33
7                home_ownership    4.27
12                     zip_code    3.89
30                  tot_cur_bal    3.65
13                   addr_state    3.49
23                total_rec_int    3.40
9           verification_status    3.15
10                      purpose    3.00
25                   recoveries    2.87
26      collection_recovery_fee    2.78
1                          term    2.32
22          initial_list_status    2.25
6                    emp_length    2.19
16               inq_last_6mths    1.72
32                  loan_status    1.56
15                  delinq_2yrs    1.20
18                      pub_rec    1.18
24           total_rec_late_fee    1.03
29               acc_now_delinq    1.03
27             application_type    1.00
*************** Random Forest ***************
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.781688137529659
Report:
              precision    recall  f1-score   support

           0       0.87      0.82      0.85     14622
           1       0.52      0.62      0.57      4578

    accuracy                           0.78     19200
   macro avg       0.70      0.72      0.71     19200
weighted avg       0.79      0.78      0.78     19200
-----------------------------------------
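es.VIF prints a variance inflation factor table like the one above. A minimal sketch of the same calculation with statsmodels; easeml's exact implementation is an assumption.

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

Xc = add_constant(df2.select_dtypes('number').dropna())   ## add an intercept so the VIFs are not artificially inflated
vif = pd.DataFrame({'feature': Xc.columns,
                    'vif': [variance_inflation_factor(Xc.values, i) for i in range(Xc.shape[1])]})
print(vif[vif.feature != 'const'].sort_values('vif', ascending=False).round(2))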
y_pred, model = es.classification(X,y)
es.classification_result(y_test,y_pred)
es.roc_curve_graph(y_test, y_pred)
es.quick_pred(X,y,'c')
########## MENU ##############
1. Logistic Regression
2. Random Forest
3. XGB Classifier
4. LGB Classifier
5. Gradient Boosting Classifier
6. AdaBoost Classifier
99. For all models
Input No, which you want: 6
Predicting with Ada Boost Classifier

Model used: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                               n_estimators=100, random_state=None)
Done.
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.6576365174253705
Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81     14622
           1       0.23      0.12      0.16      4578

    accuracy                           0.69     19200
   macro avg       0.50      0.50      0.49     19200
weighted avg       0.63      0.69      0.66     19200
-----------------------------------------
Please wait Training-Testing with all models..
Done with LogisticRegression
Done with RandomForestClassifier
Done with XGBoost Classifier
Done with LGBoost Classifier
Done with AdaBoost Classifier
Done with GradientBoostingClassifier
|   | Model | F1 score | AUC-ROC score | Accuracy | Confusion Matrix |
|---|---|---|---|---|---|
| 3 | LGBoost Classifier | 0.926282 | 0.8 | 0.933333 | [[25 0] [ 2 3]] |
| 5 | GradientBoostingClassifier | 0.926282 | 0.8 | 0.933333 | [[25 0] [ 2 3]] |
| 2 | XGBoost Classifier | 0.881402 | 0.7 | 0.9 | [[25 0] [ 3 2]] |
| 4 | AdaBoost Classifier | 0.881402 | 0.7 | 0.9 | [[25 0] [ 3 2]] |
| 1 | RandomForestClassifier | 0.852564 | 0.68 | 0.866667 | [[24 1] [ 3 2]] |
| 0 | LogisticRegression | 0.757576 | 0.5 | 0.833333 | [[25 0] [ 5 0]] |
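The es.roc_curve_graph(y_test, y_pred) calls earlier in this example draw the ROC curve for the fitted classifier. A hedged sketch of the same plot with sklearn.metrics and plotly; whether easeml plots with plotly or matplotlib internally is an assumption.

from sklearn.metrics import roc_curve, auc
import plotly.graph_objs as go

fpr, tpr, _ = roc_curve(y_test, y_pred)   ## passing predicted probabilities instead of hard labels gives a smoother curve
fig = go.Figure(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'AUC = {auc(fpr, tpr):.3f}'))
fig.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line=dict(dash='dash'))   ## chance diagonal
fig.update_layout(title='ROC curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate')
fig.show()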
## car popularity dataset
df = es.importdata('TrainDataset.csv')
es.info(df)
X = df.drop('popularity',axis=1)
y = df['popularity']
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)
es.stat_models(y,X)
es.VIF(df)
y_pred , model = es.classification(X,y)
print('\n\n*** Start Hypertuning ***\n')
y_pred,grid = es.grid_search(model,'quick',X_train,y_train,X_test,y_test)
print('\n\n*** Grid Search CV Result ***\n')
es.classification_result(y_test,y_pred)
Dataframe Imported Successfully

*** Dataset Information ***
##################################################
No of Rows    : 1302
No of columns : 7
No of Numerical columns  : 7
No of Categorical columns: 0
##################################################
Total Missing values : 0
##################################################
Summary of DataFrame:
         Column Name  Nulls/NaN  outof  Unique Type of Columns
0       buying_price          0   1302       4       Numerical
1  maintainence_cost          0   1302       4       Numerical
2    number_of_doors          0   1302       4       Numerical
3    number_of_seats          0   1302       3       Numerical
4  luggage_boot_size          0   1302       3       Numerical
5      safety_rating          0   1302       3       Numerical
6         popularity          0   1302       4       Numerical
************************************************************
                            OLS Regression Results
=======================================================================================
Dep. Variable:             popularity   R-squared (uncentered):              0.894
Model:                            OLS   Adj. R-squared (uncentered):         0.894
Method:                 Least Squares   F-statistic:                         1825.
Date:                Tue, 17 Mar 2020   Prob (F-statistic):                   0.00
Time:                        16:27:41   Log-Likelihood:                    -915.04
No. Observations:                1302   AIC:                                 1842.
Df Residuals:                    1296   BIC:                                 1873.
Df Model:                           6
Covariance Type:            nonrobust
=====================================================================================
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
buying_price         -0.1362      0.012    -11.721      0.000      -0.159      -0.113
maintainence_cost    -0.1092      0.011     -9.546      0.000      -0.132      -0.087
number_of_doors       0.0537      0.011      5.102      0.000       0.033       0.074
number_of_seats       0.2138      0.009     22.528      0.000       0.195       0.232
luggage_boot_size     0.1288      0.015      8.385      0.000       0.099       0.159
safety_rating         0.3725      0.015     24.433      0.000       0.343       0.402
==============================================================================
Omnibus:                      281.511   Durbin-Watson:                   2.017
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              701.940
Skew:                           1.150   Prob(JB):                    3.76e-153
Kurtosis:                       5.766   Cond. No.                         8.35
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

******** VIF *********
             feature    vif
3    number_of_seats  10.14
6         popularity   9.45
5      safety_rating   8.54
2    number_of_doors   8.28
4  luggage_boot_size   6.26
0       buying_price   6.20
1  maintainence_cost   5.73

########## MENU ##############
1. Logistic Regression
2. Random Forest
3. XGB Classifier
4. LGB Classifier
5. Gradient Boosting Classifier
6. AdaBoost Classifier
99. For all models
Input No, which you want: 2
Predicting with Random Forest Classifier
Model used: RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='gini',
                                   max_depth=8, max_features='auto', max_leaf_nodes=None,
                                   min_impurity_decrease=0.0, min_impurity_split=None,
                                   min_samples_leaf=1, min_samples_split=2,
                                   min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
                                   oob_score=False, random_state=101, verbose=0, warm_start=False)
Done.

*** Start Hypertuning ***

Performing GridSearchCV Please wait............
Done ¯\_(ツ)_/¯
best parameters:
RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='gini',
                       max_depth=80, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
                       oob_score=False, random_state=101, verbose=0, warm_start=False)

*** Grid Search CV Result ***

-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.939271732329321
Report:
              precision    recall  f1-score   support

           1       1.00      0.94      0.97       277
           2       0.86      0.90      0.88        94
           3       0.67      1.00      0.80        12
           4       0.62      1.00      0.76         8

    accuracy                           0.94       391
   macro avg       0.79      0.96      0.85       391
weighted avg       0.95      0.94      0.94       391
-----------------------------------------
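es.grid_search wraps a hyperparameter search. A minimal sketch of an equivalent GridSearchCV run; the parameter grid and scoring below are illustrative assumptions, chosen to cover the "best parameters" printed above.

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [30, 100],
              'max_depth': [8, 80],
              'min_samples_leaf': [1, 3],
              'min_samples_split': [2, 8]}
search = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=101),
                      param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)
search.fit(X_train, y_train)                 ## exhaustively tries every parameter combination with cross-validation
print(search.best_estimator_)                ## compare with the "best parameters" block above
y_pred_grid = search.predict(X_test)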
es.quick_pred(X,y,'c') ## on data that is already clean, quick_pred gives an immediate result
Please wait Training-Testing with all models..
Done with LogisticRegression
Done with RandomForestClassifier
Done with XGBoost Classifier
Done with LGBoost Classifier
Done with AdaBoost Classifier
Done with GradientBoostingClassifier
|   | Model | F1 score | AUC-ROC score | Accuracy | Confusion Matrix |
|---|---|---|---|---|---|
| 1 | RandomForestClassifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 3 | LGBoost Classifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 2 | XGBoost Classifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 5 | GradientBoostingClassifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 4 | AdaBoost Classifier | 0.898222 | 0.865079 | 0.9 | [[20 1] [ 2 7]] |
| 0 | LogisticRegression | 0.822222 | 0.753968 | 0.833333 | [[20 1] [ 4 5]] |