Building Indicators through Principal Component Analysis

Fernando Santa

2018-06-12

> rm(list = ls())
> Dataset <-read.table("/Users/lfsantag83/Desktop/Composite Indicators/workshop_Agile_12062018.txt",header=TRUE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)
> row.names(Dataset) <- as.character(Dataset$city)
> Dataset$city <- NULL
> numSummary(Dataset[,c("gdp", "inf_mort", "lif_exp_fem", "lif_exp_male", 
+   "n_hig_edu_insts", "n_hosp", "pm10", "unem_rate"), drop=FALSE], statistics=c("mean", 
+   "sd", "quantiles", "cv", "skewness", "kurtosis"), quantiles=c(0,1), type="2")
                       mean           sd         cv   skewness   kurtosis
gdp             39.09432692  17.32784006 0.44323157 -0.3275936 -0.4877458
inf_mort         8.98028846  12.38886361 1.37956188  3.2248594 11.8258756
lif_exp_fem     79.57673077   6.54686775 0.08227113 -2.0273715  4.3217582
lif_exp_male    74.65346154   6.35579306 0.08513729 -1.8709156  3.5126388
n_hig_edu_insts 40.17307692  47.60899616 1.18509708  2.9223122 10.1698423
n_hosp          93.03846154 131.72356103 1.41579685  2.6259560  7.9295426
pm10            46.69230769  38.61145744 0.82693401  2.0050463  4.2983292
unem_rate        0.07961731   0.06288962 0.78989888  2.2376694  8.1304583
                     0%   100%  n
gdp              0.5000  69.30 52
inf_mort         0.0350  69.00 52
lif_exp_fem     57.0000  88.00 52
lif_exp_male    53.0000  82.00 52
n_hig_edu_insts  2.0000 264.00 52
n_hosp           2.0000 658.00 52
pm10            12.0000 198.00 52
unem_rate        0.0004   0.37 52
> with(Dataset, Hist(inf_mort, scale="percent", breaks="Sturges", col="darkgray"))

plot of chunk unnamed-chunk-8

> round(cor(Dataset[,c("gdp","inf_mort","lif_exp_fem","lif_exp_male","n_hig_edu_insts",
+   "n_hosp",
+   "pm10","unem_rate")], use="complete"),2)
                  gdp inf_mort lif_exp_fem lif_exp_male n_hig_edu_insts
gdp              1.00    -0.54        0.66         0.58           -0.01
inf_mort        -0.54     1.00       -0.78        -0.71           -0.05
lif_exp_fem      0.66    -0.78        1.00         0.94           -0.12
lif_exp_male     0.58    -0.71        0.94         1.00           -0.25
n_hig_edu_insts -0.01    -0.05       -0.12        -0.25            1.00
n_hosp           0.02    -0.08        0.16         0.09            0.40
pm10            -0.74     0.49       -0.62        -0.51            0.22
unem_rate       -0.34     0.60       -0.47        -0.43           -0.32
                n_hosp  pm10 unem_rate
gdp               0.02 -0.74     -0.34
inf_mort         -0.08  0.49      0.60
lif_exp_fem       0.16 -0.62     -0.47
lif_exp_male      0.09 -0.51     -0.43
n_hig_edu_insts   0.40  0.22     -0.32
n_hosp            1.00  0.01     -0.31
pm10              0.01  1.00      0.10
unem_rate        -0.31  0.10      1.00
> scatterplot(inf_mort~lif_exp_fem, regLine=TRUE, smooth=FALSE, boxplots=FALSE, 
+   data=Dataset)

plot of chunk unnamed-chunk-10

> scatterplotMatrix(~gdp+inf_mort+lif_exp_fem+lif_exp_male+n_hig_edu_insts+n_hosp+pm10+unem_rate,
+   
+    regLine=TRUE, smooth=FALSE, diagonal=list(method="histogram"), data=Dataset)

plot of chunk unnamed-chunk-11

> Dataset.PCA<-Dataset[, c("gdp", "unem_rate", "n_hig_edu_insts", "inf_mort", 
+   "lif_exp_male", "lif_exp_fem", "n_hosp", "pm10")]
> res<-PCA(Dataset.PCA , scale.unit=TRUE, ncp=5, graph = FALSE)
> res.hcpc<-HCPC(res ,nb.clust=0,consol=TRUE,min=3,max=10,graph=TRUE)
[1] "Click on the graph to cut the tree"
Error in while (coupe$y < min(t$tree$height)) {: argument is of length zero
> res.hcpc$data.clust[,ncol(res.hcpc$data.clust),drop=F]
              clust
Amsterdam         6
Bangkok           3
Barcelona         4
Beijing           7
Berlin            4
Bogota            3
Boston            6
Brussels          4
Buenos Aires      3
Cairo             3
Chicago           5
Cologne           4
Copenhagen        6
Delhi             2
Dhaka             1
Dubai             3
Dublin            5
Frankfurt         6
Guangzhou         7
Helsinki          6
Hong Kong         4
Houston           5
Istanbul          3
Jakarta           3
Jerusalem         4
Johannesburg      1
Lima              3
Lisbon            5
London            5
Los Angeles       5
Madrid            4
Melbourne         6
Moscow            2
Mumbai            1
New York City     5
Oslo              6
Paris             4
Prague            5
Rome              4
San Francisco     8
Sao Paulo         3
Seoul             7
Shenzhen          3
Singapore         6
Stockholm         6
Sydney            8
Taipei            4
Tokyo             7
Vancouver         6
Vienna            4
Warsaw            5
Zurich            6
> plot.PCA(res, axes=c(1, 2), choix="var", new.plot=TRUE, col.var="black",
+   col.quanti.sup="blue", label=c("var", "quanti.sup"), lim.cos2.var=0)
> summary(res, nb.dec = 3, nbelements=10, nbind = 10, ncp = 3, file="")

Call:
PCA(X = Dataset.PCA, scale.unit = TRUE, ncp = 5, graph = FALSE) 


Eigenvalues
                       Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
Variance               3.933   1.710   0.807   0.705   0.361   0.289
% of var.             49.168  21.377  10.089   8.813   4.510   3.613
Cumulative % of var.  49.168  70.545  80.634  89.447  93.957  97.570
                       Dim.7   Dim.8
Variance               0.154   0.040
% of var.              1.926   0.504
Cumulative % of var.  99.496 100.000

Individuals (the 10 first)
                    Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
Amsterdam       |  1.230 |  0.837  0.342  0.463 | -0.596  0.400  0.235 |
Bangkok         |  1.931 | -0.811  0.321  0.176 |  0.975  1.068  0.255 |
Barcelona       |  1.838 |  0.399  0.078  0.047 | -1.329  1.988  0.523 |
Beijing         |  4.524 |  0.165  0.013  0.001 |  3.625 14.781  0.642 |
Berlin          |  2.270 |  1.026  0.515  0.204 | -0.960  1.037  0.179 |
Bogota          |  1.854 | -1.207  0.712  0.424 | -0.422  0.200  0.052 |
Boston          |  2.066 |  1.241  0.753  0.361 | -0.666  0.499  0.104 |
Brussels        |  1.668 |  0.475  0.110  0.081 | -1.172  1.544  0.494 |
Buenos Aires    |  1.037 | -0.181  0.016  0.030 | -0.095  0.010  0.008 |
Cairo           |  3.264 | -2.546  3.168  0.608 | -0.155  0.027  0.002 |
                 Dim.3    ctr   cos2  
Amsterdam       -0.150  0.054  0.015 |
Bangkok         -0.411  0.402  0.045 |
Barcelona       -0.055  0.007  0.001 |
Beijing         -0.934  2.076  0.043 |
Berlin          -0.084  0.017  0.001 |
Bogota          -1.092  2.842  0.347 |
Boston           0.824  1.617  0.159 |
Brussels         0.457  0.498  0.075 |
Buenos Aires    -0.496  0.587  0.229 |
Cairo           -1.853  8.179  0.322 |

Variables
                   Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
gdp             |  0.795 16.080  0.633 | -0.121  0.856  0.015 |  0.410
unem_rate       | -0.582  8.597  0.338 | -0.584 19.944  0.341 |  0.331
n_hig_edu_insts | -0.073  0.135  0.005 |  0.834 40.629  0.695 |  0.252
inf_mort        | -0.854 18.554  0.730 | -0.124  0.896  0.015 |  0.240
lif_exp_male    |  0.888 20.035  0.788 | -0.123  0.887  0.015 | -0.248
lif_exp_fem     |  0.945 22.707  0.893 | -0.039  0.089  0.002 | -0.104
n_hosp          |  0.157  0.630  0.025 |  0.710 29.462  0.504 |  0.289
pm10            | -0.722 13.261  0.522 |  0.352  7.236  0.124 | -0.502
                   ctr   cos2  
gdp             20.843  0.168 |
unem_rate       13.602  0.110 |
n_hig_edu_insts  7.846  0.063 |
inf_mort         7.114  0.057 |
lif_exp_male     7.642  0.062 |
lif_exp_fem      1.341  0.011 |
n_hosp          10.344  0.083 |
pm10            31.267  0.252 |
> remove(Dataset.PCA)
> res$var$coord
                      Dim.1       Dim.2      Dim.3       Dim.4       Dim.5
gdp              0.79530713 -0.12099413  0.4101562 -0.23978640 -0.17544298
unem_rate       -0.58151235 -0.58401541  0.3313367  0.19920757  0.39753453
n_hig_edu_insts -0.07282457  0.83355228  0.2516565 -0.36697213  0.29274951
inf_mort        -0.85429783 -0.12380783  0.2396279  0.15785666 -0.18249036
lif_exp_male     0.88773426 -0.12317755 -0.2483541  0.22322452  0.12358889
lif_exp_fem      0.94508498 -0.03895879 -0.1040426  0.15483775  0.17695918
n_hosp           0.15747224  0.70981738  0.2889502  0.61088594 -0.06795649
pm10            -0.72222539  0.35177188 -0.5023590  0.03602609  0.04172349
> Dataset1=Dataset
> summary(powerTransform(Dataset1[,1]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 1]    1.0164           1       0.6025       1.4303

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                           LRT df       pval
LR test, lambda = (0) 44.29979  1 2.8175e-11

Likelihood ratio test that no transformation is needed
                              LRT df    pval
LR test, lambda = (1) 0.006077443  1 0.93786
> summary(powerTransform(Dataset1[,2]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 2]    0.4039         0.5       0.2031       0.6047

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                          LRT df         pval
LR test, lambda = (0) 20.4956  1 0.0000059768

Likelihood ratio test that no transformation is needed
                           LRT df          pval
LR test, lambda = (1) 25.64775  1 0.00000040977
> Dataset1[,2] <- with(Dataset1, sqrt(Dataset1[,2]))
> summary(powerTransform(Dataset1[,3]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 3]   -0.0467           0      -0.2708       0.1775

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                            LRT df    pval
LR test, lambda = (0) 0.1655132  1 0.68413

Likelihood ratio test that no transformation is needed
                           LRT df       pval
LR test, lambda = (1) 71.33823  1 < 2.22e-16
> Dataset1[,3] <- with(Dataset1, log(Dataset1[,3]))
> summary(powerTransform(Dataset1[,4]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 4]    0.1418        0.14       0.0052       0.2784

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                           LRT df     pval
LR test, lambda = (0) 4.667473  1 0.030739

Likelihood ratio test that no transformation is needed
                          LRT df       pval
LR test, lambda = (1) 82.7025  1 < 2.22e-16
> Dataset1[,4] <- with(Dataset1, Dataset1[,4]^0.14)
> summary(powerTransform(Dataset1[,5]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 5]    9.4917        9.49        5.822      13.1614

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                           LRT df       pval
LR test, lambda = (0) 39.08902  1 4.0491e-10

Likelihood ratio test that no transformation is needed
                           LRT df           pval
LR test, lambda = (1) 29.90819  1 0.000000045299
> summary(powerTransform(Dataset1[,6]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 6]    9.0214        9.02       5.6794      12.3633

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                           LRT df       pval
LR test, lambda = (0) 40.43526  1 2.0324e-10

Likelihood ratio test that no transformation is needed
                           LRT df           pval
LR test, lambda = (1) 30.82181  1 0.000000028284
> summary(powerTransform(Dataset1[,7]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 7]   -0.0154           0      -0.2036       0.1728

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                             LRT df    pval
LR test, lambda = (0) 0.02565001  1 0.87276

Likelihood ratio test that no transformation is needed
                           LRT df       pval
LR test, lambda = (1) 90.44312  1 < 2.22e-16
> Dataset1[,7] <- with(Dataset1, log(Dataset1[,7]))
> summary(powerTransform(Dataset1[,8]))
bcPower Transformation to Normality 
              Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Dataset1[, 8]   -0.4156        -0.5      -0.8138      -0.0173

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                           LRT df     pval
LR test, lambda = (0) 4.292003  1 0.038292

Likelihood ratio test that no transformation is needed
                           LRT df       pval
LR test, lambda = (1) 50.86919  1 9.8732e-13
> Dataset1[,8] <- with(Dataset1, 1/sqrt(Dataset1[,8]))
> numSummary(Dataset1[,c("gdp", "inf_mort", "lif_exp_fem", "lif_exp_male", 
+   "n_hig_edu_insts", "n_hosp", "pm10", "unem_rate"), drop=FALSE], statistics=c("mean", 
+   "sd", "cv", "skewness", "kurtosis"), quantiles=c(0,1), type="1")
                      mean          sd         cv    skewness   kurtosis
gdp             39.0943269 17.32784006 0.44323157 -0.31806705 -0.5553005
inf_mort         1.2710812  0.19320465 0.15200024  0.12642537  2.3539381
lif_exp_fem     79.5767308  6.54686775 0.08227113 -1.96841461  3.8040354
lif_exp_male    74.6534615  6.35579306 0.08513729 -1.81650850  3.0706493
n_hig_edu_insts  3.2394621  0.94101758 0.29048575  0.14590407  0.2949641
n_hosp           3.7585215  1.28760674 0.34258331  0.05020693 -0.3841216
pm10             0.1747114  0.05402128 0.30920296  0.03775801 -0.6210449
unem_rate        0.2622804  0.10506454 0.40058090  0.43835777  1.2590704
                 n
gdp             52
inf_mort        52
lif_exp_fem     52
lif_exp_male    52
n_hig_edu_insts 52
n_hosp          52
pm10            52
unem_rate       52
> with(Dataset1, Hist(inf_mort, scale="percent", breaks="Sturges", col="darkgray"))

plot of chunk unnamed-chunk-35

> round(cor(Dataset1[,c("gdp","inf_mort","lif_exp_fem","lif_exp_male","n_hig_edu_insts",
+   "n_hosp",
+   "pm10","unem_rate")], use="complete"),2)
                  gdp inf_mort lif_exp_fem lif_exp_male n_hig_edu_insts
gdp              1.00    -0.46        0.66         0.58            0.08
inf_mort        -0.46     1.00       -0.71        -0.66           -0.10
lif_exp_fem      0.66    -0.71        1.00         0.94            0.07
lif_exp_male     0.58    -0.66        0.94         1.00            0.00
n_hig_edu_insts  0.08    -0.10        0.07         0.00            1.00
n_hosp           0.04    -0.12        0.16         0.05            0.29
pm10             0.73    -0.27        0.52         0.46           -0.16
unem_rate       -0.31     0.36       -0.37        -0.33           -0.45
                n_hosp  pm10 unem_rate
gdp               0.04  0.73     -0.31
inf_mort         -0.12 -0.27      0.36
lif_exp_fem       0.16  0.52     -0.37
lif_exp_male      0.05  0.46     -0.33
n_hig_edu_insts   0.29 -0.16     -0.45
n_hosp            1.00 -0.05     -0.30
pm10             -0.05  1.00     -0.11
unem_rate        -0.30 -0.11      1.00
> scatterplotMatrix(~gdp+inf_mort+lif_exp_fem+lif_exp_male+n_hig_edu_insts+n_hosp+pm10+unem_rate,
+   
+    regLine=TRUE, smooth=FALSE, diagonal=list(method="histogram"), data=Dataset1)

plot of chunk unnamed-chunk-37

> Dataset1.PCA<-Dataset1[, c("gdp", "unem_rate", "n_hig_edu_insts", "inf_mort", 
+   "lif_exp_male", "lif_exp_fem", "n_hosp", "pm10")]
> res<-PCA(Dataset1.PCA , scale.unit=TRUE, ncp=5, graph = FALSE)
> res.hcpc<-HCPC(res ,nb.clust=0,consol=TRUE,min=3,max=10,graph=TRUE)
[1] "Click on the graph to cut the tree"
Error in while (coupe$y < min(t$tree$height)) {: argument is of length zero
> res.hcpc$data.clust[,ncol(res.hcpc$data.clust),drop=F]
              clust
Amsterdam         6
Bangkok           3
Barcelona         4
Beijing           7
Berlin            4
Bogota            3
Boston            6
Brussels          4
Buenos Aires      3
Cairo             3
Chicago           5
Cologne           4
Copenhagen        6
Delhi             2
Dhaka             1
Dubai             3
Dublin            5
Frankfurt         6
Guangzhou         7
Helsinki          6
Hong Kong         4
Houston           5
Istanbul          3
Jakarta           3
Jerusalem         4
Johannesburg      1
Lima              3
Lisbon            5
London            5
Los Angeles       5
Madrid            4
Melbourne         6
Moscow            2
Mumbai            1
New York City     5
Oslo              6
Paris             4
Prague            5
Rome              4
San Francisco     8
Sao Paulo         3
Seoul             7
Shenzhen          3
Singapore         6
Stockholm         6
Sydney            8
Taipei            4
Tokyo             7
Vancouver         6
Vienna            4
Warsaw            5
Zurich            6
> plot.PCA(res, axes=c(1, 2), choix="var", new.plot=TRUE, col.var="black",
+   col.quanti.sup="blue", label=c("var", "quanti.sup"), lim.cos2.var=0)
> summary(res, nb.dec = 3, nbelements=10, nbind = 10, ncp = 3, file="")

Call:
PCA(X = Dataset1.PCA, scale.unit = TRUE, ncp = 5, graph = FALSE) 


Eigenvalues
                       Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
Variance               3.635   1.658   0.844   0.756   0.484   0.366
% of var.             45.439  20.729  10.551   9.452   6.047   4.574
Cumulative % of var.  45.439  66.169  76.719  86.171  92.218  96.792
                       Dim.7   Dim.8
Variance               0.208   0.049
% of var.              2.596   0.612
Cumulative % of var.  99.388 100.000

Individuals (the 10 first)
                    Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
Amsterdam       |  1.661 |  0.614  0.200  0.137 |  0.900  0.940  0.294 |
Bangkok         |  2.337 | -1.055  0.589  0.204 | -1.804  3.774  0.596 |
Barcelona       |  2.154 |  0.010  0.000  0.000 |  1.407  2.297  0.427 |
Beijing         |  3.519 |  0.433  0.099  0.015 | -3.113 11.239  0.783 |
Berlin          |  2.298 |  1.019  0.549  0.197 |  0.491  0.279  0.046 |
Bogota          |  2.226 | -1.602  1.358  0.518 |  0.146  0.025  0.004 |
Boston          |  2.424 |  1.234  0.806  0.259 |  0.967  1.084  0.159 |
Brussels        |  1.533 |  0.363  0.070  0.056 |  0.831  0.801  0.294 |
Buenos Aires    |  1.015 | -0.537  0.153  0.280 | -0.436  0.221  0.185 |
Cairo           |  3.021 | -2.604  3.589  0.743 |  0.324  0.122  0.012 |
                 Dim.3    ctr   cos2  
Amsterdam        0.037  0.003  0.001 |
Bangkok          0.011  0.000  0.000 |
Barcelona       -0.689  1.081  0.102 |
Beijing         -1.237  3.487  0.124 |
Berlin          -0.741  1.252  0.104 |
Bogota          -0.928  1.960  0.174 |
Boston           1.422  4.609  0.344 |
Brussels        -0.107  0.026  0.005 |
Buenos Aires    -0.191  0.083  0.035 |
Cairo           -1.259  3.610  0.174 |

Variables
                   Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
gdp             |  0.810 18.058  0.656 |  0.184  2.045  0.034 |  0.415
unem_rate       | -0.512  7.203  0.262 |  0.610 22.447  0.372 | -0.089
n_hig_edu_insts |  0.143  0.560  0.020 | -0.790 37.668  0.625 |  0.201
inf_mort        | -0.765 16.088  0.585 |  0.073  0.320  0.005 |  0.439
lif_exp_male    |  0.884 21.481  0.781 |  0.121  0.890  0.015 | -0.294
lif_exp_fem     |  0.935 24.049  0.874 |  0.053  0.169  0.003 | -0.200
n_hosp          |  0.182  0.911  0.033 | -0.637 24.442  0.405 |  0.138
pm10            |  0.651 11.650  0.423 |  0.446 12.019  0.199 |  0.534
                   ctr   cos2  
gdp             20.367  0.172 |
unem_rate        0.937  0.008 |
n_hig_edu_insts  4.773  0.040 |
inf_mort        22.882  0.193 |
lif_exp_male    10.248  0.087 |
lif_exp_fem      4.731  0.040 |
n_hosp           2.247  0.019 |
pm10            33.815  0.285 |
> remove(Dataset1.PCA)
> res$var$coord
                     Dim.1       Dim.2       Dim.3       Dim.4       Dim.5
gdp              0.8102051  0.18416302  0.41461720 -0.06952242  0.07886880
unem_rate       -0.5117151  0.61011627 -0.08893343  0.25196075  0.54230042
n_hig_edu_insts  0.1427373 -0.79035660  0.20072102 -0.37027498  0.41058218
inf_mort        -0.7647270  0.07283749  0.43947445  0.01597111 -0.06723370
lif_exp_male     0.8836599  0.12148247 -0.29411346  0.01153974  0.03448303
lif_exp_fem      0.9349980  0.05293423 -0.19982501  0.06884214  0.08987651
n_hosp           0.1819843 -0.63666320  0.13770648  0.73585128  0.02290250
pm10             0.6507592  0.44644816  0.53424659  0.06430530 -0.02318972