Data for this exercise is taken from Kaggle. It includes two types of data from countries around the world.

The Human Development Index (HDI) is a summary measure of achievements in key dimensions of human development. The Gender Inequality Index (GII) reflects gender-based disadvantage in three dimensions: reproductive health, empowerment, and the labour market.

Here we explore the dimensions behind these two indicators.

Wrangle

# Tuomo Nieminen 2017

# meta
# browseURL("https://www.kaggle.com/undp/human-development")

# access dplyr
library(dplyr)

# read the human development data
hd <- read.csv("human_development.csv", stringsAsFactors = FALSE)

# read the gender inequality data, treating ".." entries as missing (NA)
gii <- read.csv("gender_inequality.csv", stringsAsFactors = FALSE,
                na.strings = "..")

# look at the data
glimpse(hd)
## Observations: 195
## Variables: 8
## $ HDI.Rank                               <int> 1, 2, 3, 4, 5, 6, 6, 8,...
## $ Country                                <chr> "Norway", "Australia", ...
## $ Human.Development.Index..HDI.          <dbl> 0.944, 0.935, 0.930, 0....
## $ Life.Expectancy.at.Birth               <dbl> 81.6, 82.4, 83.0, 80.2,...
## $ Expected.Years.of.Education            <dbl> 17.5, 20.2, 15.8, 18.7,...
## $ Mean.Years.of.Education                <dbl> 12.6, 13.0, 12.8, 12.7,...
## $ Gross.National.Income..GNI..per.Capita <chr> "64,992", "42,261", "56...
## $ GNI.per.Capita.Rank.Minus.HDI.Rank     <int> 5, 17, 6, 11, 9, 11, 16...
gii %>% glimpse()
## Observations: 195
## Variables: 10
## $ GII.Rank                                     <int> 1, 2, 3, 4, 5, 6,...
## $ Country                                      <chr> "Norway", "Austra...
## $ Gender.Inequality.Index..GII.                <dbl> 0.067, 0.110, 0.0...
## $ Maternal.Mortality.Ratio                     <int> 4, 6, 6, 5, 6, 7,...
## $ Adolescent.Birth.Rate                        <dbl> 7.8, 12.1, 1.9, 5...
## $ Percent.Representation.in.Parliament         <dbl> 39.6, 30.5, 28.5,...
## $ Population.with.Secondary.Education..Female. <dbl> 97.4, 94.3, 95.0,...
## $ Population.with.Secondary.Education..Male.   <dbl> 96.7, 94.6, 96.6,...
## $ Labour.Force.Participation.Rate..Female.     <dbl> 61.2, 58.8, 61.8,...
## $ Labour.Force.Participation.Rate..Male.       <dbl> 68.7, 71.8, 74.9,...
# give the human development columns short names
hd <- setNames(hd, c("HDI.Rank", "Country", "HDI", "Life.Expectancy",
                     "Education.Expected", "Education.Mean", "GNI",
                     "GNI.Minus.Rank"))

# give the gender inequality columns short names
gii <- setNames(gii, c("GII.Rank", "Country", "GII", "Maternal.Mortality",
                       "Adolescent.Birth", "Percent.Parliament",
                       "Edu2.Female", "Edu2.Male",
                       "Labour.Female", "Labour.Male"))

# GNI is read as text because of the thousands separator (e.g. "64,992"):
# strip the commas and convert to numeric
hd$GNI <- as.numeric(gsub(",", "", hd$GNI))

# feature engineering: female-to-male ratios for secondary education and
# labour force participation
gii <- gii %>%
  mutate(Odds.Edu2 = Edu2.Female / Edu2.Male,
         Odds.Labour = Labour.Female / Labour.Male)

# join the two data sets by country and keep only the variables needed
keep <- c("Country", "Odds.Edu2", "Odds.Labour", "Education.Expected",
          "GNI", "Maternal.Mortality", "Percent.Parliament")
human <- hd %>%
  inner_join(gii, by = "Country") %>%
  select(one_of(keep))

# drop rows that contain any NA value
human <- human %>% filter(complete.cases(human))

# remove the 'World' observation by dropping the last row
# NOTE(review): this removes the final row by position only -- confirm that
# it really is 'World' and that no other aggregate rows (e.g. regions) remain
human <- human %>% slice(-nrow(human))

# use the country names as row names, then drop the Country column
rownames(human) <- human$Country
human <- human %>% select(-Country)

Explore

# quick structural overview of the joined data
human %>% glimpse()
## Observations: 161
## Variables: 6
## $ Odds.Edu2          <dbl> 1.0072389, 0.9968288, 0.9834369, 0.9886128,...
## $ Odds.Labour        <dbl> 0.8908297, 0.8189415, 0.8251001, 0.8840361,...
## $ Education.Expected <dbl> 17.5, 20.2, 15.8, 18.7, 17.9, 16.5, 18.6, 1...
## $ GNI                <dbl> 64992, 42261, 56431, 44025, 45435, 43919, 3...
## $ Maternal.Mortality <int> 4, 6, 6, 5, 6, 7, 9, 28, 11, 8, 6, 4, 8, 4,...
## $ Percent.Parliament <dbl> 39.6, 30.5, 28.5, 38.0, 36.9, 36.9, 19.9, 1...
# per-variable summary statistics
human %>% summary()
##    Odds.Edu2       Odds.Labour     Education.Expected      GNI        
##  Min.   :0.1717   Min.   :0.1857   Min.   : 5.40      Min.   :   581  
##  1st Qu.:0.7244   1st Qu.:0.5978   1st Qu.:11.20      1st Qu.:  4457  
##  Median :0.9349   Median :0.7523   Median :13.50      Median : 12040  
##  Mean   :0.8500   Mean   :0.7037   Mean   :13.14      Mean   : 17363  
##  3rd Qu.:0.9960   3rd Qu.:0.8531   3rd Qu.:15.20      3rd Qu.: 23177  
##  Max.   :1.4967   Max.   :1.0380   Max.   :20.20      Max.   :123124  
##  Maternal.Mortality Percent.Parliament
##  Min.   :   1.0     Min.   : 0.00     
##  1st Qu.:  13.0     1st Qu.:12.50     
##  Median :  52.0     Median :19.30     
##  Mean   : 149.9     Mean   :20.87     
##  3rd Qu.: 190.0     3rd Qu.:27.60     
##  Max.   :1100.0     Max.   :57.50
# scatterplot matrix of all variable pairs
library(GGally)
human %>% ggpairs()

PCA

# pca
#---

# standardize: with its defaults, scale() centers each variable and scales it
# to unit variance, so variables on large scales (e.g. GNI) do not dominate
human_ <- scale(human)

# principal component analysis on the standardized data
pc <- prcomp(human_)

# biplot: observations drawn as text labels only (point size 0), with the
# variable loadings overlaid
library(ggfortify)
autoplot(pc, data = human_,
         label = TRUE, label.size = 2.5, label.alpha = 0.4, size = 0,
         loadings = TRUE, loadings.label = TRUE,
         loadings.label.size = 5, loadings.label.alpha = 0.7,
         xlim = c(-0.2, 0.25))