Data for this exercise is taken from Kaggle. It includes two types of data from countries around the world.
The Human Development Index (HDI) is a summary measure of achievements in key dimensions of human development. The Gender Inequality Index (GII) reflects gender-based disadvantage in three dimensions: reproductive health, empowerment, and the labour market.
Here we explore the dimensions behind these two indicators.
# Tuomo Nieminen 2017
# meta
# browseURL("https://www.kaggle.com/undp/human-development")
# access dplyr
library(dplyr)
# Read the human development data. Strings are kept as character vectors;
# FALSE is spelled out because T/F are ordinary variables that can be
# reassigned, whereas TRUE/FALSE are reserved words.
hd <- read.csv("human_development.csv", stringsAsFactors = FALSE)
# Read the gender inequality data; ".." entries are read as NA.
gii <- read.csv("gender_inequality.csv", stringsAsFactors = FALSE,
                na.strings = "..")
# inspect the structure of the human development data
hd %>% glimpse()
## Observations: 195
## Variables: 8
## $ HDI.Rank <int> 1, 2, 3, 4, 5, 6, 6, 8,...
## $ Country <chr> "Norway", "Australia", ...
## $ Human.Development.Index..HDI. <dbl> 0.944, 0.935, 0.930, 0....
## $ Life.Expectancy.at.Birth <dbl> 81.6, 82.4, 83.0, 80.2,...
## $ Expected.Years.of.Education <dbl> 17.5, 20.2, 15.8, 18.7,...
## $ Mean.Years.of.Education <dbl> 12.6, 13.0, 12.8, 12.7,...
## $ Gross.National.Income..GNI..per.Capita <chr> "64,992", "42,261", "56...
## $ GNI.per.Capita.Rank.Minus.HDI.Rank <int> 5, 17, 6, 11, 9, 11, 16...
# inspect the structure of the gender inequality data
gii %>% glimpse()
## Observations: 195
## Variables: 10
## $ GII.Rank <int> 1, 2, 3, 4, 5, 6,...
## $ Country <chr> "Norway", "Austra...
## $ Gender.Inequality.Index..GII. <dbl> 0.067, 0.110, 0.0...
## $ Maternal.Mortality.Ratio <int> 4, 6, 6, 5, 6, 7,...
## $ Adolescent.Birth.Rate <dbl> 7.8, 12.1, 1.9, 5...
## $ Percent.Representation.in.Parliament <dbl> 39.6, 30.5, 28.5,...
## $ Population.with.Secondary.Education..Female. <dbl> 97.4, 94.3, 95.0,...
## $ Population.with.Secondary.Education..Male. <dbl> 96.7, 94.6, 96.6,...
## $ Labour.Force.Participation.Rate..Female. <dbl> 61.2, 58.8, 61.8,...
## $ Labour.Force.Participation.Rate..Male. <dbl> 68.7, 71.8, 74.9,...
# Give the human development columns short names.
# Names are assigned by position, so their order must match the column order.
hd <- setNames(
  hd,
  c(
    "HDI.Rank", "Country", "HDI", "Life.Expectancy",
    "Education.Expected", "Education.Mean", "GNI", "GNI.Minus.Rank"
  )
)
# Same positional renaming for the gender inequality columns.
gii <- setNames(
  gii,
  c(
    "GII.Rank", "Country", "GII", "Maternal.Mortality",
    "Adolescent.Birth", "Percent.Parliament",
    "Edu2.Female", "Edu2.Male",
    "Labour.Female", "Labour.Male"
  )
)
# GNI is read as text because it uses "," as the thousands separator
# (e.g. "64,992"); strip the commas and convert the column to numeric.
hd$GNI <- as.numeric(gsub(",", "", hd$GNI, fixed = TRUE))
# Feature engineering: female-to-male ratios for the secondary education
# and labour force participation variables.
gii <- gii %>%
  mutate(
    Odds.Edu2 = Edu2.Female / Edu2.Male,
    Odds.Labour = Labour.Female / Labour.Male
  )
# Join the two tables on the country name; inner_join() keeps only the
# countries that appear in both.
human <- inner_join(hd, gii, by = "Country")
# Keep only the variables used in the analysis. all_of() raises an error
# when a listed column is missing, whereas the superseded one_of() merely
# warns and lets the script continue with fewer variables.
keep <- c(
  "Country", "Odds.Edu2", "Odds.Labour", "Education.Expected",
  "GNI", "Maternal.Mortality", "Percent.Parliament"
)
human <- select(human, all_of(keep))
# Drop every row that contains an NA, then drop the final row.
# NOTE(review): the final row is presumed to be the 'World' aggregate;
# it is removed by position, not by name -- confirm against the data.
human <- human %>%
  filter(complete.cases(.)) %>%
  slice(-n())
# Use the country names as row names, then discard the now-redundant
# Country column so that only numeric variables remain.
row.names(human) <- human[["Country"]]
human <- human %>% select(-Country)
# inspect the structure of the joined and cleaned data
human %>% glimpse()
## Observations: 161
## Variables: 6
## $ Odds.Edu2 <dbl> 1.0072389, 0.9968288, 0.9834369, 0.9886128,...
## $ Odds.Labour <dbl> 0.8908297, 0.8189415, 0.8251001, 0.8840361,...
## $ Education.Expected <dbl> 17.5, 20.2, 15.8, 18.7, 17.9, 16.5, 18.6, 1...
## $ GNI <dbl> 64992, 42261, 56431, 44025, 45435, 43919, 3...
## $ Maternal.Mortality <int> 4, 6, 6, 5, 6, 7, 9, 28, 11, 8, 6, 4, 8, 4,...
## $ Percent.Parliament <dbl> 39.6, 30.5, 28.5, 38.0, 36.9, 36.9, 19.9, 1...
# per-variable summary statistics
human %>% summary()
## Odds.Edu2 Odds.Labour Education.Expected GNI
## Min. :0.1717 Min. :0.1857 Min. : 5.40 Min. : 581
## 1st Qu.:0.7244 1st Qu.:0.5978 1st Qu.:11.20 1st Qu.: 4457
## Median :0.9349 Median :0.7523 Median :13.50 Median : 12040
## Mean :0.8500 Mean :0.7037 Mean :13.14 Mean : 17363
## 3rd Qu.:0.9960 3rd Qu.:0.8531 3rd Qu.:15.20 3rd Qu.: 23177
## Max. :1.4967 Max. :1.0380 Max. :20.20 Max. :123124
## Maternal.Mortality Percent.Parliament
## Min. : 1.0 Min. : 0.00
## 1st Qu.: 13.0 1st Qu.:12.50
## Median : 52.0 Median :19.30
## Mean : 149.9 Mean :20.87
## 3rd Qu.: 190.0 3rd Qu.:27.60
## Max. :1100.0 Max. :57.50
# pairs plot: matrix of pairwise plots, every variable against every other
library(GGally)
human %>% ggpairs()
# pca
#---
# standardize: with its defaults scale() both centers each variable and
# rescales it to unit variance, so variables measured on large scales
# (e.g. GNI) do not dominate the principal components
human_ <- scale(human)
# principal component analysis on the standardized data
pc <- prcomp(human_)
# biplot of the first two components: observations are drawn as text labels
# (taken from the row names) with size = 0 for the point markers, and the
# variable loadings are overlaid with their own labels
library(ggfortify)
autoplot(
  pc,
  data = human_,
  label = TRUE,
  loadings = TRUE,
  loadings.label = TRUE,
  label.size = 2.5,
  size = 0,
  loadings.label.size = 5,
  label.alpha = 0.4,
  loadings.label.alpha = 0.7,
  xlim = c(-0.2, 0.25)
)