Data wrangling

1 loading more datasets

## meta: 
##browseURL("https://archive.ics.uci.edu/ml/datasets/STUDENT+ALCOHOL+CONSUMPTION")

## read the math class questionaire data
math <- read.table("../datasets/student-mat.csv",sep=";",header=TRUE)

## look at the column names
colnames(math)
 [1] "school"     "sex"        "age"        "address"    "famsize"   
 [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
[11] "reason"     "guardian"   "traveltime" "studytime"  "failures"  
[16] "schoolsup"  "famsup"     "paid"       "activities" "nursery"   
[21] "higher"     "internet"   "romantic"   "famrel"     "freetime"  
[26] "goout"      "Dalc"       "Walc"       "health"     "absences"  
[31] "G1"         "G2"         "G3"        
## read the portuguese class questionaire data
por <- read.table("../datasets/student-por.csv",sep=";",header=TRUE)

## look at the column names
colnames(por)
 [1] "school"     "sex"        "age"        "address"    "famsize"   
 [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
[11] "reason"     "guardian"   "traveltime" "studytime"  "failures"  
[16] "schoolsup"  "famsup"     "paid"       "activities" "nursery"   
[21] "higher"     "internet"   "romantic"   "famrel"     "freetime"  
[26] "goout"      "Dalc"       "Walc"       "health"     "absences"  
[31] "G1"         "G2"         "G3"        

2 joining two datasets

## common columns to merge the datasets by
join_by <- c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet")

## access the dplyr library
library(dplyr)

## join the two datasets by selected common variables
math_por <- inner_join(math, por, by = join_by, suffix = c(".math", ".por"))

## see the new column names
colnames(math_por)
 [1] "school"          "sex"             "age"            
 [4] "address"         "famsize"         "Pstatus"        
 [7] "Medu"            "Fedu"            "Mjob"           
[10] "Fjob"            "reason"          "guardian.math"  
[13] "traveltime.math" "studytime.math"  "failures.math"  
[16] "schoolsup.math"  "famsup.math"     "paid.math"      
[19] "activities.math" "nursery"         "higher.math"    
[22] "internet"        "romantic.math"   "famrel.math"    
[25] "freetime.math"   "goout.math"      "Dalc.math"      
[28] "Walc.math"       "health.math"     "absences.math"  
[31] "G1.math"         "G2.math"         "G3.math"        
[34] "guardian.por"    "traveltime.por"  "studytime.por"  
[37] "failures.por"    "schoolsup.por"   "famsup.por"     
[40] "paid.por"        "activities.por"  "higher.por"     
[43] "romantic.por"    "famrel.por"      "freetime.por"   
[46] "goout.por"       "Dalc.por"        "Walc.por"       
[49] "health.por"      "absences.por"    "G1.por"         
[52] "G2.por"          "G3.por"         

3 glimpse at the joined dataset

## access the dplyr library
library(dplyr)

## glimpse at the new data
glimpse(math_por) 
Observations: 382
Variables: 53
$ school          <fctr> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, G...
$ sex             <fctr> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, ...
$ age             <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15...
$ address         <fctr> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
$ famsize         <fctr> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, ...
$ Pstatus         <fctr> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, ...
$ Medu            <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4...
$ Fedu            <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4...
$ Mjob            <fctr> at_home, at_home, at_home, health, other, ser...
$ Fjob            <fctr> teacher, other, other, services, other, other...
$ reason          <fctr> course, course, other, home, home, reputation...
$ guardian.math   <fctr> mother, father, mother, mother, father, mothe...
$ traveltime.math <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1...
$ studytime.math  <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1...
$ failures.math   <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ schoolsup.math  <fctr> yes, no, yes, no, no, no, no, yes, no, no, no...
$ famsup.math     <fctr> no, yes, no, yes, yes, yes, no, yes, yes, yes...
$ paid.math       <fctr> no, no, yes, yes, yes, yes, no, no, yes, yes,...
$ activities.math <fctr> no, no, no, yes, no, yes, no, no, no, yes, no...
$ nursery         <fctr> yes, no, yes, yes, yes, yes, yes, yes, yes, y...
$ higher.math     <fctr> yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
$ internet        <fctr> no, yes, yes, yes, no, yes, yes, no, yes, yes...
$ romantic.math   <fctr> no, no, no, yes, no, no, no, no, no, no, no, ...
$ famrel.math     <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4...
$ freetime.math   <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4...
$ goout.math      <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4...
$ Dalc.math       <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ Walc.math       <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2...
$ health.math     <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2...
$ absences.math   <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0,...
$ G1.math         <int> 5, 5, 7, 15, 6, 15, 12, 6, 16, 14, 10, 10, 14,...
$ G2.math         <int> 6, 5, 8, 14, 10, 15, 12, 5, 18, 15, 8, 12, 14,...
$ G3.math         <int> 6, 6, 10, 15, 10, 15, 11, 6, 19, 15, 9, 12, 14...
$ guardian.por    <fctr> mother, father, mother, mother, father, mothe...
$ traveltime.por  <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1...
$ studytime.por   <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1...
$ failures.por    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ schoolsup.por   <fctr> yes, no, yes, no, no, no, no, yes, no, no, no...
$ famsup.por      <fctr> no, yes, no, yes, yes, yes, no, yes, yes, yes...
$ paid.por        <fctr> no, no, no, no, no, no, no, no, no, no, no, n...
$ activities.por  <fctr> no, no, no, yes, no, yes, no, no, no, yes, no...
$ higher.por      <fctr> yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
$ romantic.por    <fctr> no, no, no, yes, no, no, no, no, no, no, no, ...
$ famrel.por      <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4...
$ freetime.por    <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4...
$ goout.por       <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4...
$ Dalc.por        <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ Walc.por        <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2...
$ health.por      <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2...
$ absences.por    <int> 4, 2, 6, 0, 0, 6, 0, 2, 0, 0, 2, 0, 0, 0, 0, 6...
$ G1.por          <int> 0, 9, 12, 14, 11, 12, 13, 10, 15, 12, 14, 10, ...
$ G2.por          <int> 11, 11, 13, 14, 13, 12, 12, 13, 16, 12, 14, 12...
$ G3.por          <int> 11, 11, 12, 14, 13, 13, 13, 13, 17, 13, 14, 13...

4 Transforming by looping

join_by <- c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet")

## which columns in the datasets were not used for joining the data
notjoined_columns <- colnames(math)[!colnames(math) %in% join_by]

## create a new data frame with the common columns
alc <- select(math_por, one_of(join_by))

## access the dplyr library
library(dplyr)

## average/combine the rest of the columns
for(column_name in notjoined_columns) {
  
  df <- select(math_por, starts_with(column_name))
  
  if(is.numeric(select(df, 1))) {
    alc[column_name] <- rowMeans(df)
  } else {
    alc[column_name] <- select(df, 1)
  }
}

# final column names of the data
colnames(alc)
 [1] "school"     "sex"        "age"        "address"    "famsize"   
 [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
[11] "reason"     "nursery"    "internet"   "guardian"   "traveltime"
[16] "studytime"  "failures"   "schoolsup"  "famsup"     "paid"      
[21] "activities" "higher"     "romantic"   "famrel"     "freetime"  
[26] "goout"      "Dalc"       "Walc"       "health"     "absences"  
[31] "G1"         "G2"         "G3"        

5 Mutating and exploring

library(dplyr)
library(ggplot2)

## combine weekday and weekend alcohol use into alc_use
alc <- mutate(alc, alc_use = (Dalc + Walc) / 2)

## draw a histogram of alcohol use
qplot(alc_use, data = alc, bins = 10, main = "Student alcohol consumption")

6 Binarizing a variable

## transform alc_use into a binary (T, F) variable high_use
alc$high_use <- alc$alc_use > 2

## draw a bar plot
qplot(high_use, data = alc)

# save the data
# write.table(file = "../datasets/alc.txt", alc, sep = ",", row.names = F)

7 exploring relationships with cross tables

library(descr)

crosstab(alc$sex, alc$high_use, prop.c = T, prop.r = T)

   Cell Contents 
|-------------------------|
|                   Count | 
|             Row Percent | 
|          Column Percent | 
|-------------------------|

================================
           alc$high_use
alc$sex    FALSE    TRUE   Total
--------------------------------
F           157      41     198 
           79.3%   20.7%   51.8%
           58.1%   36.6%        
--------------------------------
M           113      71     184 
           61.4%   38.6%   48.2%
           41.9%   63.4%        
--------------------------------
Total       270     112     382 
           70.7%   29.3%        
================================

8 exploring relationships with box plots

library(ggplot2)
p <- ggplot(alc, aes(x = high_use, y = absences))
p + geom_boxplot() 

Analysis: Logistic regression

9 fitting a logistic regression model

m1 <- glm(high_use ~ sex + failures + absences, data = alc, family = binomial)

# print out a summary of the model
summary(m1)

Call:
glm(formula = high_use ~ sex + failures + absences, family = binomial, 
    data = alc)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6629  -0.8545  -0.5894   1.0339   2.0428  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.95397    0.22819  -8.563  < 2e-16 ***
sexM         0.98848    0.24453   4.042 5.29e-05 ***
failures     0.40462    0.15024   2.693  0.00708 ** 
absences     0.07294    0.01796   4.061 4.88e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 462.21  on 381  degrees of freedom
Residual deviance: 418.64  on 378  degrees of freedom
AIC: 426.64

Number of Fisher Scoring iterations: 4
## print out the coefficients
m1

Call:  glm(formula = high_use ~ sex + failures + absences, family = binomial, 
    data = alc)

Coefficients:
(Intercept)         sexM     failures     absences  
   -1.95397      0.98848      0.40462      0.07294  

Degrees of Freedom: 381 Total (i.e. Null);  378 Residual
Null Deviance:      462.2 
Residual Deviance: 418.6    AIC: 426.6
## print out the exponentiated coefficients
exp(coefficients(m1))
(Intercept)        sexM    failures    absences 
  0.1417107   2.6871365   1.4987270   1.0756623 

10 model performance: prediction accuracy

# predict the response
alc$prediction <- predict(m1, type = "response")

# first six values
head(select(alc, high_use, prediction))
  high_use prediction
1    FALSE  0.1799998
2    FALSE  0.1594640
3     TRUE  0.4973115
4    FALSE  0.1408685
5    FALSE  0.1594640
6    FALSE  0.4412412
# function to compute the proportion of wrong predictions
cost <- function(class, prediction) mean(abs(class - prediction) > 0.5)

# compute the average number of wrong predictions
cost(alc$high_use, alc$prediction)
[1] 0.2565445

11 model performance: cross-validation

# cost function
cost <- function(class, prediction) mean(abs(class - prediction) > 0.5)

# k-fold cross-validation
library(boot)
cv <- cv.glm(data = alc, cost = cost, glmfit = m1, K = 10)

# print out the average number of wrong predictions in the cross validation
cv$delta[1]
[1] 0.2617801