## meta:
##browseURL("https://archive.ics.uci.edu/ml/datasets/STUDENT+ALCOHOL+CONSUMPTION")
## read the math class questionaire data
math <- read.table("../datasets/student-mat.csv",sep=";",header=TRUE)
## look at the column names
colnames(math)
[1] "school" "sex" "age" "address" "famsize"
[6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
[11] "reason" "guardian" "traveltime" "studytime" "failures"
[16] "schoolsup" "famsup" "paid" "activities" "nursery"
[21] "higher" "internet" "romantic" "famrel" "freetime"
[26] "goout" "Dalc" "Walc" "health" "absences"
[31] "G1" "G2" "G3"
## read the portuguese class questionaire data
por <- read.table("../datasets/student-por.csv",sep=";",header=TRUE)
## look at the column names
colnames(por)
[1] "school" "sex" "age" "address" "famsize"
[6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
[11] "reason" "guardian" "traveltime" "studytime" "failures"
[16] "schoolsup" "famsup" "paid" "activities" "nursery"
[21] "higher" "internet" "romantic" "famrel" "freetime"
[26] "goout" "Dalc" "Walc" "health" "absences"
[31] "G1" "G2" "G3"
## common columns to merge the datasets by
join_by <- c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet")
## access the dplyr library
library(dplyr)
## join the two datasets by selected common variables
math_por <- inner_join(math, por, by = join_by, suffix = c(".math", ".por"))
## see the new column names
colnames(math_por)
[1] "school" "sex" "age"
[4] "address" "famsize" "Pstatus"
[7] "Medu" "Fedu" "Mjob"
[10] "Fjob" "reason" "guardian.math"
[13] "traveltime.math" "studytime.math" "failures.math"
[16] "schoolsup.math" "famsup.math" "paid.math"
[19] "activities.math" "nursery" "higher.math"
[22] "internet" "romantic.math" "famrel.math"
[25] "freetime.math" "goout.math" "Dalc.math"
[28] "Walc.math" "health.math" "absences.math"
[31] "G1.math" "G2.math" "G3.math"
[34] "guardian.por" "traveltime.por" "studytime.por"
[37] "failures.por" "schoolsup.por" "famsup.por"
[40] "paid.por" "activities.por" "higher.por"
[43] "romantic.por" "famrel.por" "freetime.por"
[46] "goout.por" "Dalc.por" "Walc.por"
[49] "health.por" "absences.por" "G1.por"
[52] "G2.por" "G3.por"
## access the dplyr library
library(dplyr)
## glimpse at the new data
glimpse(math_por)
Observations: 382
Variables: 53
$ school <fctr> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, G...
$ sex <fctr> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, ...
$ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15...
$ address <fctr> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
$ famsize <fctr> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, ...
$ Pstatus <fctr> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, ...
$ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4...
$ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4...
$ Mjob <fctr> at_home, at_home, at_home, health, other, ser...
$ Fjob <fctr> teacher, other, other, services, other, other...
$ reason <fctr> course, course, other, home, home, reputation...
$ guardian.math <fctr> mother, father, mother, mother, father, mothe...
$ traveltime.math <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1...
$ studytime.math <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1...
$ failures.math <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ schoolsup.math <fctr> yes, no, yes, no, no, no, no, yes, no, no, no...
$ famsup.math <fctr> no, yes, no, yes, yes, yes, no, yes, yes, yes...
$ paid.math <fctr> no, no, yes, yes, yes, yes, no, no, yes, yes,...
$ activities.math <fctr> no, no, no, yes, no, yes, no, no, no, yes, no...
$ nursery <fctr> yes, no, yes, yes, yes, yes, yes, yes, yes, y...
$ higher.math <fctr> yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
$ internet <fctr> no, yes, yes, yes, no, yes, yes, no, yes, yes...
$ romantic.math <fctr> no, no, no, yes, no, no, no, no, no, no, no, ...
$ famrel.math <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4...
$ freetime.math <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4...
$ goout.math <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4...
$ Dalc.math <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ Walc.math <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2...
$ health.math <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2...
$ absences.math <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0,...
$ G1.math <int> 5, 5, 7, 15, 6, 15, 12, 6, 16, 14, 10, 10, 14,...
$ G2.math <int> 6, 5, 8, 14, 10, 15, 12, 5, 18, 15, 8, 12, 14,...
$ G3.math <int> 6, 6, 10, 15, 10, 15, 11, 6, 19, 15, 9, 12, 14...
$ guardian.por <fctr> mother, father, mother, mother, father, mothe...
$ traveltime.por <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1...
$ studytime.por <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1...
$ failures.por <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ schoolsup.por <fctr> yes, no, yes, no, no, no, no, yes, no, no, no...
$ famsup.por <fctr> no, yes, no, yes, yes, yes, no, yes, yes, yes...
$ paid.por <fctr> no, no, no, no, no, no, no, no, no, no, no, n...
$ activities.por <fctr> no, no, no, yes, no, yes, no, no, no, yes, no...
$ higher.por <fctr> yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
$ romantic.por <fctr> no, no, no, yes, no, no, no, no, no, no, no, ...
$ famrel.por <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4...
$ freetime.por <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4...
$ goout.por <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4...
$ Dalc.por <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ Walc.por <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2...
$ health.por <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2...
$ absences.por <int> 4, 2, 6, 0, 0, 6, 0, 2, 0, 0, 2, 0, 0, 0, 0, 6...
$ G1.por <int> 0, 9, 12, 14, 11, 12, 13, 10, 15, 12, 14, 10, ...
$ G2.por <int> 11, 11, 13, 14, 13, 12, 12, 13, 16, 12, 14, 12...
$ G3.por <int> 11, 11, 12, 14, 13, 13, 13, 13, 17, 13, 14, 13...
join_by <- c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet")
## which columns in the datasets were not used for joining the data
notjoined_columns <- colnames(math)[!colnames(math) %in% join_by]
## create a new data frame with the common columns
alc <- select(math_por, one_of(join_by))
## access the dplyr library
library(dplyr)
## average/combine the rest of the columns
for(column_name in notjoined_columns) {
df <- select(math_por, starts_with(column_name))
if(is.numeric(select(df, 1))) {
alc[column_name] <- rowMeans(df)
} else {
alc[column_name] <- select(df, 1)
}
}
# final column names of the data
colnames(alc)
[1] "school" "sex" "age" "address" "famsize"
[6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
[11] "reason" "nursery" "internet" "guardian" "traveltime"
[16] "studytime" "failures" "schoolsup" "famsup" "paid"
[21] "activities" "higher" "romantic" "famrel" "freetime"
[26] "goout" "Dalc" "Walc" "health" "absences"
[31] "G1" "G2" "G3"
library(dplyr)
library(ggplot2)
## combine weekday and weekend alcohol use into alc_use
alc <- mutate(alc, alc_use = (Dalc + Walc) / 2)
## draw a histogram of alcohol use
qplot(alc_use, data = alc, bins = 10, main = "Student alcohol consumption")
## transform alc_use into a binary (T, F) variable high_use
alc$high_use <- alc$alc_use > 2
## draw a bar plot
qplot(high_use, data = alc)
# save the data
# write.table(file = "../datasets/alc.txt", alc, sep = ",", row.names = F)
library(descr)
crosstab(alc$sex, alc$high_use, prop.c = T, prop.r = T)
Cell Contents
|-------------------------|
| Count |
| Row Percent |
| Column Percent |
|-------------------------|
================================
alc$high_use
alc$sex FALSE TRUE Total
--------------------------------
F 157 41 198
79.3% 20.7% 51.8%
58.1% 36.6%
--------------------------------
M 113 71 184
61.4% 38.6% 48.2%
41.9% 63.4%
--------------------------------
Total 270 112 382
70.7% 29.3%
================================
library(ggplot2)
p <- ggplot(alc, aes(x = high_use, y = absences))
p + geom_boxplot()
m1 <- glm(high_use ~ sex + failures + absences, data = alc, family = binomial)
# print out a summary of the model
summary(m1)
Call:
glm(formula = high_use ~ sex + failures + absences, family = binomial,
data = alc)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6629 -0.8545 -0.5894 1.0339 2.0428
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.95397 0.22819 -8.563 < 2e-16 ***
sexM 0.98848 0.24453 4.042 5.29e-05 ***
failures 0.40462 0.15024 2.693 0.00708 **
absences 0.07294 0.01796 4.061 4.88e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 462.21 on 381 degrees of freedom
Residual deviance: 418.64 on 378 degrees of freedom
AIC: 426.64
Number of Fisher Scoring iterations: 4
## print out the coefficients
m1
Call: glm(formula = high_use ~ sex + failures + absences, family = binomial,
data = alc)
Coefficients:
(Intercept) sexM failures absences
-1.95397 0.98848 0.40462 0.07294
Degrees of Freedom: 381 Total (i.e. Null); 378 Residual
Null Deviance: 462.2
Residual Deviance: 418.6 AIC: 426.6
## print out the exponentiated coefficients
exp(coefficients(m1))
(Intercept) sexM failures absences
0.1417107 2.6871365 1.4987270 1.0756623
# predict the response
alc$prediction <- predict(m1, type = "response")
# first six values
head(select(alc, high_use, prediction))
high_use prediction
1 FALSE 0.1799998
2 FALSE 0.1594640
3 TRUE 0.4973115
4 FALSE 0.1408685
5 FALSE 0.1594640
6 FALSE 0.4412412
# function to compute the proportion of wrong predictions
cost <- function(class, prediction) mean(abs(class - prediction) > 0.5)
# compute the average number of wrong predictions
cost(alc$high_use, alc$prediction)
[1] 0.2565445
# cost function
cost <- function(class, prediction) mean(abs(class - prediction) > 0.5)
# k-fold cross-validation
library(boot)
cv <- cv.glm(data = alc, cost = cost, glmfit = m1, K = 10)
# print out the average number of wrong predictions in the cross validation
cv$delta[1]
[1] 0.2617801