#+ fig.width=12, fig.height=8
rm(list=ls(all=TRUE)) # clear the workspace (deliberate script convention here; NOTE(review): restarting R is generally safer than rm(list=ls()))
# load packages (car: Anova; effects: effect plots; rgl: 3-d plots; rms: lrm) and define some useful functions
library(car); library(effects); library(rgl); library(rms)
logit <- function(x) { # logit: maps the range (0,1) to the range (-Inf,Inf)
  # Mathematically log(x/(1-x)) (Gelman & Hill 2007:80); implemented via the
  # quantile function of the standard logistic distribution from base R's
  # stats package, the built-in equivalent of the hand-rolled formula.
  qlogis(x)
}
ilogit <- function(x) { # inverse logit: maps the range (-Inf,Inf) to the range (0,1)
  # Mathematically 1/(1+exp(-x)) (Gelman & Hill 2007:80); implemented via the
  # CDF of the standard logistic distribution from base R's stats package,
  # the built-in (and numerically careful) equivalent of the formula.
  plogis(x)
}
# load and inspect data
x <- read.delim("104_06b_clauseorders.csv") # NOTE(review): read.delim expects TAB-separated data despite the .csv extension -- confirm the file format
summary(x) # check the data structure (levels of ORDER, SUBORDTYPE, CONJ, ...; ranges of the length variables)
attach(x) # make the columns available as free-standing vectors/factors (NOTE(review): attach() is fragile, but the whole script below relies on it)
## Section 5.3.1: A logistic regression with a binary predictor
# -> the regression analog of a chi-squared test for independence
summary(model.01 <- # return the summary of model.01,
glm(ORDER ~ # a generalized linear model of ORDER as a function of
SUBORDTYPE, # SUBORDTYPE, using
data=x, # the data in the data frame x
family=binomial)) # with a binomial likelihood: the dependent variable is binary (the model predicts the 2nd level of ORDER, "sc-mc")
# frequently used as well but not necessary here since there is only one predictor and it is binary or interval-scaled
drop1(model.01, test="Chisq") # likelihood-ratio p-value of the droppable predictor(s)
Anova(model.01, type="II") # p-value of all predictors (car::Anova), included here for consistency with multinomial models
rms::lrm( # use from the package rms the function lrm
formula(model.01))$ # on the formula of the above-defined model.01 (lrm refits it on the attached columns)
stats[c("C", "R2")] # and retrieve the concordance statistic C & Nagelkerke's R-squared
# determine the nature of the effect(s) numerically
(preds.hyp <- data.frame( # create (and auto-print) an object preds.hyp that is a data frame
sot <- effect("SUBORDTYPE", # of sot -- assigned here inside the call so it can be re-used for plotting below -- the effect the predictor SUBORDTYPE
model.01)) # has in model.01
)
ilogit(-2.5069) # the ilogit of the intercept is
# the predicted prob. of ORDER="sc-mc"
# when SUBORDTYPE="caus" (the reference level): preds.hyp[1,]
# (NOTE(review): -2.5069 & 2.7234 are hard-coded from the summary printed above; they go stale if the data change)
ilogit(-2.5069 + 2.7234) # the ilogit of the intercept + coefficient 2 is
# the predicted prob. of ORDER="sc-mc"
# when SUBORDTYPE="temp": preds.hyp[2,]
# compute the individual predictions of the model and determine the classification accuracy
predictions.num <- # make predictions.num
predict(model.01, # the predictions for the (training) data from model.01
type="response") # on the 'probability of the response' scale, i.e. P(ORDER="sc-mc")
predictions.cat <- # make predictions.cat:
ifelse(predictions.num>=0.5, # when the 2nd level of the DV is more likely than not,
"sc-mc", # predict the 2nd level of the DV "sc-mc", otherwise
"mc-sc") # predict the 1st level of the DV "mc-sc"
table(ORDER, # cross-tabulate the actually produced orders of clauses (rows)
predictions.cat) # against the predictions (columns)
# this classification table or confusion matrix has the following cells/components
# ("sc-mc" is treated as the positive class; note matrix() fills column-wise):
matrix(c("true negatives", "false negatives", # tn fn
"false positives", "true positives"), # fp tp
ncol=2, dimnames=list(DATA=levels(ORDER),
PREDICTIONS=sort(unique(predictions.cat))))
# therefore, you can compute measures such as
# accuracy: the proportion of all cases classified correctly
# (NOTE(review): the cell counts below are hard-coded from the printed confusion matrix)
(113+184) / length(predictions.cat) # (tp+tn) / (tp+tn+fp+fn) = 0.7369727
# precision: how often a predicted "sc-mc" actually is "sc-mc"
113/(113+91) # tp/(tp+fp) = 0.5539216
# recall/sensitivity: how many of the actual "sc-mc" cases the model finds
113/(113+15) # tp/(tp+fn) = 0.8828125
# F: the harmonic mean of precision and recall
2*((0.5539216*0.8828125)/(0.5539216+0.8828125)) # 2 * ((prec*recall)/(prec+recall)) = 0.6807229
# Cohen's kappa: chance-corrected agreement between data and predictions
psych::cohen.kappa(table(ORDER, predictions.cat))$kappa
# excursus: you can compare this percentage of correct classifications of 0.7369727 against
# - the accuracy a dumb model that always guesses the most frequent ORDER would get (this I mention in the book):
baseline.1 <- prop.table(table(ORDER))[1] # proportion of the 1st level of ORDER (assumes it is the most frequent one -- TODO confirm against the data)
# - the accuracy you get from guessing an ORDER randomly (in proportion to the observed frequencies):
baseline.2 <- sum(prop.table(table(ORDER))^2)
# the obtained accuracy is significantly better than either: one-tailed exact
# binomial p-values, P(X >= 297 correct out of n trials | p = baseline):
sum(dbinom((184+113):length(predictions.cat), length(predictions.cat), baseline.1))
sum(dbinom((184+113):length(predictions.cat), length(predictions.cat), baseline.2))
# determine the nature of the effect(s) graphically
# predictions of the model
plot(sot, # plot the effects object (assigned inside the data.frame() call above) with its predictions
type="response", # on the 'probability of the response' scale
ylim=c(0, 1), # with these y-axis limits (all probabilities possible)
xlab="Subordinate clause type", # with this x-axis label
ylab="Predicted probability of 'sc-mc'", # with this y-axis label
grid=TRUE) # with a grid
# observed data
mosaicplot( # generate a mosaicplot
t(table(ORDER, # of the transposed frequency table of ORDER (the response)
SUBORDTYPE)), # and SUBORDTYPE (the predictor), so the predictor ends up on the x-axis
shade=TRUE) # use colors to illustrate (dis)preferences from the residuals
# excursus: see how the logistic regression is, so to speak, a special case of
# multinomial regression -- with a two-level response, both give the same fit
summary(model.01)
summary(nnet::multinom(ORDER ~ SUBORDTYPE, data=x), Wald.ratios=TRUE)
## Section 5.3.2: A logistic regression with a categorical predictor
# -> the regression analog of a chi-squared test for independence
summary(model.01 <- glm(ORDER ~ CONJ, data=x, family=binomial)) # NB: model.01 is overwritten/re-used in every section of this script
# frequently used as well and useful here since the one predictor is neither binary nor interval-scaled so you don't get one p-value for the predictor as a whole from summary()
drop1(model.01, test="Chisq") # likelihood-ratio p-value of the droppable predictor(s)
Anova(model.01, type="II") # p-value of all predictors, included here for consistency with multinomial models
rms::lrm(formula(model.01))$stats[c("C", "R2")] # the concordance statistic and R-squared
# determine the nature of the effect(s) numerically
(preds.hyp <- data.frame(conj <- effect("CONJ", model.01))) # as above: conj is assigned inside the call so it can be plotted below
# (NOTE(review): the coefficients below are hard-coded from the summary printed above; they go stale if the data change)
ilogit(0.4144) # the ilogit of the intercept is
# the predicted prob. of ORDER="sc-mc" when CONJ="als/when" (the reference level): preds.hyp[1,]
ilogit(0.4144 + -0.8563) # the ilogit of the intercept + coefficient 2 is
# the predicted prob. of ORDER="sc-mc" when CONJ="bevor/before": preds.hyp[2,]
ilogit(0.4144 + -0.0090) # the ilogit of the intercept + coefficient 3 is
# the predicted prob. of ORDER="sc-mc" when CONJ="nachdem/after": preds.hyp[3,]
ilogit(0.4144 + -2.9213) # the ilogit of the intercept + coefficient 4 is
# the predicted prob. of ORDER="sc-mc" when CONJ="weil/because": preds.hyp[4,]
# compute the individual predictions of the model and determine the classification accuracy
predictions.num <- predict(model.01, type="response") # predicted P(ORDER="sc-mc") for each case
predictions.cat <- ifelse(predictions.num>=0.5, "sc-mc", "mc-sc") # dichotomize at 0.5
table(ORDER, predictions.cat) # confusion matrix: data (rows) vs. predictions (columns)
# accuracy: (tp+tn) / all cases (NOTE(review): counts hard-coded from the confusion matrix above)
(212+95)/length(predictions.cat) # 0.7617866
# precision:
95/(95+63) # tp/(tp+fp) = 0.6012658
# recall:
95/(95+33) # tp/(tp+fn) = 0.7421875
# F: the harmonic mean of precision and recall
2*((0.6012658*0.7421875)/(0.6012658+0.7421875)) # 2 * ((prec*recall)/(prec+recall)) = 0.6643357
# determine the nature of the effect(s) graphically
# predictions of the model (the conj effects object created above)
plot(conj, type="response", ylim=c(0, 1), grid=TRUE,
xlab="Subordinate conjunction", ylab="Predicted probability of 'sc-mc'")
# observed data: mosaicplot with the predictor on the x-axis, shaded by residuals
mosaicplot(t(table(ORDER, CONJ)), shade=TRUE)
## Section 5.3.3: A logistic regression with a numeric predictor
summary(model.01 <- glm(ORDER ~ LENGTH_DIFF, data=x, family=binomial))
# frequently used as well but not necessary here since there is only one predictor and it is binary or interval-scaled
drop1(model.01, test="Chisq") # likelihood-ratio p-value of the droppable predictor(s)
Anova(model.01, type="II") # p-value of all predictors, included here for consistency with multinomial models
rms::lrm(formula(model.01))$stats[c("C", "R2")] # the concordance statistic and R-squared
# determine the nature of the effect(s) numerically; note the use of xlevels to define for which length differences (-35 to 35 in steps of 5) predictions are computed
(preds.hyp <- data.frame(lendiff <- effect("LENGTH_DIFF", model.01, xlevels=list(LENGTH_DIFF=seq(-35, 35, 5)))))
# (NOTE(review): the coefficients below are hard-coded from the summary printed above)
ilogit(-0.77673) # the ilogit of the intercept is
# the predicted prob. of ORDER="sc-mc" when LENGTH_DIFF=0 (i.e. both clauses are equally long)
ilogit(-0.77673+0.04418) # the ilogit of the intercept + the slope of LENGTH_DIFF is
# the predicted prob. of ORDER="sc-mc" when LENGTH_DIFF=1
# compute the individual predictions of the model and determine the classification accuracy
predictions.num <- predict(model.01, type="response") # predicted P(ORDER="sc-mc") for each case
predictions.cat <- ifelse(predictions.num>=0.5, "sc-mc", "mc-sc") # dichotomize at 0.5
table(ORDER, predictions.cat) # confusion matrix: data (rows) vs. predictions (columns)
# accuracy (NOTE(review): counts hard-coded from the confusion matrix above):
(272+2)/length(predictions.cat) # 0.6799007
# precision:
2/(2+3) # tp/(tp+fp) = 0.4
# recall: note how this model almost never predicts "sc-mc"
2/(2+126) # tp/(tp+fn) = 0.015625
# F: the harmonic mean of precision and recall
2*((0.4*0.015625)/(0.4+0.015625)) # 2 * ((prec*recall)/(prec+recall)) = 0.03007519
# determine the nature of the effect(s) graphically
# predictions of the model (the lendiff effects object created above)
plot(lendiff, type="response", grid=TRUE, ylim=c(0, 1),
xlab="Main cl. length - subord. cl. length (in words)", ylab="Predicted probability of 'sc-mc'")
# excursus: observed data, the equivalent to a mosaicplot of the above kind: plotting observed percentages against binned LENGTH_DIFF values
plot(tapply(LENGTH_DIFF, # plot at the x-values of the LENGTH_DIFF values
cut(LENGTH_DIFF, # grouped/binned into
10), # 10 groups
median), # for which you compute the medians (i.e. one x-coordinate per bin)
prop.table( # at the y-coordinates, which are proportions
table( # of frequencies
cut(LENGTH_DIFF, 10), # when you group the LENGTH_DIFF values into 10 groups & cross-tabulate them
ORDER), # with the 2 orders
1)[,2], # and take the second column ("sc-mc") of the row percentages
type="b", pch=16, ylim=c(0, 1), # plot filled circles and lines and set the y-limits
cex=1+rowSums(table(cut(LENGTH_DIFF, 10), ORDER))/75) # make the filled circles' sizes dependent on the bins' frequencies
grid(); matlines(preds.hyp[,1], preds.hyp[,c(2, 4:5)], col="red") # add a grid plus the predicted curve (column 2: fit) & its CI (columns 4:5: lower/upper)
## Section 5.3.4: A logistic regression with two categorical predictors
summary(model.01 <- glm(ORDER ~ CONJ*MORETHAN2CL, data=x, family=binomial)) # * fits both main effects and their interaction
# frequently used as well and necessary here since one predictor is neither binary nor interval-scaled so you don't get one p-value for that predictor or the interaction it participates in as a whole
drop1(model.01, test="Chisq") # likelihood-ratio p-value of the droppable predictor(s)
Anova(model.01, type="II") # p-value of all predictors, included here for consistency with multinomial models
rms::lrm(formula(model.01))$stats[c("C", "R2")] # the concordance statistic and R-squared
# determine the nature of the effect(s) numerically
(preds.hyp <- data.frame(intact <- effect("CONJ:MORETHAN2CL", model.01))) # intact = the interaction effects object, assigned inside the call for plotting below
# compute the individual predictions of the model and determine the classification accuracy
predictions.num <- predict(model.01, type="response") # predicted P(ORDER="sc-mc") for each case
predictions.cat <- ifelse(predictions.num>=0.5, "sc-mc", "mc-sc") # dichotomize at 0.5
table(ORDER, predictions.cat) # confusion matrix: data (rows) vs. predictions (columns)
# accuracy (NOTE(review): counts hard-coded from the confusion matrix above):
(107+202)/length(predictions.cat) # 0.7667494
# precision:
107/(107+73) # tp/(tp+fp) = 0.5944444
# recall:
107/(107+21) # tp/(tp+fn) = 0.8359375
# F: the harmonic mean of precision and recall
2*((0.5944444*0.8359375)/(0.5944444+0.8359375)) # 2 * ((prec*recall)/(prec+recall)) = 0.6948052
# determine the nature of the effect(s) graphically
# predictions of the model, once from each perspective:
plot(intact, type="response", grid=TRUE, ylim=c(0, 1), # default perspective: CONJ on the x-axis
xlab="Subordinate conjunction", ylab="Predicted probability of 'sc-mc'")
plot(intact, type="response", grid=TRUE, ylim=c(0, 1),
xlab="Subordinate conjunction", ylab="Predicted probability of 'sc-mc'",
x.var="MORETHAN2CL") # reverse perspective: MORETHAN2CL on the x-axis
## Section 5.3.5: A logistic regression with a categorical and a numeric predictor
summary(model.01 <- glm(ORDER ~ CONJ*LENGTH_DIFF, data=x, family=binomial)) # * fits both main effects and their interaction
# frequently used as well and necessary here since one predictor is neither binary nor interval-scaled so you don't get one p-value for that predictor or the interaction it participates in as a whole
drop1(model.01, test="Chisq") # likelihood-ratio p-value of the droppable predictor(s)
Anova(model.01, type="II") # p-value of all predictors, included here for consistency with multinomial models
rms::lrm(formula(model.01))$stats[c("C", "R2")] # the concordance statistic and R-squared
# determine the nature of the effect(s) numerically (predictions for LENGTH_DIFF from -35 to 35 in steps of 10)
(preds.hyp <- data.frame(intact <- effect("CONJ:LENGTH_DIFF", model.01, xlevels=list(LENGTH_DIFF=seq(-35, 35, 10)))))
# compute the individual predictions of the model and determine the classification accuracy
predictions.num <- predict(model.01, type="response") # predicted P(ORDER="sc-mc") for each case
predictions.cat <- ifelse(predictions.num>=0.5, "sc-mc", "mc-sc") # dichotomize at 0.5
table(ORDER, predictions.cat) # confusion matrix: data (rows) vs. predictions (columns)
# accuracy (NOTE(review): counts hard-coded from the confusion matrix above):
(83+228)/length(predictions.cat) # 0.7717122
# precision:
83/(83+47) # tp/(tp+fp) = 0.6384615
# recall:
83/(83+45) # tp/(tp+fn) = 0.6484375
# F: the harmonic mean of precision and recall
2*((0.6384615*0.6484375)/(0.6384615+0.6484375)) # 2 * ((prec*recall)/(prec+recall)) = 0.6434108
## determine the nature of the effect(s) graphically
# predictions of the model: one curve per conjunction
plot(intact, type="response", grid=TRUE, ylim=c(0, 1), ylab="Predicted probability of 'sc-mc'")
## Section 5.3.6: A logistic regression with two numeric predictors
summary(model.01 <- glm(ORDER ~ LEN_MC*LEN_SC, data=x, family=binomial)) # * fits both main effects and their interaction
# frequently used as well but not necessary here since both predictors are binary or interval-scaled
drop1(model.01, test="Chisq") # likelihood-ratio p-value of the droppable predictor(s)
Anova(model.01, type="II") # p-value of all predictors, included here for consistency with multinomial models
rms::lrm(formula(model.01))$stats[c("C", "R2")] # the concordance statistic and R-squared
# determine the nature of the effect(s) numerically: predictions over a grid of both clause lengths (0-35 words in steps of 5)
(preds.hyp <- data.frame(intact <- effect("LEN_MC:LEN_SC", model.01,
xlevels=list(LEN_MC=seq(0, 35, 5), LEN_SC=seq(0, 35, 5))))) # intact = the interaction effects object, kept for plotting below
# compute the individual predictions of the model and determine the classification accuracy
predictions.num <- predict(model.01, type="response") # predicted P(ORDER="sc-mc") for each case
predictions.cat <- ifelse(predictions.num>=0.5, "sc-mc", "mc-sc") # dichotomize at 0.5 (NOTE(review): the original trailing comment "to increase discriminatory power" looked misplaced -- this is the same 0.5 cutoff as everywhere else)
table(ORDER, predictions.cat) # confusion matrix: data (rows) vs. predictions (columns)
# accuracy (NOTE(review): counts hard-coded from the confusion matrix above):
(1+275)/length(predictions.cat) # 0.6848635
# precision: based on a single predicted "sc-mc"
1/(1+0) # tp/(tp+fp) = 1
# recall: this model almost never predicts "sc-mc"
1/(1+127) # tp/(tp+fn) = 0.0078125
# F: the harmonic mean of precision and recall
2*((1*0.0078125)/(1+0.0078125)) # 2 * ((prec*recall)/(prec+recall)) = 0.01550388
# determine the nature of the effect(s) graphically
# predictions of the model
plot(intact, type="response", grid=TRUE, ylim=c(0, 1),
xlab="Length of main clause in words", ylab="Predicted probability of 'sc-mc'")
# a 3-d scatterplot of predicted probabilities and order choices
plot3d(preds.hyp$LEN_MC, preds.hyp$LEN_SC, preds.hyp$fit, zlim=c(0, 1), # all probabilities fit into [0,1]
type="n", main="The interaction of clause lengths in model.01", # type="n": draw the box only; letters are added by text3d below
xlab="Length of main clause", ylab="Length of subord. clause", zlab="Predicted interaction from model",
# BUG FIX: this read preds.hyp$PREDICTIONS, a column that does not exist in
# preds.hyp (its columns are LEN_MC, LEN_SC, fit, se, lower, upper), so col
# silently evaluated to a zero-length vector; use the fitted probabilities
col=ifelse(preds.hyp$fit>=median(preds.hyp$fit), "blue", "red"))
text3d(preds.hyp$LEN_MC, preds.hyp$LEN_SC, preds.hyp$fit, # at each grid point, plot
ifelse(preds.hyp$fit>=0.5, "s", "m"), # "s" for predicted "sc-mc", "m" for "mc-sc"
col=ifelse(preds.hyp$fit>=0.5, "blue", "red"), # colored accordingly
cex=2/3) # slightly smaller than the default size
# excursus: sometimes it's easier to represent the third dimension, the size of the predicted values, not in a third dimension but with colors or symbols
# a 2-d scatterplot of predicted values with letters reflecting the predicted orderings and grey shading reflecting the certainty of the prediction
plot(preds.hyp$LEN_MC, preds.hyp$LEN_SC, type="n", # set up an empty plot; letters are added by text() below
main="The interaction of clause lengths in model.01",
xlab="Length of main clause",
ylab="Length of subord. clause")
text(preds.hyp$LEN_MC, # at each grid point, plot
preds.hyp$LEN_SC,
ifelse(preds.hyp$fit>=0.5, "s", "m"), # "s" for predicted "sc-mc", "m" for "mc-sc"
col=grey(1-as.numeric(cut(abs(preds.hyp$fit-median(preds.hyp$fit)), 100))/100), # darker = further from the median prediction
# BUG FIX: this read preds.hyp$PREDICTIONS, a column that does not exist in
# preds.hyp (its columns are LEN_MC, LEN_SC, fit, se, lower, upper), so cex
# silently evaluated to a zero-length vector; scale the letters by how far
# the fitted probability is from the median prediction, matching col above
cex=0.6+2*abs(preds.hyp$fit-median(preds.hyp$fit)))
abline(0, 1, lty=3, col="darkgrey") # reference diagonal: both clauses equally long
lattice::contourplot(preds.hyp$fit ~ # contour plot of the fitted probabilities over
preds.hyp$LEN_MC * preds.hyp$LEN_SC, # the grid of the two clause lengths
xlab="Main clause length",
ylab="Subordinate clause length",
zlab="Predicted probability of 'sc-mc'",
# BUG FIX: the title said "Predicted RTs" (reaction times), apparently a
# copy-paste leftover from another analysis; this model predicts order probabilities
main="Predicted probability of 'sc-mc' : LEN_MC:LEN_SC",
cuts=10, region=TRUE, pretty=TRUE)
# how does one get one significance test / p-value for the complete regression model?
# you compare it to the null model, i.e. a model without predictors
anova( # make a comparison between
model.01, # your current model of interest (NB: at this point still the LEN_MC*LEN_SC model from Section 5.3.6)
glm(ORDER ~ 1, # and a corresponding model with only the intercept
data=x, # on the same data
family=binomial), # and with the same binary dependent variable
test="Chisq") # return a likelihood-ratio test of the deviance difference
# the overall significance test for model.01 is significant (LR=8.22, df=3, p=0.042).