### cluster analysis: identifying structure in data by quantifying similarities between cases (not variables) and expressing the similarities between (groups/clusters of) cases in a dendrogram
#+ fig.width=12, fig.height=8
rm(list=ls(all=TRUE)) # clear the workspace; NOTE(review): rm(list=ls()) removes objects only, not loaded packages or options -- restarting R gives a truly clean session and is safer in shared scripts
library(amap); library(cluster) # amap: provides Dist() with extra distance measures (used below); cluster: NOTE(review): not called in this chunk -- confirm it is needed
library(fpc); library(pvclust) # pvclust: bootstrap-based cluster validation (used below); fpc: NOTE(review): not called in this chunk -- confirm it is needed
# similarity measures for interval/ratio-scaled variables, illustrated on two toy vectors
aa <- c(0, 1, 2, 3) # case 1
bb <- c(2, 3, 1, 4) # case 2
aabb <- rbind(aa, bb) # both cases as rows of one matrix (input format for Dist() below)
ab.diffs <- abs(aa - bb) # element-wise absolute differences, reused by several measures
sqrt(sum(ab.diffs^2)) # Euclidean distance: square root of the summed squared differences
sum(ab.diffs) # Manhattan (city-block) distance: summed absolute differences
max(ab.diffs) # maximum distance: the single largest absolute difference
sum(ab.diffs / abs(aa + bb)) # Canberra distance: absolute differences scaled by the element sums
p <- 1; sum(ab.diffs^p)^(1/p) # Minkowski distance with p=1, identical to Manhattan
p <- 2; sum(ab.diffs^p)^(1/p) # Minkowski distance with p=2, identical to Euclidean
1 - sum(aa*bb)/sqrt(sum(aa^2)*sum(bb^2)) # "Pearson" distance in amap's sense: 1 minus the cosine similarity
1 - cor(aa, bb) # correlation distance: 1 minus Pearson's r
1 - cor.test(aa, bb, method="pearson")$estimate # the same value, extracted from cor.test()
cor.test(aa, bb, method="spearman")$statistic # S statistic underlying Spearman's rho (sum of squared rank differences)
1-Dist(aabb, method="pearson") # amap::Dist's "pearson" is 1 minus the uncentered correlation, so 1-Dist() yields the cosine similarity of aa and bb -- cf. the "Pearson" line above
# example 1: English consonants
data.file <- read.delim("104_10b_consonants.csv", # load the consonant data
   row.names=1) # row.names=1: column 1 holds the case names shown in the dendrogram
data.dist <- dist(data.file, # distance matrix for the rows of the data just loaded
   method="euclidean") # using the Euclidean distance
data.clust <- hclust(data.dist, # cluster the distance matrix
   method="ward.D2") # using Ward's amalgamation rule
plot(data.clust) # draw the resulting dendrogram
# interpretation: it seems like there are 3 or 4 clusters
rect.hclust(data.clust, 3); cutree(data.clust, 3) # 3 clusters seem more appropriate given the data
rect.hclust(data.clust, 4, border="green"); cutree(data.clust, 4) # 4 seem more appropriate given articulation
# validation: resampling: what happens if we fit many trees on bootstrapped samples of the data?
data.pvclust <- pvclust( # make data.pvclust the validation object
   t(data.file), # pvclust clusters columns, so the input must be transposed
   method.dist="euclidean", # same distance measure as before
   method.hclust="ward.D2", # same amalgamation rule as before
   nboot=500, # number of bootstrap resamples
   parallel=TRUE) # use more than 1 core/processor/thread, if possible
plot(data.pvclust) # dendrogram annotated with bootstrap p-values, then
pvrect(data.pvclust) # highlight the well-supported clusters and
pvpick(data.pvclust) # list which element belongs to which of those clusters
## example 2: collocates of 7 nouns
data.file <- read.delim("104_10c_collocates.csv", # load the collocate data
   row.names=1) # row.names=1: column 1 holds the case names shown in the dendrogram
data.dist <- Dist(data.file, # amap::Dist distance matrix for the rows of the data just loaded
   method="correlation") # NB: the correlation distance (1-r), not Euclidean
data.clust <- hclust(data.dist, # cluster the distance matrix
   method="ward.D2") # using Ward's amalgamation rule
plot(data.clust, # draw the resulting dendrogram,
   hang=-1) # with hang=-1 so all leaves hang down to the same height
# interpretation: it seems like there are 2 clusters
rect.hclust(data.clust, 2) # mark them in the plot
# validation: resampling: what happens if we fit many trees on bootstrapped samples of the data?
data.pvclust <- pvclust( # make data.pvclust the validation object
   t(data.file), # pvclust clusters columns, so the input must be transposed
   method.dist="correlation", # same distance measure as before
   method.hclust="ward.D2", # same amalgamation rule as before
   nboot=500, # number of bootstrap resamples
   parallel=TRUE) # use more than 1 core/processor/thread, if possible
plot(data.pvclust) # dendrogram annotated with bootstrap p-values, then
pvrect(data.pvclust) # highlight the well-supported clusters and
pvpick(data.pvclust) # list which element belongs to which of those clusters