# Example R programs and commands
# 28. Clustering
# All lines preceded by the "#" character are comments or output.
# All other left-justified lines are my input
### Make sure that all graphics functions are available:
library(graphics) # library() errors on failure; require() only warns
### Generate a 2-cluster, 2-dimensional example,
# with 50 bivariate normals per cluster in k=2 clusters.
set.seed(42) # fix the RNG so the example (and its commentary) is reproducible
xy <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
            matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(xy) <- c("x", "y")
plot(xy) # initial scatterplot, no clustering.
plot(xy, col = rep(1:2, each = 50)) # Identify true clusters by color
########## Voronoi cell clustering around means, or barycenters
#
## Voronoi algorithm to update the means, assuming 2 clusters
# This chooses 2 initial centers at random from among the data:
k <- 2
cl <- kmeans(xy, centers = k)
# NOTE: a single random start can settle in a poor local optimum;
# kmeans(xy, centers = k, nstart = 25) is more robust in practice.
plot(xy, col = cl$cluster) # Identify clusters by color 1:k
points(cl$centers, col = 1:k, pch = 8) # Mark centers with *
################## Voronoi cell clustering around medoids
#
### Install the "cluster" package only when it is missing, so the script
### does not re-download it on every run:
if (!requireNamespace("cluster", quietly = TRUE)) {
  install.packages("cluster")
}
library(cluster) # for "pam()" and cluster graphing functions.
# Reuse the 2-d example from above
## Voronoi algorithm to update the medoids, assuming 2 clusters
# NOTE(review): pam()'s default initialization is the deterministic BUILD
# phase, not a purely random choice of medoids -- see ?pam.
k <- 2
pc <- pam(xy, k)
plot(xy, col = pc$cluster) # Identify clusters by color 1:k
points(pc$medoids, col = 1:k, pch = 9, cex = 2) # Mark centers with #
# Alternative plotting in principal component variables:
plot(pc)
## Compare means with medoids:
## Compare means with medoids:
plot(xy)                              # scatterplot of the raw points
points(cl$centers, pch = 8, cex = 2)  # k-means centers: big *
points(pc$medoids, pch = 9, cex = 2)  # pam medoids: big #
########### Agglomerative clustering
#
# Estimate the number of clusters from a dendrogram, or classification
# tree.
library(cluster) # provides agnes() and diana()
### Agglomerative hierarchical clustering (agnes)
# No cluster count is supplied -- the dendrogram shows every merge.
ah <- agnes(xy)
plot(ah, which = 2) # dendrogram only
#
# Commentary: the tree has an initial fork into two trunks with
# approximately equal numbers of branches, suggesting k=2 clusters.
### Divisive hierarchical clustering (diana)
# Again no cluster count is supplied.
dh <- diana(xy)
plot(dh, which = 2) # dendrogram only
#
# Commentary: again, the tree has an initial fork into two trunks with
# approximately equal numbers of branches, suggesting k=2 clusters.
# Add a third cluster (centered at (2, 2)) and repeat:
xy3 <- rbind(xy, matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2))
plot(xy3)
plot(agnes(xy3), which = 2) # 1+2 main trunks suggests k=3
plot(diana(xy3), which = 2) # 1+2 main trunks suggests k=3
############ Fisher Iris dataset -- 3 clusters in 4 dimensions
data(iris)
imeas <- iris[, 1:4] # keep only the four numeric measurements
# Get a prior for the number of clusters from the dendrograms:
plot(agnes(imeas), which = 2) # unclear
plot(diana(imeas), which = 2) # more suggestive of k=3 clusters
#
# ... so use k=3 in the clustering functions
#
kmeans(imeas, 3) # one run gave 3 clusters of sizes 96, 33, 21 --- bad
summary(pam(imeas, 3)) # 3 clusters of sizes 50, 62, 38 ---- better
# Misclassification rate:
#
# The integer labels that kmeans() and pam() assign to clusters are
# arbitrary (and, for kmeans, can change between runs), so a fixed
# relabelling such as rep(c(2,3,1), each=50) is unreliable.  Instead,
# count the errors under the best of all label permutations:
best_match_errors <- function(guess, truth) {
  # All 6 permutations of the labels 1:3; p[guess] relabels the guesses.
  perms <- list(c(1, 2, 3), c(1, 3, 2), c(2, 1, 3),
                c(2, 3, 1), c(3, 1, 2), c(3, 2, 1))
  min(vapply(perms, function(p) sum(p[guess] != truth), integer(1)))
}
truth <- rep(1:3, each = 50) # iris holds 50 rows per species, in order
guess <- kmeans(imeas, 3)$cluster
best_match_errors(guess, truth) ### OUTPUT: number of misclassifications, out of 150
guess <- pam(imeas, 3)$clustering
best_match_errors(guess, truth) ### OUTPUT: number of misclassifications, out of 150