Example R programs and commands
28. Clustering

# All lines preceded by the "#" character are comments or output.
# All other left-justified lines are my input.

### Make sure that all graphics functions are available:
require(graphics)

### Generate a 2-cluster, 2-dimensional example,
# with 50 bivariate normals per cluster in k=2 clusters.
xy <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
            matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(xy) <- c("x", "y")
plot(xy)                             # Initial scatterplot, no clustering
plot(xy, col = rep(1:2, each = 50))  # Identify the true clusters by color

########## Voronoi cell clustering around means, or barycenters
#
## Voronoi algorithm to update the means, assuming 2 clusters.
# By default this chooses the 2 initial centers at random from among
# the data, so results can vary from run to run:
k <- 2; cl <- kmeans(xy, centers = k)
plot(xy, col = cl$cluster)              # Identify clusters by color 1:k
points(cl$centers, col = 1:k, pch = 8)  # Mark centers with * (pch=8)

################## Voronoi cell clustering around medoids
# install.packages("cluster")
library(cluster)   # for "pam()" and cluster graphing functions
# Reuse the 2-d example from above.
## Voronoi algorithm to update the medoids, assuming 2 clusters.
# This chooses 2 initial medoids from among the data points:
k <- 2; pc <- pam(xy, k)
plot(xy, col = pc$clustering)   # Identify clusters by color 1:k
points(pc$medoids, col = 1:k, pch = 9, cex = 2)  # Mark medoids with diamond-plus (pch=9)

# Alternative plotting in principal component variables:
plot(pc)

## Compare means with medoids:
plot(xy)                              # Scatterplot of clusters
points(cl$centers, pch = 8, cex = 2)  # Mark means with big *
points(pc$medoids, pch = 9, cex = 2)  # Mark medoids with big diamond-plus

########### Agglomerative clustering
#
# Estimate the number of clusters from a dendrogram, or classification
# tree.
library(cluster)   # for "agnes()" and "diana()" functions

### Agglomerative hierarchical clustering (agnes)
ah <- agnes(xy)      # Notice that the number of clusters is not specified
plot(ah, which = 2)  # Plot only the dendrogram
#
# Commentary: the tree has an initial fork into two trunks with
# approximately equal numbers of branches, suggesting k=2 clusters.

### Divisive hierarchical clustering (diana)
dh <- diana(xy)      # Notice that the number of clusters is not specified
plot(dh, which = 2)  # Plot only the dendrogram
#
# Commentary: again, the tree has an initial fork into two trunks with
# approximately equal numbers of branches, suggesting k=2 clusters.

# Add a third cluster and repeat:
xy3 <- rbind(xy, matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2))
plot(xy3)
plot(agnes(xy3), which = 2)  # 1+2 main trunks suggests k=3
plot(diana(xy3), which = 2)  # 1+2 main trunks suggests k=3

############ Fisher iris dataset -- 3 clusters in 4 dimensions
data(iris)
imeas <- iris[, 1:4]           # Just the 4 measurements
# Get a prior for the number of clusters:
plot(agnes(imeas), which = 2)  # Unclear
plot(diana(imeas), which = 2)  # More suggestive of k=3 clusters
#
# ... so use k=3 in the clustering functions.
#
kmeans(imeas, 3)        # 3 clusters of sizes 96, 33, 21 --- bad
summary(pam(imeas, 3))  # 3 clusters of sizes 50, 62, 38 --- better

# Misclassification rate.
# Caution: kmeans() numbers its clusters arbitrarily (the labels depend
# on the random start), so the mapping 2,3,1 below matched one
# particular run and may need to be permuted for another run:
truth <- rep(c(2,3,1), each = 50)  # kmeans() named the species 2,3,1
guess <- kmeans(imeas, 3)$cluster
sum(truth != guess)  ### OUTPUT: number of misclassifications, out of 150
truth <- rep(1:3, each = 50)       # pam() named the species 1,2,3
guess <- pam(imeas, 3)$clustering
sum(truth != guess)  ### OUTPUT: number of misclassifications, out of 150
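
### Reading cluster memberships off a dendrogram
# Once a dendrogram has suggested a value of k, the memberships can be
# extracted by cutting the tree into k branches. A minimal sketch,
# assuming the xy3 data and the "cluster" package from above: convert
# the agnes tree to an hclust object, then cut it with cutree() from
# base R (package stats).
ah3 <- agnes(xy3)
memb <- cutree(as.hclust(ah3), k = 3)  # Vector of cluster labels 1:3
plot(xy3, col = memb)                  # Identify the 3 clusters by color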
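
### A numerical check on the choice of k: average silhouette width
# The dendrogram readings above are visual; pam() also reports an
# average silhouette width, which tends to peak at a good k. A minimal
# sketch, assuming xy3 from above; the range 2:6 of candidate k values
# is an arbitrary choice for illustration.
for (k in 2:6)
  cat("k =", k, " average silhouette width:",
      pam(xy3, k)$silinfo$avg.width, "\n")
# Expect the largest value at k=3, agreeing with the dendrograms.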
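
### Cross-tabulating species against cluster labels
# Because the cluster numbering is arbitrary, a confusion matrix is a
# safer way to judge the iris clusterings than the hard-coded "truth"
# vectors above. A minimal sketch using base R's table():
table(Species = iris$Species, Cluster = pam(imeas, 3)$clustering)
# Each species row should be dominated by a single cluster column; the
# entries outside that column are the misclassifications counted above.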