Example R programs and commands
28. Clustering

# All lines preceded by the "#" character are comments or output.
# All other left-justified lines are my input.

### Make sure that all graphics functions are available:
require(graphics)

### Generate a 2-cluster, 2-dimensional example,
# with 50 bivariate normals per cluster in k=2 clusters.
xy <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
            matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(xy) <- c("x", "y")
plot(xy)                             # Initial scatterplot, no clustering
plot(xy, col = rep(1:2, each = 50))  # Identify the true clusters by color

########## Voronoi cell clustering around means, or barycenters
#
## Voronoi algorithm to update the means, assuming 2 clusters.
# By default this chooses the 2 initial centers at random from among
# the data, so results can vary from run to run:
k <- 2; cl <- kmeans(xy, centers = k)
plot(xy, col = cl$cluster)              # Identify clusters by color 1:k
points(cl$centers, col = 1:k, pch = 8)  # Mark centers with * (pch=8)

################## Voronoi cell clustering around medoids
# install.packages("cluster")
library(cluster)   # for "pam()" and cluster graphing functions
# Reuse the 2-d example from above.
## Voronoi algorithm to update the medoids, assuming 2 clusters.
# This chooses 2 initial medoids from among the data points:
k <- 2; pc <- pam(xy, k)
plot(xy, col = pc$clustering)   # Identify clusters by color 1:k
points(pc$medoids, col = 1:k, pch = 9, cex = 2)  # Mark medoids with diamond-plus (pch=9)

# Alternative plotting in principal component variables:
plot(pc)

## Compare means with medoids:
plot(xy)                              # Scatterplot of clusters
points(cl$centers, pch = 8, cex = 2)  # Mark means with big *
points(pc$medoids, pch = 9, cex = 2)  # Mark medoids with big diamond-plus

########### Agglomerative clustering
#
# Estimate the number of clusters from a dendrogram, or classification
# tree.
library(cluster)   # for "agnes()" and "diana()" functions

### Agglomerative hierarchical clustering (agnes)
ah <- agnes(xy)      # Notice that the number of clusters is not specified
plot(ah, which = 2)  # Plot only the dendrogram
#
# Commentary: the tree has an initial fork into two trunks with
# approximately equal numbers of branches, suggesting k=2 clusters.

### Divisive hierarchical clustering (diana)
dh <- diana(xy)      # Notice that the number of clusters is not specified
plot(dh, which = 2)  # Plot only the dendrogram
#
# Commentary: again, the tree has an initial fork into two trunks with
# approximately equal numbers of branches, suggesting k=2 clusters.

# Add a third cluster and repeat:
xy3 <- rbind(xy, matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2))
plot(xy3)
plot(agnes(xy3), which = 2)  # 1+2 main trunks suggests k=3
plot(diana(xy3), which = 2)  # 1+2 main trunks suggests k=3

############ Fisher iris dataset -- 3 clusters in 4 dimensions
data(iris)
imeas <- iris[, 1:4]           # Just the 4 measurements
# Get a prior for the number of clusters:
plot(agnes(imeas), which = 2)  # Unclear
plot(diana(imeas), which = 2)  # More suggestive of k=3 clusters
#
# ... so use k=3 in the clustering functions.
#
kmeans(imeas, 3)        # 3 clusters of sizes 96, 33, 21 --- bad
summary(pam(imeas, 3))  # 3 clusters of sizes 50, 62, 38 --- better

# Misclassification rate.
# Caution: kmeans() numbers its clusters arbitrarily (the labels depend
# on the random start), so the mapping 2,3,1 below matched one
# particular run and may need to be permuted for another run:
truth <- rep(c(2,3,1), each = 50)  # kmeans() named the species 2,3,1
guess <- kmeans(imeas, 3)$cluster
sum(truth != guess)  ### OUTPUT: number of misclassifications, out of 150
truth <- rep(1:3, each = 50)       # pam() named the species 1,2,3
guess <- pam(imeas, 3)$clustering
sum(truth != guess)  ### OUTPUT: number of misclassifications, out of 150
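
### Reading cluster memberships off a dendrogram
# Once a dendrogram has suggested a value of k, the memberships can be
# extracted by cutting the tree into k branches. A minimal sketch,
# assuming the xy3 data and the "cluster" package from above: convert
# the agnes tree to an hclust object, then cut it with cutree() from
# base R (package stats).
ah3 <- agnes(xy3)
memb <- cutree(as.hclust(ah3), k = 3)  # Vector of cluster labels 1:3
plot(xy3, col = memb)                  # Identify the 3 clusters by color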
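
### A numerical check on the choice of k: average silhouette width
# The dendrogram readings above are visual; pam() also reports an
# average silhouette width, which tends to peak at a good k. A minimal
# sketch, assuming xy3 from above; the range 2:6 of candidate k values
# is an arbitrary choice for illustration.
for (k in 2:6)
  cat("k =", k, " average silhouette width:",
      pam(xy3, k)$silinfo$avg.width, "\n")
# Expect the largest value at k=3, agreeing with the dendrograms.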
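
### Cross-tabulating species against cluster labels
# Because the cluster numbering is arbitrary, a confusion matrix is a
# safer way to judge the iris clusterings than the hard-coded "truth"
# vectors above. A minimal sketch using base R's table():
table(Species = iris$Species, Cluster = pam(imeas, 3)$clustering)
# Each species row should be dominated by a single cluster column; the
# entries outside that column are the misclassifications counted above.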