Math 322, Biostatistics
M. V. Wickerhauser
Homework 11 R commands output
REVISED 2020-05-01 MVW to fix problem 1 erratum


########## PROBLEM 1

# (a) The following commands, similar to those in r-eg-27.txt, find
#     the cancers that appear at least 7 times: 
cancers <- scan("nci.names", what="character") # read the cancer names
uc <- unique(cancers)  # List the unique cancer names.
atleast7 <- NULL; # Initialize an empty array for quadruplicated names.
for( c in uc )                  # Loop over the cancer names.
   if ( length(cancers[cancers==c]) > 6 )  # If name "c" is septuplicated
     atleast7 <- c( atleast7, c );         # ...then append it to the array.
length(atleast7)   # Verify that 5 cancer names are at least septuplicated
atleast7           # ...and print their names for confirmation
## OUTPUT: [1] "RENAL"    "BREAST"   "NSCLC"    "MELANOMA" "COLON"

# (b)  The following {\tt R} commands, similar to those in
# r-eg-27.txt, extract the rows with cancers that appear at least 7 times:
keepers <- is.element( cancers, atleast7 ) # Get TRUE/FALSE subsampling array
length(cancers[keepers])  # ...and confirm that there will be 40 sample rows.
## OUTPUT: [1] 40

# (c)  The following R commands, similar to those in 
# r-eg-27.txt, extract the columns with the top 10 variances:
ncidata <- scan("nci.data" )      # read (lots) of numerical gene data	
length(ncidata)/64   # verify that there are 6830 columns
# Build a matrix of 64 rows (cancer name) and 6830 columns (gene number):
genexps <- matrix(ncidata,nrow=64,ncol=6830);
# Find the minimum expression variance by gene number (2-margin==columns):
minvar <- apply(genexps, MARGIN=2, FUN=var)    # ...computes 6830 vars()'s
sort(minvar,decreasing=TRUE)[1:12]   # ...and look at the top 12
## OUTPUT
## [1] 11.722255 11.323549  8.010836  7.772252  7.608615  7.531815  6.762230
## [8]  6.643322  6.612500  6.460722  6.349983  6.316970
#
# Notice that 7.6 is a threshold that cuts off the top 5.
top5vars <- (minvar > 7.6)
length(minvar[top5vars])       # verify that 5 genes survived
## OUTPUT: [1] 5
names(data.frame(genexps)[,top5vars]) # Find the names X**** of the
# top 5 genes by expression variance:
## OUTPUT: 
## [1] "X256"  "X4699" "X4700" "X4701" "X6393"

# (d) 
# Extract a submatrix
# Bind the top-5-variance gene columns with the 7-replicated cancer
# rows in a data frame:
cancer <- data.frame(cancers)[keepers,]          # 40x1 array, column name "cancer"
genes <- data.frame(genexps)[keepers,top5vars]  # 40x5 array, column names "X****"
nci <- cbind.data.frame(cancer, genes)   # 40x6 array with good column names
names(nci)             # check the column names for safety
## OUTPUT:
## [1] "cancer" "X256"   "X4699"  "X4700"  "X4701"  "X6393" 
#
install.packages("tree")  # Contributed package, available from <cran.wustl.edu>
library(tree)
cancergene <- tree(cancer~., data=nci)  # Build the classification tree
plot(cancergene)   # Show the decision forks graphically
text(cancergene)   # Label the decisions by gene number and threshold
# Write the plot to a file
pdf("hw11ex1d.pdf"); plot(cancergene); text(cancergene); dev.off()

# (e) Misclassification rate
summary(cancergene)                     # Summarize its [poor] quality.
## OUTPUT: 
## Classification tree:			       
## tree(formula = cancer ~ ., data = nci)	       
## Variables actually used in tree construction:  
## [1] "X256"  "X6393" "X4700"		       
## Number of terminal nodes:  5 		       
## Residual mean deviance:  1.528 = 53.49 / 35    
## Misclassification error rate: 0.3 = 12 / 40    


########## PROBLEM 2


# Look in
#   http://www.math.wustl.edu/~victor/classes/ma322/r-eg-28.txt
# for this code
install.packages("cluster")
library(cluster)    # for "pam()" and cluster graphing functions.
data(iris)
imeas <- iris[,1:4]   # just the 4 measurements

# Get a prior for the number of clusters
#
# (a) ... by agglomerative hierarchical clustering.
plot(agnes(imeas), which=2)   # unclear
# Write the plot to a file
pdf("hw11ex2a.pdf"); plot(agnes(imeas), which=2); dev.off()

# (b) ... by divisive hierarchical clustering.
plot(diana(imeas), which=2)   # more suggestive of k=3 clusters
# Write the plot to a file
pdf("hw11ex2b.pdf"); plot(diana(imeas), which=2); dev.off()
#
# ... so use k=3 in the clustering functions
#
# (c) 3 clusters by Voronoi's algorithm around means
kmeans(imeas,3)    # 3 clusters of sizes 96, 33, 21 --- bad
# Misclassification rate:
truth <- rep(c(2,3,1), each=50);  # kmeans() names the species 2,3,1
guess <- kmeans(imeas, 3)$cluster
sum(truth!=guess)   ### OUTPUT: 71 misclassifications, out of 150

# (d) 3 clusters by Voronoi's algorithm around medians
summary(pam(imeas,3))  # 3 clusters of sizes 50, 62, 38 ---- better
# Misclassification rate:
truth <- rep(1:3, each=50);   # pam() names the species 1,2,3
guess <- pam(imeas, 3)$clustering
sum(truth!=guess)   ### OUTPUT: 16 misclassifications, out of 150


########## PROBLEM 3

# Get the required data
#
# Alternatively, use load("nci57x13.R")

###  Reading microarray gene expression data as in r-eg-27.txt
cancers <- scan("nci.names", what="character") # read the cancer names
ncidata <- scan("nci.data" )      # read (lots) of numerical gene data
genexps <- matrix(ncidata,nrow=64,ncol=6830);
uc <- unique(cancers)   # lists the unique cancer names
atleast3 <- NULL;       # Initialize an empty array for triplicated names.
for( c in uc )                             # Loop over the 14 cancer names.
   if ( length(cancers[cancers==c]) > 2 )  # If the name "c" is triplicated
     atleast3 <- c( atleast3, c );         # ...then append it to the array.
keepers <- is.element( cancers, atleast3 ) # Get TRUE/FALSE subsampling array
# Find the minimum expression by gene number, which is the 2-index (columns):
genemin <- apply(genexps, MARGIN=2, FUN=min)      # ...computes 6830 min()'s
top12genes <- (genemin > -0.80)  # Identify the top 12 genes
# Bind the good gene columns with the triplicated cancer rows in a data frame:
cancer <- data.frame(cancers)[keepers,]          # 57x1 array, column name "cancer"
genes <- data.frame(genexps)[keepers,top12genes] # 57x12 array, column names "X****"
nci <- cbind.data.frame(cancer, genes)   # 57x13 array with good column names

### Install and access the multidimensional scaling library
install.packages("vegan")    # This contains "isomap()"
library(vegan)


# (a) 
# Isomap graphs, k=2 nearest neighbors by Euclidean distance
y<-nci[,-1]  # omit the "cancers" column, use only measurements
idist <- dist(y,"eucl");
imap<-isomap(idist,k=2);
plot(imap)

# Write the plot to a file:
pdf("hw11ex3a.pdf"); plot(imap); dev.off()


# (b) 
# Isomap graphs, k=2 nearest neighbors by Manhattan distance
y<-nci[,-1]  # omit the "cancers" column, use only measurements
idist <- dist(y,"manh");
imap<-isomap(idist,k=2);
plot(imap)

# Write the plot to a file:
pdf("hw11ex3b.pdf"); plot(imap); dev.off()