Math 322, Biostatistics M. V. Wickerhauser Homework 11 R commands output REVISED 2020-05-01 MVW to fix problem 1 erratum ########## PROBLEM 1 # (a) The following commands, similar to those in r-eg-27.txt, find # the cancers that appear at least 7 times: cancers <- scan("nci.names", what="character") # read the cancer names uc <- unique(cancers) # List the unique cancer names. atleast7 <- NULL; # Initialize an empty array for quadruplicated names. for( c in uc ) # Loop over the cancer names. if ( length(cancers[cancers==c]) > 6 ) # If name "c" is septuplicated atleast7 <- c( atleast7, c ); # ...then append it to the array. length(atleast7) # Verify that 5 cancer names are at least septuplicated atleast7 # ...and print their names for confirmation ## OUTPUT: [1] "RENAL" "BREAST" "NSCLC" "MELANOMA" "COLON" # (b) The following {\tt R} commands, similar to those in # r-eg-27.txt, extract the rows with cancers that appear at least 7 times: keepers <- is.element( cancers, atleast7 ) # Get TRUE/FALSE subsampling array length(cancers[keepers]) # ...and confirm that there will be 40 sample rows. ## OUTPUT: [1] 40 # (c) The following R commands, similar to those in # r-eg-27.txt, extract the columns with the top 10 variances: ncidata <- scan("nci.data" ) # read (lots) of numerical gene data length(ncidata)/64 # verify that there are 6830 columns # Build a matrix of 64 rows (cancer name) and 6830 columns (gene number): genexps <- matrix(ncidata,nrow=64,ncol=6830); # Find the minimum expression variance by gene number (2-margin==columns): minvar <- apply(genexps, MARGIN=2, FUN=var) # ...computes 6830 vars()'s sort(minvar,decreasing=TRUE)[1:12] # ...and look at the top 12 ## OUTPUT ## [1] 11.722255 11.323549 8.010836 7.772252 7.608615 7.531815 6.762230 ## [8] 6.643322 6.612500 6.460722 6.349983 6.316970 # # Notice that 7.6 is a threshold that cuts off the top 5. top5vars <- (minvar > 7.6) length(minvar[top5vars]) # verify that 5 genes survived ## OUTPUT: [1] 5 names(data.frame(genexps)[,top5vars]) # Find the names X**** of the # top 5 genes by expression variance: ## OUTPUT: ## [1] "X256" "X4699" "X4700" "X4701" "X6393" # (d) # Extract a submatrix # Bind the top-5-variance gene columns with the 7-replicated cancer # rows in a data frame: cancer <- data.frame(cancers)[keepers,] # 40x1 array, column name "cancer" genes <- data.frame(genexps)[keepers,top5vars] # 40x5 array, column names "X****" nci <- cbind.data.frame(cancer, genes) # 40x6 array with good column names names(nci) # check the column names for safety ## OUTPUT: ## [1] "cancer" "X256" "X4699" "X4700" "X4701" "X6393" # install.packages("tree") # Contributed package, available from library(tree) cancergene <- tree(cancer~., data=nci) # Build the classification tree plot(cancergene) # Show the decision forks graphically text(cancergene) # Label the decisions by gene number and threshold # Write the plot to a file pdf("hw11ex1d.pdf"); plot(cancergene); text(cancergene); dev.off() # (e) Misclassification rate summary(cancergene) # Summarize its [poor] quality. ## OUTPUT: ## Classification tree: ## tree(formula = cancer ~ ., data = nci) ## Variables actually used in tree construction: ## [1] "X256" "X6393" "X4700" ## Number of terminal nodes: 5 ## Residual mean deviance: 1.528 = 53.49 / 35 ## Misclassification error rate: 0.3 = 12 / 40 ########## PROBLEM 2 # Look in # http://www.math.wustl.edu/~victor/classes/ma322/r-eg-28.txt # for this code install.packages("cluster") library(cluster) # for "pam()" and cluster graphing functions. data(iris) imeas <- iris[,1:4] # just the 4 measurements # Get a prior for the number of clusters # # (a) ... by agglomerative hierarchical clustering. plot(agnes(imeas), which=2) # unclear # Write the plot to a file pdf("hw11ex2a.pdf"); plot(agnes(imeas), which=2); dev.off() # (b) ... by divisive hierarchical clustering. plot(diana(imeas), which=2) # more suggestive of k=3 clusters # Write the plot to a file pdf("hw11ex2b.pdf"); plot(diana(imeas), which=2); dev.off() # # ... so use k=3 in the clustering functions # # (c) 3 clusters by Voronoi's algorithm around means kmeans(imeas,3) # 3 clusters of sizes 96, 33, 21 --- bad # Misclassification rate: truth <- rep(c(2,3,1), each=50); # kmeans() names the species 2,3,1 guess <- kmeans(imeas, 3)$cluster sum(truth!=guess) ### OUTPUT: 71 misclassifications, out of 150 # (d) 3 clusters by Voronoi's algorithm around medians summary(pam(imeas,3)) # 3 clusters of sizes 50, 62, 38 ---- better # Misclassification rate: truth <- rep(1:3, each=50); # pam() names the species 1,2,3 guess <- pam(imeas, 3)$clustering sum(truth!=guess) ### OUTPUT: 16 misclassifications, out of 150 ########## PROBLEM 3 # Get the required data # # Alternatively, use load("nci57x13.R") ### Reading microarray gene expression data as in r-eg-27.txt cancers <- scan("nci.names", what="character") # read the cancer names ncidata <- scan("nci.data" ) # read (lots) of numerical gene data genexps <- matrix(ncidata,nrow=64,ncol=6830); uc <- unique(cancers) # lists the unique cancer names atleast3 <- NULL; # Initialize an empty array for triplicated names. for( c in uc ) # Loop over the 14 cancer names. if ( length(cancers[cancers==c]) > 2 ) # If the name "c" is triplicated atleast3 <- c( atleast3, c ); # ...then append it to the array. keepers <- is.element( cancers, atleast3 ) # Get TRUE/FALSE subsampling array # Find the minimum expression by gene number, which is the 2-index (columns): genemin <- apply(genexps, MARGIN=2, FUN=min) # ...computes 6830 min()'s top12genes <- (genemin > -0.80) # Identify the top 12 genes # Bind the good gene columns with the triplicated cancer rows in a data frame: cancer <- data.frame(cancers)[keepers,] # 57x1 array, column name "cancer" genes <- data.frame(genexps)[keepers,top12genes] # 57x12 array, column names "X****" nci <- cbind.data.frame(cancer, genes) # 57x13 array with good column names ### Install and access the multidimensional scaling library install.packages("vegan") # This contains "isomap()" library(vegan) # (a) # Isomap graphs, k=2 nearest neighbors by Euclidean distance y<-nci[,-1] # omit the "cancers" column, use only measurements idist <- dist(y,"eucl"); imap<-isomap(idist,k=2); plot(imap) # Write the plot to a file: pdf("hw11ex3a.pdf"); plot(imap); dev.off() # (b) # Isomap graphs, k=2 nearest neighbors by Manhattan distance y<-nci[,-1] # omit the "cancers" column, use only measurements idist <- dist(y,"manh"); imap<-isomap(idist,k=2); plot(imap) # Write the plot to a file: pdf("hw11ex3b.pdf"); plot(imap); dev.off()