Example R programs and commands 27. Classification trees # All lines preceded by the "#" character are comments or output. # All other left-justified lines are my input ############# Gene expression microarray example ### Reading microarray gene expression data # # Source: # NCI microarray data # http://genome-www.stanford.edu/nci60/ # # The files "nci.names" and "nci.data" are copied on this website # at the links: # # http://www.math.wustl.edu/~victor/classes/ma322/nci.names # http://www.math.wustl.edu/~victor/classes/ma322/nci.data # # Save the files into your R folder under those names. # Read their contents into R variables with `scan()': # cancers <- scan("nci.names", what="character") # read the cancer names ncidata <- scan("nci.data" ) # read (lots) of numerical gene data # # There are 64 rows of 6830 gene expression values in "nci.data". # This fact is deducible from what is mentioned in the e-text by # Seefeld and Linder on p.308. You must divide the total number of # data by the mentioned number of rows, 64, to get the number of columns: # length(ncidata)/64 ## OUTPUT: [1] 6830 # # Build a matrix of 64 rows (cancer name) and 6830 columns (gene number): genexps <- matrix(ncidata,nrow=64,ncol=6830); ### Data clean-up # ## Get cancers with at least 3 sample rows: # uc <- unique(cancers) # lists the unique cancer names length(uc) # verify that there are 14 unique cancer names. for( c in uc ) print( cancers[cancers==c] ); # display cancer repetitions atleast3 <- NULL; # Initialize an empty array for triplicated names. for( c in uc ) # Loop over the 14 cancer names. if ( length(cancers[cancers==c]) > 2 ) # If the name "c" is triplicated atleast3 <- c( atleast3, c ); # ...then append it to the array. length(atleast3) # Verify that 8 cancer names are at least triplicated atleast3 # ...and print their names for confirmation keepers <- is.element( cancers, atleast3 ) # Get TRUE/FALSE subsampling array length(cancers[keepers]) # ...and confirm that there will be 57 sample rows. # ## Find important genes # # Find the minimum expression by gene number, which is the 2-index (columns): genemin <- apply(genexps, MARGIN=2, FUN=min) # ...computes 6830 min()'s # # EXPLANATION of 'apply': This evaluates FUNction 'min' on the 2-MARGIN # of matrix 'genexps', which is its columns, and thus returns a # vector of 6830 minimum values, one for each column. # Replace 'FUN=min' with 'FUN=max' to get column maximums. # Replace 'FUN=min' with 'FUN=mean' to get column averages, and so on. # The 1-MARGIN is rows. # Replace 'MARGIN=2' with 'MARGIN=1' to get row minimums, and so on. # # Rearrange the column mins in decreasing order to indentify the biggest: sort(genemin,decreasing=TRUE)[1:20] # ...and look at the top 20 ## OUTPUT: ## [1] -0.3700000 -0.3700000 -0.4150000 -0.4199610 -0.4200000 -0.4200195 ## [7] -0.4350000 -0.4600000 -0.4700000 -0.4800000 -0.4800000 -0.4800000 ##[13] -0.4850000 -0.4949805 -0.5000000 -0.5000000 -0.5050000 -0.5100000 ##[19] -0.5100000 -0.5100195 # ... so use -0.425 as a threshold to select the top 6 genes, as in the e-text. top6genes <- (genemin > -0.421) # NOTE: this disgrees with the e-text value. length(genemin[top6genes]) # verify that 6 genes survived top12genes <- (genemin > -0.481) # Also identify the top 12 genes length(genemin[top12genes]) # Verify that 12 genes survived ## Classification by recursive partitioning ("rpart", a replacement for "tree") install.packages("rpart") # Contributed package, available from library(rpart) # ... must be loaded to make its functions available. install.packages("rpart.plot") # Plotting package, works under R 4 and higher library(rpart.plot) # ... must be loaded to make its functions available. # Note: Confusingly, there is also an older "plot.rpart" package with similar capabilities # Top 6 genes # Extract a submatrix # # 8 triplicated cancers are row-subsetted with "keepers" # 6 highly-expressed genes are column-subsetted with "top6genes" # # Bind the good gene columns with the triplicated cancer rows in a # data frame: cancer <- data.frame(cancers)[keepers,] # 57x1 array, column name "cancer" genes6 <- data.frame(genexps)[keepers,top6genes] # 57x6 array, column names "X****" nci6 <- cbind.data.frame(cancer, genes6) # 57x7 array with good column names names(nci6) # check the column names for future use ## OUTPUT: ##[1] "cancer" "X2838" "X3234" "X3320" "X4831" "X5680" "X6596" cancergene6 <- rpart(cancer~., data=nci6) # Build the classification tree summary(cancergene6) # Summarize its [poor] quality. ## Lots of output describing the tree. rpart.plot(cancergene6) # Show the decision forks graphically printcp(cancergene6) # Display various classification error rates ## OUTPUT of printcp: ## Classification tree: ## rpart(formula = cancer ~ ., data = nci6) ## ## Variables actually used in tree construction: ## [1] X3234 X4831 X5680 ## ## Root node error: 48/57 = 0.84211 ## ## n= 57 ## ## CP nsplit rel error xerror xstd ## 1 0.14583 0 1.00000 1.12500 0.035122 ## 2 0.06250 2 0.70833 0.81250 0.073112 ## 3 0.01000 4 0.58333 0.83333 0.071957 # # Interpretation: # "rel error" is the misclassification rate for various trees of increasing depth. # "xerror" is the cross-validation error rate for various subtrees pruned from the # full tree using the complexity parameter "CP" value. It depends on how the # original data was randomly subdivided into "train" and "test" subsets by rpart() # and thus may change if rpart() is rerun. # "xstd" is the standard deviation of the cross-validation error rate. # Misclassification rate = (Root node error)*(rel error)=(0.84211)*(0.58333) = 0.4912 # where the smallest, here bottommost, "rel error" (0.58333) should be used. # # Alternatively, compute the number of misclassifications using predict(): sum( predict(cancergene6,type="class") != nci6$cancer ) # number of misclassifications sum(predict(cancergene6,type="class")!=nci6$cancer)/nrow(nci6) # misclassification rate # Top 12 genes # Extract a submatrix # # 8 triplicated cancers are row-subsetted with "keepers" # 12 highly-expressed genes are column-subsetted with "top12genes" # # Bind the good gene columns with the triplicated cancer rows in a # data frame: cancer <- data.frame(cancers)[keepers,] # 57x1 array, column name "cancer" genes12 <- data.frame(genexps)[keepers,top12genes] # 57x12 array, column names "X****" nci12 <- cbind.data.frame(cancer, genes12) # 57x13 array with good column names names(nci12) # check the column names for future use ## OUTPUT: ## [1] "cancer" "X984" "X2540" "X2838" "X2912" "X2930" "X3205" "X3234" ## [9] "X3320" "X4831" "X5006" "X5680" "X6596" cancergene12 <- rpart(cancer~., data=nci12) # Build the classification tree summary(cancergene12) # Summarize its [slightly less poor] quality. ## Lots of output. rpart.plot(cancergene12) # Show the decision forks graphically printcp(cancergene12) # Display various classification error rates # Misclassification rate = (Root node error)*(rel error) # where the smallest, here bottommost, "rel error" should be used. # # Alternatively, compute the number of misclassifications using predict(): sum(predict(cancergene12,type="class")!=nci12$cancer)/nrow(nci12) # misclassification rate # Top genes, rpart()'s choice. # Let the program choose its favorite genes (this will be slow) # cancer <- data.frame(cancers)[keepers,] # 57x1 array, column name "cancer" genes <- data.frame(genexps)[keepers,] # 57x6830 array, column names "X****" nci <- cbind.data.frame(cancer, genes) # 57x6831 array with good column names cancergene <- rpart(cancer~., data=nci) # Build the classification tree summary(cancergene) ## Lots of output summary(cancergene) # Summarize its quality. rpart.plot(cancergene) # Show the decision forks graphically printcp(cancergene) # Display various classification error rates # Misclassification rate = (Root node error)*(rel error) # where the smallest, here bottommost, "rel error" should be used. # # Alternatively, compute the number of misclassifications using predict(): sum(predict(cancergene,type="class")!=nci$cancer)/nrow(nci) # misclassification rate ############# Iris data example # # Compare the results here with linear discriminant analysis # data(iris) # Load the iris data set, included in R. irislengths <- rpart(Species ~., iris) summary(irislengths) rpart.plot(irislengths) # plot the tree printcp(irislengths) # summarize the misclassification rates # Misclassification rate: sum(predict(irislengths,type="class")!=iris$Species) # number of misclassifications sum(predict(irislengths,type="class")!=iris$Species)/nrow(iris) # rate ## Predict the species from new measurements, using the classification tree # # First example: new<-data.frame(Sepal.Length=4.9,Sepal.Width=3.3,Petal.Length=1.2,Petal.Width=0.2) predict(irislengths, new) ## OUTPUT: setosa(1) versicolor(0) virginica(0) # ==> setosa # # Second example: new<-data.frame(Sepal.Length=6.8,Sepal.Width=3.0,Petal.Length=6.1,Petal.Width=2.0) predict(irislengths, new) ## OUTPUT setosa(0) versicolor(0) virginica(1) # ==> virginica