Example R programs and commands
			    27. Classification trees

# All lines preceded by the "#" character are comments or output.
# All other left-justified lines are my input


#############  Gene expression microarray example

###  Reading microarray gene expression data
#
# Source:
#	NCI microarray data
#	http://genome-www.stanford.edu/nci60/
#
# The files "nci.names" and "nci.data" are copied on this website
# at the links:
#
#    http://www.math.wustl.edu/~victor/classes/ma322/nci.names
#    http://www.math.wustl.edu/~victor/classes/ma322/nci.data
#
# Save the files into your R folder under those names.
# Read their contents into R variables with `scan()':
#
cancers <- scan("nci.names", what="character") # read the cancer names
ncidata <- scan("nci.data" )      # read (lots) of numerical gene data
#
# There are 64 rows of 6830 gene expression values in "nci.data".
# This fact is deducible from what is mentioned in the e-text by
# Seefeld and Linder on p.308. You must divide the total number of
# data by the mentioned number of rows, 64, to get the number of columns:
#
length(ncidata)/64
## OUTPUT: [1] 6830
#
# Build a matrix of 64 rows (cancer name) and 6830 columns (gene number):
genexps <- matrix(ncidata,nrow=64,ncol=6830);


### Data clean-up
#
## Get cancers with at least 3 sample rows:
#
uc <- unique(cancers)   # lists the unique cancer names
length(uc)              # verify that there are 14 unique cancer names.

for( c in uc ) print( cancers[cancers==c] ); # display cancer repetitions

atleast3 <- NULL;       # Initialize an empty array for triplicated names.
for( c in uc )                             # Loop over the 14 cancer names.
   if ( length(cancers[cancers==c]) > 2 )  # If the name "c" is triplicated
     atleast3 <- c( atleast3, c );         # ...then append it to the array.

length(atleast3)   # Verify that 8 cancer names are at least triplicated
atleast3           # ...and print their names for confirmation

keepers <- is.element( cancers, atleast3 ) # Get TRUE/FALSE subsampling array
length(cancers[keepers])  # ...and confirm that there will be 57 sample rows.

#
## Find important genes
#
# Find the minimum expression by gene number, which is the 2-index (columns):
genemin <- apply(genexps, MARGIN=2, FUN=min)      # ...computes 6830 min()'s
# 
# EXPLANATION of 'apply': This evaluates FUNction 'min' on the 2-MARGIN
#   of matrix 'genexps', which is its columns, and thus returns a
#   vector of 6830 minimum values, one for each column.
# Replace 'FUN=min' with 'FUN=max' to get column maximums.
# Replace 'FUN=min' with 'FUN=mean' to get column averages, and so on.
# The 1-MARGIN is rows.
# Replace 'MARGIN=2' with 'MARGIN=1' to get row minimums, and so on.
#
# Rearrange the column mins in decreasing order to indentify the biggest:
sort(genemin,decreasing=TRUE)[1:20]   # ...and look at the top 20
## OUTPUT:
## [1] -0.3700000 -0.3700000 -0.4150000 -0.4199610 -0.4200000 -0.4200195
## [7] -0.4350000 -0.4600000 -0.4700000 -0.4800000 -0.4800000 -0.4800000
##[13] -0.4850000 -0.4949805 -0.5000000 -0.5000000 -0.5050000 -0.5100000
##[19] -0.5100000 -0.5100195					      
# ... so use -0.425 as a threshold to select the top 6 genes, as in the e-text.
top6genes <- (genemin > -0.421)   # NOTE: this disgrees with the e-text value.
length(genemin[top6genes])       # verify that 6 genes survived

top12genes <- (genemin > -0.481)  # Also identify the top 12 genes
length(genemin[top12genes])       # Verify that 12 genes survived


## Classification by recursive partitioning ("rpart", a replacement for "tree")
install.packages("rpart")  # Contributed package, available from <cran.wustl.edu>
library(rpart)             # ... must be loaded to make its functions available.
install.packages("rpart.plot")  # Plotting package, works under R 4 and higher
library(rpart.plot)             # ... must be loaded to make its functions available.
# Note: Confusingly, there is also an older "plot.rpart" package with similar capabilities 

# Top 6 genes
# Extract a submatrix
#
# 8 triplicated cancers are row-subsetted with "keepers"
# 6 highly-expressed genes are column-subsetted with "top6genes"
#
# Bind the good gene columns with the triplicated cancer rows in a
# data frame:
cancer <- data.frame(cancers)[keepers,]          # 57x1 array, column name "cancer"
genes6 <- data.frame(genexps)[keepers,top6genes]  # 57x6 array, column names "X****"

nci6 <- cbind.data.frame(cancer, genes6)   # 57x7 array with good column names

names(nci6)             # check the column names for future use
## OUTPUT:
##[1] "cancer" "X2838"  "X3234"  "X3320"  "X4831"  "X5680"  "X6596" 

cancergene6 <- rpart(cancer~., data=nci6)  # Build the classification tree
summary(cancergene6)                      # Summarize its [poor] quality.
## Lots of output describing the tree. 

rpart.plot(cancergene6) # Show the decision forks graphically

printcp(cancergene6)    # Display various classification error rates 
## OUTPUT of printcp:
## Classification tree:
## rpart(formula = cancer ~ ., data = nci6)
##
## Variables actually used in tree construction:
## [1] X3234 X4831 X5680
##
## Root node error: 48/57 = 0.84211
##
## n= 57 
##
##       CP nsplit rel error  xerror     xstd
## 1 0.14583      0   1.00000 1.12500 0.035122
## 2 0.06250      2   0.70833 0.81250 0.073112
## 3 0.01000      4   0.58333 0.83333 0.071957
#
# Interpretation:
# "rel error" is the misclassification rate for various trees of increasing depth.
# "xerror" is the cross-validation error rate for various subtrees pruned from the
#     full tree using the complexity parameter "CP" value.  It depends on how the
#     original data was randomly subdivided into "train" and "test" subsets by rpart()
#     and thus may change if rpart() is rerun.
# "xstd" is the standard deviation of the cross-validation error rate.
# Misclassification rate = (Root node error)*(rel error)=(0.84211)*(0.58333) = 0.4912
#    where the smallest, here bottommost, "rel error" (0.58333) should be used.
#
# Alternatively, compute the number of misclassifications using predict():
sum( predict(cancergene6,type="class") != nci6$cancer )  # number of misclassifications
sum(predict(cancergene6,type="class")!=nci6$cancer)/nrow(nci6)  # misclassification rate


# Top 12 genes
# Extract a submatrix
#
# 8 triplicated cancers are row-subsetted with "keepers"
# 12 highly-expressed genes are column-subsetted with "top12genes"
#
# Bind the good gene columns with the triplicated cancer rows in a
# data frame:
cancer <- data.frame(cancers)[keepers,]          # 57x1 array, column name "cancer"
genes12 <- data.frame(genexps)[keepers,top12genes] # 57x12 array, column names "X****"

nci12 <- cbind.data.frame(cancer, genes12)   # 57x13 array with good column names

names(nci12)           # check the column names for future use
## OUTPUT:
##  [1] "cancer" "X984"   "X2540"  "X2838"  "X2912"  "X2930"  "X3205"  "X3234" 
##  [9] "X3320"  "X4831"  "X5006"  "X5680"  "X6596" 

cancergene12 <- rpart(cancer~., data=nci12) # Build the classification tree
summary(cancergene12)                      # Summarize its [slightly less poor] quality.
## Lots of output. 

rpart.plot(cancergene12)   # Show the decision forks graphically

printcp(cancergene12)    # Display various classification error rates 
# Misclassification rate = (Root node error)*(rel error)
#    where the smallest, here bottommost, "rel error" should be used.
#
# Alternatively, compute the number of misclassifications using predict():
sum(predict(cancergene12,type="class")!=nci12$cancer)/nrow(nci12)  # misclassification rate


# Top genes, rpart()'s choice.
# Let the program choose its favorite genes (this will be slow)
#
cancer <- data.frame(cancers)[keepers,] # 57x1 array, column name "cancer"
genes <- data.frame(genexps)[keepers,]  # 57x6830 array, column names "X****"
nci <- cbind.data.frame(cancer, genes)  # 57x6831 array with good column names
cancergene <- rpart(cancer~., data=nci)  # Build the classification tree
summary(cancergene)
## Lots of output
summary(cancergene)                      # Summarize its quality.

rpart.plot(cancergene)   # Show the decision forks graphically

printcp(cancergene)    # Display various classification error rates 
# Misclassification rate = (Root node error)*(rel error)
#    where the smallest, here bottommost, "rel error" should be used.
#
# Alternatively, compute the number of misclassifications using predict():
sum(predict(cancergene,type="class")!=nci$cancer)/nrow(nci)  # misclassification rate


#############  Iris data example
#
# Compare the results here with linear discriminant analysis
#
data(iris)         #  Load the iris data set, included in R.
irislengths <- rpart(Species ~., iris)
summary(irislengths)

rpart.plot(irislengths)  # plot the tree

printcp(irislengths)     # summarize the misclassification rates

# Misclassification rate:
sum(predict(irislengths,type="class")!=iris$Species)  # number of misclassifications
sum(predict(irislengths,type="class")!=iris$Species)/nrow(iris)  # rate


## Predict the species from new measurements, using the classification tree
#
# First example:
new<-data.frame(Sepal.Length=4.9,Sepal.Width=3.3,Petal.Length=1.2,Petal.Width=0.2)
predict(irislengths, new)  ## OUTPUT:   setosa(1) versicolor(0) virginica(0)
# ==> setosa
#
# Second example:
new<-data.frame(Sepal.Length=6.8,Sepal.Width=3.0,Petal.Length=6.1,Petal.Width=2.0)
predict(irislengths, new)  ## OUTPUT    setosa(0) versicolor(0) virginica(1)
# ==> virginica