# Example R programs and commands
# 26. Multivariate analysis; linear discriminant analysis
# All lines preceded by the "#" character are my comments.
# All other left-justified lines are my input.
# All other indented lines are the R program output.

################## Displaying multivariate data ##########################

# Load a 4-variate data set on irises (150 flowers, 3 species):
data(iris)
# ...now there is a data.frame named "iris"

### Basic data frame examination

# See the names of the variables in "iris"
names(iris)
## Output:
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"

# See what values the "Species" member (a factor) takes
levels(iris$Species)
## Output:
## [1] "setosa" "versicolor" "virginica"

# See how many data rows there are in "iris"
nrow(iris)
## Output:
## [1] 150

### Data visualizations

## Histograms

# Plot the histogram of "Sepal.Length" for "virginica"
hist(iris$Sepal.Length[iris$Species=="virginica"])

# Plot histograms of the 4 variables for "virginica"
par(mfrow=c(2,2));  # arrange 4 plots in a 2x2 matrix
iris1 <- iris[iris$Species=="virginica",1:4];  # Use first 4 columns
for( i in 1:4 )
  hist( iris1[,i], xlab=NULL, main=names(iris1)[i] );
# ...emphasizing the variates in the plot titles

# Plot histograms of the 4 variables for all 3 species
par(mfrow=c(3,4));  # arrange 12 plots in 3 1x4 matrices
for( j in levels(iris$Species) ) {
  iris1 <- iris[iris$Species==j, 1:4];  # First 4 columns of species "j"
  for( i in 1:4 ) {
    hist( iris1[,i], xlab=names(iris1)[i], main=j );
  }
}
# ...emphasizing the species in the titles atop each row of 4 plots

## Box plots

# Box plots comparing the 4 variables by species.
# plot() with a formula whose right-hand side is a factor draws box plots.
par(mfrow=c(2,2))
plot( Sepal.Length ~ Species, data=iris)
plot( Sepal.Width ~ Species, data=iris)
plot( Petal.Length ~ Species, data=iris)
plot( Petal.Width ~ Species, data=iris)

## Scatter plots

# Plot 2-D scatterplots of all (4 choose 2)=6 pairs of variables
# Create point labels "s", "v", "c" for the 3 species
# ("X" is a placeholder; every entry is overwritten in the next step)
point.labels <- rep("X", nrow(iris));
# Assign one plotting character per species
point.labels[iris$Species=="setosa"] <- "s"
point.labels[iris$Species=="virginica"] <- "v"
point.labels[iris$Species=="versicolor"] <- "c"
pairs( iris[,1:4], pch=point.labels );

# Plot 3-D scatterplots of all (4 choose 3)=4 triples of variables
library(scatterplot3d);  # after install.packages("scatterplot3d")
par(mfrow=c(2,2));
for(i in 1:4) scatterplot3d(iris[c(-i,-5)], pch=point.labels);
# ...a loop which is a tricky way of executing the equivalent 4 commands:
scatterplot3d(iris[c(1,2,3)], pch=point.labels);
scatterplot3d(iris[c(1,2,4)], pch=point.labels);
scatterplot3d(iris[c(1,3,4)], pch=point.labels);
scatterplot3d(iris[c(2,3,4)], pch=point.labels);

################# Testing for outliers by Mahalanobis distance ############
# This is one way to detect misidentified samples.

# Plot the Mahalanobis distance of each 4-variable sample
dev.off()  # resets par() and clears the previous plot.
iris1 <- iris[iris$Species=="setosa", 1:4];  # First 4 columns of "setosa"
# mahalanobis() returns SQUARED distances; take sqrt() to get distances
# in units of standard deviations.
dist2 <- mahalanobis(iris1, center=colMeans(iris1), cov=var(iris1));
plot(sqrt(dist2), main="Setosa", ylab="Mahalanobis distance (std. devs)" );
# ...emphasizing the species in the titles atop each plot

# Plot the Mahalanobis distances of the samples for all 3 species
par(mfrow=c(1,3));  # arrange 3 species in adjacent plots
for( j in levels(iris$Species) ) {
  iris1 <- iris[iris$Species==j, 1:4];  # First 4 columns of species "j"
  # Use "mdist" rather than "dist" so we do not shadow base R's dist()
  mdist <- sqrt(mahalanobis(iris1, center=colMeans(iris1), cov=var(iris1)));
  plot(mdist, main=j, ylab="Mahalanobis distance (standard deviations)" );
}
# ...emphasizing the species in the titles atop each plot

####################### Mahalanobis classification ###################
# Decide species from new measurements based on which species mean
# is closest by Mahalanobis distance.
#
# First compute mean vectors and variance matrices for the 3 classes.
# Instead of copy-pasting the same three commands per species, split the
# 4 measurement columns by species and compute the statistics in one pass.
by.species <- split(iris[, 1:4], iris$Species)
means <- lapply(by.species, colMeans)  # per-species mean vectors
vars  <- lapply(by.species, var)       # per-species covariance matrices
#
# Keep the original per-species names so the later commands still work.
# Species 1: setosa
species1 <- by.species$setosa      # the 4 data for each setosa
mean1 <- means$setosa
var1  <- vars$setosa
# Species 2: versicolor
species2 <- by.species$versicolor  # the 4 data for each versicolor
mean2 <- means$versicolor
var2  <- vars$versicolor
# Species 3: virginica
species3 <- by.species$virginica   # the 4 data for each virginica
mean3 <- means$virginica
var3  <- vars$virginica
#
# Predict the species from 4 measurements: the smallest squared
# Mahalanobis distance indicates the most likely species.
new <- data.frame(Sepal.Length=4.9,Sepal.Width=3.3,Petal.Length=1.2,Petal.Width=0.2)
mahalanobis(new, mean1, var1)
## OUTPUT: 2.338388
mahalanobis(new, mean2, var2)
## OUTPUT: 106.9140
mahalanobis(new, mean3, var3)
## OUTPUT: 179.5317
# ==> probably species 1 (setosa)
#
# Second example:
new <- data.frame(Sepal.Length=6.8,Sepal.Width=3.0,Petal.Length=6.1,Petal.Width=2.0)
mahalanobis(new,mean1,var1)
## OUTPUT: 845.2354
mahalanobis(new,mean2,var2)
## OUTPUT: 22.36729
mahalanobis(new,mean3,var3)
## OUTPUT: 2.275968
# ==> probably species 3 (virginica)

##################### Principal Components Analysis #####################
# Identify how many variables are really needed to account for the variance.
data(iris)
# Run PCA on the 4 measurements of the "setosa" samples only
iris1<-iris[iris$Species=="setosa", 1:4]
# NOTE(review): princomp() uses the covariance matrix by default; since the
# 4 measurements share the same units (cm) that is reasonable here.
pca <- princomp(iris1); pca
screeplot(pca)  # bar plot of the variance explained by each component
summary(pca)
# ...which indicates that 3 components contain 97% of the variance

################### Linear Discriminant Analysis ##################
# Find a linear function that predicts class membership
library(MASS)  # contains the lda() function
lda(Species ~ ., data=iris)  # determine "Species" from all other measurements
## Output:
## Call:
## lda(Species ~ ., data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.8293776 0.02410215
## Sepal.Width 1.5344731 2.16452123
## Petal.Length -2.2012117 -0.93192121
## Petal.Width -2.8104603 2.83918785
##
## Proportion of trace:
## LD1 LD2
## 0.9912 0.0088
##
# Check how well the discriminator classifies the data that built it:
disc<-lda(Species ~ ., data=iris)
predict(disc)
# ...which produces lots of output.
# Check just the classifications:
predict(disc)$class

# Find the indices of samples that were misclassified
# (which() returns the indices at which the comparison is TRUE)
misclassified <- which( predict(disc)$class != iris$Species )
misclassified

# Find how the species were misclassified
right <- iris$Species[misclassified]           # the true species
wrong <- predict(disc)$class[misclassified]    # the predicted species

# Find the misclassification table --- truth labels the rows
freq <- table( iris$Species, predict(disc)$class )
freq                 # raw frequencies
prop.table(freq, 1)  # probabilities; Prob(class|truth) row-sums to 1

# Predict the species from 4 measurements
# First example:
new<-data.frame(Sepal.Length=4.9,Sepal.Width=3.3,Petal.Length=1.2,Petal.Width=0.2)
predict(disc, new)
## OUTPUT: $class: setosa
# ==> setosa
#
# Second example:
new<-data.frame(Sepal.Length=6.8,Sepal.Width=3.0,Petal.Length=6.1,Petal.Width=2.0)
predict(disc, new)
## OUTPUT: $class: virginica
# ==> virginica

############### Cross-validation #######################
# Train a classifier on part of the data, use that to predict the rest.
# Derive the sizes from the data instead of hard-coding 150 and 75.
n.samples <- nrow(iris)
train <- sample(seq_len(n.samples), n.samples %/% 2)  # Half the indices, no repeats
table(iris$Species[train])
## Output [your answer may differ]
## setosa versicolor virginica
## 26 24 25
classifier <- lda(Species ~ ., data=iris, subset = train)
class <- predict(classifier, iris[-train,])$class
truth <- iris$Species[-train]
table(truth,class)

# Repeat 100 times to estimate the probability of correct classification
pcc <- matrix(0,3,3);  # There are 3 species, so 3 classes
for( n in 1:100 ) {
  # Train the classifier with half of the data...
  train <- sample(seq_len(n.samples), n.samples %/% 2)
  classifier <- lda(Species ~ ., data=iris, subset = train)
  # ...to predict the remainder
  class <- predict(classifier, iris[-train,])$class
  # ...which will be compared with the true species
  truth <- iris$Species[-train]
  pcc <- pcc + table(truth,class)
}
pcc
prop.table(pcc, 1)  # Convert to probabilities; Prob(class|truth) row-sums to 1