# Example R programs and commands
# 26. Multivariate analysis; linear discriminant analysis
# All lines preceded by the "#" character are my comments.
# All other left-justified lines are my input.
# All other indented lines are the R program output.

################## Displaying multivariate data ##########################

# Load a 4-variate data set on irises (150 flowers, 3 species):
data(iris)
# ...now there is a data.frame named "iris"

### Basic data frame examination

# See the names of the variables in "iris"
names(iris)
## Output:
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"

# See what values the "Species" member (a factor) takes
levels(iris$Species)
## Output:
## [1] "setosa" "versicolor" "virginica"

# See how many data rows there are in "iris"
nrow(iris)
## Output:
## [1] 150

### Data visualizations

## Histograms

# Plot the histogram of "Sepal.Length" for "virginica"
hist(iris$Sepal.Length[iris$Species=="virginica"])

# Plot histograms of the 4 variables for "virginica"
par(mfrow=c(2,2));  # arrange 4 plots in a 2x2 matrix
iris1 <- iris[iris$Species=="virginica",1:4];  # Use first 4 columns
for( i in 1:4 )
  hist( iris1[,i], xlab=NULL, main=names(iris1)[i] );
# ...emphasizing the variates in the plot titles

# Plot histograms of the 4 variables for all 3 species
par(mfrow=c(3,4));  # arrange 12 plots in 3 1x4 matrices
for( j in levels(iris$Species) ) {
  iris1 <- iris[iris$Species==j, 1:4];  # First 4 columns of species "j"
  for( i in 1:4 ) {
    hist( iris1[,i], xlab=names(iris1)[i], main=j );
  }
}
# ...emphasizing the species in the titles atop each row of 4 plots

## Box plots

# Box plots comparing the 4 variables by species.
# plot() with a formula whose right-hand side is a factor draws box plots.
par(mfrow=c(2,2))
plot( Sepal.Length ~ Species, data=iris)
plot( Sepal.Width ~ Species, data=iris)
plot( Petal.Length ~ Species, data=iris)
plot( Petal.Width ~ Species, data=iris)

## Scatter plots

# Plot 2-D scatterplots of all (4 choose 2)=6 pairs of variables
# Create point labels "s", "v", "c" for the 3 species
# ("X" is a placeholder; every entry is overwritten in the next step)
point.labels <- rep("X", nrow(iris));
# Assign one plotting character per species
point.labels[iris$Species=="setosa"] <- "s"
point.labels[iris$Species=="virginica"] <- "v"
point.labels[iris$Species=="versicolor"] <- "c"
pairs( iris[,1:4], pch=point.labels );

# Plot 3-D scatterplots of all (4 choose 3)=4 triples of variables
library(scatterplot3d);  # after install.packages("scatterplot3d")
par(mfrow=c(2,2));
for(i in 1:4) scatterplot3d(iris[c(-i,-5)], pch=point.labels);
# ...a loop which is a tricky way of executing the equivalent 4 commands:
scatterplot3d(iris[c(1,2,3)], pch=point.labels);
scatterplot3d(iris[c(1,2,4)], pch=point.labels);
scatterplot3d(iris[c(1,3,4)], pch=point.labels);
scatterplot3d(iris[c(2,3,4)], pch=point.labels);

################# Testing for outliers by Mahalanobis distance ############
# This is one way to detect misidentified samples.

# Plot the Mahalanobis distance of each 4-variable sample
dev.off()  # resets par() and clears the previous plot.
iris1 <- iris[iris$Species=="setosa", 1:4];  # First 4 columns of "setosa"
# mahalanobis() returns SQUARED distances; take sqrt() to get distances
# in units of standard deviations.
dist2 <- mahalanobis(iris1, center=colMeans(iris1), cov=var(iris1));
plot(sqrt(dist2), main="Setosa", ylab="Mahalanobis distance (std. devs)" );
# ...emphasizing the species in the titles atop each plot

# Plot the Mahalanobis distances of the samples for all 3 species
par(mfrow=c(1,3));  # arrange 3 species in adjacent plots
for( j in levels(iris$Species) ) {
  iris1 <- iris[iris$Species==j, 1:4];  # First 4 columns of species "j"
  # Use "mdist" rather than "dist" so we do not shadow base R's dist()
  mdist <- sqrt(mahalanobis(iris1, center=colMeans(iris1), cov=var(iris1)));
  plot(mdist, main=j, ylab="Mahalanobis distance (standard deviations)" );
}
# ...emphasizing the species in the titles atop each plot

####################### Mahalanobis classification ###################
# Decide species from new measurements based on which species mean
# is closest by Mahalanobis distance.
#
# First compute mean vectors and variance matrices for the 3 classes.
# Instead of copy-pasting the same three commands per species, split the
# 4 measurement columns by species and compute the statistics in one pass.
by.species <- split(iris[, 1:4], iris$Species)
means <- lapply(by.species, colMeans)  # per-species mean vectors
vars  <- lapply(by.species, var)       # per-species covariance matrices
#
# Keep the original per-species names so the later commands still work.
# Species 1: setosa
species1 <- by.species$setosa      # the 4 data for each setosa
mean1 <- means$setosa
var1  <- vars$setosa
# Species 2: versicolor
species2 <- by.species$versicolor  # the 4 data for each versicolor
mean2 <- means$versicolor
var2  <- vars$versicolor
# Species 3: virginica
species3 <- by.species$virginica   # the 4 data for each virginica
mean3 <- means$virginica
var3  <- vars$virginica
#
# Predict the species from 4 measurements: the smallest squared
# Mahalanobis distance indicates the most likely species.
new <- data.frame(Sepal.Length=4.9,Sepal.Width=3.3,Petal.Length=1.2,Petal.Width=0.2)
mahalanobis(new, mean1, var1)
## OUTPUT: 2.338388
mahalanobis(new, mean2, var2)
## OUTPUT: 106.9140
mahalanobis(new, mean3, var3)
## OUTPUT: 179.5317
# ==> probably species 1 (setosa)
#
# Second example:
new <- data.frame(Sepal.Length=6.8,Sepal.Width=3.0,Petal.Length=6.1,Petal.Width=2.0)
mahalanobis(new,mean1,var1)
## OUTPUT: 845.2354
mahalanobis(new,mean2,var2)
## OUTPUT: 22.36729
mahalanobis(new,mean3,var3)
## OUTPUT: 2.275968
# ==> probably species 3 (virginica)

##################### Principal Components Analysis #####################
# Identify how many variables are really needed to account for the variance.
data(iris)
# Run PCA on the 4 measurements of the "setosa" samples only
iris1<-iris[iris$Species=="setosa", 1:4]
# NOTE(review): princomp() uses the covariance matrix by default; since the
# 4 measurements share the same units (cm) that is reasonable here.
pca <- princomp(iris1); pca
screeplot(pca)  # bar plot of the variance explained by each component
summary(pca)
# ...which indicates that 3 components contain 97% of the variance

################### Linear Discriminant Analysis ##################
# Find a linear function that predicts class membership
library(MASS)  # contains the lda() function
lda(Species ~ ., data=iris)  # determine "Species" from all other measurements
## Output:
## Call:
## lda(Species ~ ., data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.8293776 0.02410215
## Sepal.Width 1.5344731 2.16452123
## Petal.Length -2.2012117 -0.93192121
## Petal.Width -2.8104603 2.83918785
##
## Proportion of trace:
## LD1 LD2
## 0.9912 0.0088
##
# Check how well the discriminator classifies the data that built it:
disc<-lda(Species ~ ., data=iris)
predict(disc)
# ...which produces lots of output.
# Check just the classifications:
predict(disc)$class

# Find the indices of samples that were misclassified
# (which() returns the indices at which the comparison is TRUE)
misclassified <- which( predict(disc)$class != iris$Species )
misclassified

# Find how the species were misclassified
right <- iris$Species[misclassified]           # the true species
wrong <- predict(disc)$class[misclassified]    # the predicted species

# Find the misclassification table --- truth labels the rows
freq <- table( iris$Species, predict(disc)$class )
freq                 # raw frequencies
prop.table(freq, 1)  # probabilities; Prob(class|truth) row-sums to 1

# Predict the species from 4 measurements
# First example:
new<-data.frame(Sepal.Length=4.9,Sepal.Width=3.3,Petal.Length=1.2,Petal.Width=0.2)
predict(disc, new)
## OUTPUT: $class: setosa
# ==> setosa
#
# Second example:
new<-data.frame(Sepal.Length=6.8,Sepal.Width=3.0,Petal.Length=6.1,Petal.Width=2.0)
predict(disc, new)
## OUTPUT: $class: virginica
# ==> virginica

############### Cross-validation #######################
# Train a classifier on part of the data, use that to predict the rest.
# Derive the sizes from the data instead of hard-coding 150 and 75.
n.samples <- nrow(iris)
train <- sample(seq_len(n.samples), n.samples %/% 2)  # Half the indices, no repeats
table(iris$Species[train])
## Output [your answer may differ]
## setosa versicolor virginica
## 26 24 25
classifier <- lda(Species ~ ., data=iris, subset = train)
class <- predict(classifier, iris[-train,])$class
truth <- iris$Species[-train]
table(truth,class)

# Repeat 100 times to estimate the probability of correct classification
pcc <- matrix(0,3,3);  # There are 3 species, so 3 classes
for( n in 1:100 ) {
  # Train the classifier with half of the data...
  train <- sample(seq_len(n.samples), n.samples %/% 2)
  classifier <- lda(Species ~ ., data=iris, subset = train)
  # ...to predict the remainder
  class <- predict(classifier, iris[-train,])$class
  # ...which will be compared with the true species
  truth <- iris$Species[-train]
  pcc <- pcc + table(truth,class)
}
pcc
prop.table(pcc, 1)  # Convert to probabilities; Prob(class|truth) row-sums to 1