Math 322, Biostatistics M. V. Wickerhauser Homework 10 R commands ########## PROBLEM 1 # x1 x2 i (of 40 samples) #---------- ---------- -- data <- scan() 2.864810 -0.087901 1 2.404386 1.536228 2 0.579622 2.072845 3 -1.153178 0.435754 4 1.384192 4.185621 5 -2.040037 -0.076255 6 3.015638 2.956750 7 -0.156409 0.444964 8 0.852800 0.633167 9 -0.467125 -0.712590 10 1.744421 2.453734 11 2.108316 -2.300278 12 4.754022 1.574119 13 1.432215 1.049123 14 0.469449 1.740265 15 1.998294 3.047970 16 1.226427 1.741965 17 -0.963964 -1.826650 18 1.122402 -1.337069 19 -0.124088 0.895195 20 2.388924 -0.112396 21 0.980159 -1.113963 22 -1.170284 0.211460 23 0.739514 2.413948 24 2.157917 3.993882 25 1.189135 -0.800904 26 1.360922 1.483508 27 1.827972 0.590482 28 0.258943 1.706435 29 2.863697 -1.876853 30 1.788729 -1.266549 31 1.364329 1.972333 32 2.610398 -0.411356 33 1.041985 0.760463 34 0.090927 2.289402 35 2.124222 0.543565 36 2.167013 1.948388 37 1.367142 1.569296 38 1.074869 2.284006 39 1.873769 1.341474 40 # ...Read 120 items x123<-matrix(data,nrow=3) x12<-x123[1:2,] y<-data.frame(t(x12)) colMeans(y) # X1 X2 # 1.2288119 0.8988394 Sigma <- cov(y); Sigma # X1 X2 # X1 1.7509651 0.3100323 # X2 0.3100323 2.4573637 eigen(Sigma) # $values # [1] 2.574132 1.634197 # $vectors # [,1] [,2] # [1,] -0.9358257 -0.3524632 # [2,] 0.3524632 -0.9358257 #################### Problem 2: ## R commands for single factor MANOVA: # Original tabulation of the data: # Male: # # Alanine Aspartic Acid Tyrosine # ------- ------------- -------- 7.0 17.0 19.7 7.3 17.2 20.3 8.0 19.3 22.6 8.1 19.8 23.7 7.9 18.4 22.0 6.4 15.1 18.1 6.6 15.9 18.7 8.0 18.2 21.5 # Female: # # # Alanine Aspartic Acid Tyrosine # ------- ------------- -------- 7.3 17.4 22.5 7.7 19.8 24.9 8.2 20.2 26.1 8.3 22.6 27.5 6.4 23.4 28.1 7.1 21.3 25.8 6.4 22.1 26.9 8.6 18.8 25.5 ### One way to read the data: cut, paste, and scan() data<-scan() 7.0 17.0 19.7 7.3 17.2 20.3 8.0 19.3 22.6 8.1 19.8 23.7 7.9 18.4 22.0 6.4 15.1 18.1 6.6 15.9 18.7 8.0 18.2 21.5 7.3 17.4 22.5 7.7 19.8 24.9 8.2 20.2 26.1 8.3 22.6 27.5 6.4 23.4 28.1 7.1 21.3 25.8 6.4 22.1 26.9 8.6 18.8 25.5 aminos <- t(matrix(data,ncol=3,byrow=TRUE)); alanine <- aminos[,1]; asparticacid <- aminos[,2]; tyrosine <- aminos[,3]; ### Alternatively, read the columns of the table directly into arrays # First 8 are Male, last eight are Female: alanine <- c(7.0, 7.3, 8.0, 8.1, 7.9, 6.4, 6.6, 8.0, 7.3, 7.7, 8.2, 8.3, 6.4, 7.1, 6.4, 8.6); asparticacid <- c(17.0, 17.2, 19.3, 19.8, 18.4, 15.1, 15.9, 18.2, 17.4, 19.8, 20.2, 22.6, 23.4, 21.3, 22.1, 18.8); tyrosine <- c(19.7, 20.3, 22.6, 23.7, 22.0, 18.1, 18.7, 21.5, 22.5, 24.9, 26.1, 27.5, 28.1, 25.8, 26.9, 25.5); # Form the 3-variate measurement set: X <- cbind(alanine,asparticacid,tyrosine); # Identify the two levels of the single factor: sex <- factor(gl(2,8),labels=c("Male","Female")); # Calculate SS, DF, MS, and all variance ratios: fit <- manova(X ~ sex); # (a) Three individual ANOVAs: anova(lm(alanine~sex)) anova(lm(asparticacid~sex)) anova(lm(tyrosine~sex)) # Equivalently: summary.aov(manova(X ~ sex)) # gives univariate ANOVA tables > summary.aov(fit) Response alanine : Df Sum Sq Mean Sq F value Pr(>F) sex 1 0.0306 0.0306 0.0519 0.8232 Residuals 14 8.2687 0.5906 Response asparticacid : Df Sum Sq Mean Sq F value Pr(>F) sex 1 38.131 38.131 11.317 0.004632 ** Residuals 14 47.169 3.369 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Response tyrosine : Df Sum Sq Mean Sq F value Pr(>F) sex 1 103.531 103.531 30.257 7.814e-05 *** Residuals 14 47.904 3.422 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 # (b) MANOVA on the three variables together: # NOTE: All four methods give exactly the same p-value. # NOTE: summary(fit) recognizes that `fit' is a MANOVA fit <- manova(X ~ sex); summary(fit, test="Wilks") # Wilks' lambda test summary(fit, test="Hotelling") # Hotelling-Lawley test summary(fit, test="Pillai") # Pillai's test summary(fit, test="Roy") # Roy's largest root test > summary.manova(fit) Df Pillai approx F num Df den Df Pr(>F) sex 1 0.913 41.872 3 12 1.24e-06 *** Residuals 14 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(fit, test="W") Df Wilks approx F num Df den Df Pr(>F) sex 1 0.087 41.872 3 12 1.24e-06 *** Residuals 14 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(fit, test="H") Df Hotelling-Lawley approx F num Df den Df Pr(>F) sex 1 10.468 41.872 3 12 1.24e-06 *** Residuals 14 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(fit, test="R") Df Roy approx F num Df den Df Pr(>F) sex 1 10.468 41.872 3 12 1.24e-06 *** Residuals 14 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #################### Problem 3 ## R commands for two-factor bivariate MANOVA: # Original data tabulation: # Hormone Treatment: # -------------------------------------------- # Female Male # -------------------- -------------------- # Plasma Ca H20 loss Plasma Ca H20 loss # --------- -------- --------- -------- 16.5 76 14.5 80 18.4 71 11.0 72 12.7 64 10.8 77 14.0 66 14.3 69 12.8 69 10.0 74 # No Hormone Treatment: # -------------------------------------------- # Female Male # -------------------- -------------------- # Plasma Ca H20 loss Plasma Ca H20 loss # --------- -------- --------- -------- 39.1 71 32.0 65 26.2 70 23.8 69 21.3 63 28.8 67 35.8 59 25.0 56 40.2 60 29.3 52 ### One way to read the data: cut, paste, and scan() data<-scan() 16.5 76 14.5 80 18.4 71 11.0 72 12.7 64 10.8 77 14.0 66 14.3 69 12.8 69 10.0 74 39.1 71 32.0 65 26.2 70 23.8 69 21.3 63 28.8 67 35.8 59 25.0 56 40.2 60 29.3 52 x<-matrix(data,ncol=2,byrow=TRUE); plasmaca1 <- x[,1]; h20loss1 <- x[,2]; # Form the bi-variate measurement set: Y1 <- cbind(plasmaca1,h20loss1); # Carefully label the data by factor levels: hormonetreatment <- factor(gl(2,10), labels=c("No", "Yes")) gender1 <- factor(gl(2, 1, len=20), labels=c("Female", "Male")) # Calculate SS, DF, MS, and all variance ratios: model1 <- manova(Y1 ~ hormonetreatment * gender1); # Display the results: summary.aov(model1) # univariate ANOVA tables summary(model1, test="Wilks") # ANOVA table of Wilks' lambda summary(model1, test="Pillai") # ANOVA table of Pillai's test summary(model1, test="Roy") # ANOVA table of Roy's largest root test summary(model1, test="Hotelling") # ANOVA table of Hotelling-Lawley test ### Alternatively, read the data directly into arrays: plasmaca2 <- c( 76, 71, 64, 66, 69, 80, 72, 77, 69, 74, 71, 70, 63, 59, 60, 65, 69, 67, 56, 52); h20loss2 <- c( 16.5, 18.4, 12.7, 14.0, 12.8, 14.5, 11.0, 10.8, 14.3, 10.0, 39.1, 26.2, 21.3, 35.8, 40.2, 32.0, 23.8, 28.8, 25.0, 29.3); # Form the bi-variate measurement set: Y2 <- cbind(plasmaca2,h20loss2); # Identify the levels and arrangement of the two factors, # somewhat differently: hormonetreatment <- factor(gl(2,10), labels=c("No", "Yes")) gender2 <- factor(gl(2, 5, len=20), labels=c("Female", "Male")) # Calculate SS, DF, MS, and all variance ratios: model2 <- manova(Y2 ~ hormonetreatment * gender2); # Display the results: summary.aov(model2) # univariate ANOVA tables summary(model2, test="Wilks") # ANOVA table of Wilks' lambda summary(model2, test="Pillai") # ANOVA table of Pillai's test summary(model2, test="Roy") # ANOVA table of Roy's largest root test summary(model2, test="Hotelling") # ANOVA table of Hotelling-Lawley test #### Both data entry/labelling methods should yield the same test results > summary.aov(model) Response plasmaca : Df Sum Sq Mean Sq F value Pr(>F) hormonetreatment 1 369.8 369.8 11.7397 0.003462 ** gender 1 7.2 7.2 0.2286 0.639052 hormonetreatment:gender 1 80.0 80.0 2.5397 0.130578 Residuals 16 504.0 31.5 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Response h20loss : Df Sum Sq Mean Sq F value Pr(>F) hormonetreatment 1 1386.11 1386.11 60.5336 7.943e-07 *** gender 1 70.31 70.31 3.0706 0.09886 . hormonetreatment:gender 1 4.90 4.90 0.2140 0.64987 Residuals 16 366.37 22.90 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(model, test="Wilks") Df Wilks approx F num Df den Df Pr(>F) hormonetreatment 1 0.182 33.717 2 15 2.818e-06 *** gender 1 0.830 1.541 2 15 0.2461 hormonetreatment:gender 1 0.853 1.295 2 15 0.3027 Residuals 16 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(model, test="Pillai") Df Pillai approx F num Df den Df Pr(>F) hormonetreatment 1 0.818 33.717 2 15 2.818e-06 *** gender 1 0.170 1.541 2 15 0.2461 hormonetreatment:gender 1 0.147 1.295 2 15 0.3027 Residuals 16 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(model, test="Roy") Df Roy approx F num Df den Df Pr(>F) hormonetreatment 1 4.496 33.717 2 15 2.818e-06 *** gender 1 0.206 1.541 2 15 0.2461 hormonetreatment:gender 1 0.173 1.295 2 15 0.3027 Residuals 16 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > summary.manova(model, test="Hotelling") Df Hotelling-Lawley approx F num Df den Df Pr(>F) hormonetreatment 1 4.496 33.717 2 15 2.818e-06 gender 1 0.206 1.541 2 15 0.2461 hormonetreatment:gender 1 0.173 1.295 2 15 0.3027 Residuals 16 hormonetreatment *** gender hormonetreatment:gender Residuals --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #################### PROBLEM 4 # Reload the data from Problem 2: alanine <- c(7.0, 7.3, 8.0, 8.1, 7.9, 6.4, 6.6, 8.0, 7.3, 7.7, 8.2, 8.3, 6.4, 7.1, 6.4, 8.6); asparticacid <- c(17.0, 17.2, 19.3, 19.8, 18.4, 15.1, 15.9, 18.2, 17.4, 19.8, 20.2, 22.6, 23.4, 21.3, 22.1, 18.8); tyrosine <- c(19.7, 20.3, 22.6, 23.7, 22.0, 18.1, 18.7, 21.5, 22.5, 24.9, 26.1, 27.5, 28.1, 25.8, 26.9, 25.5); acids <- data.frame(cbind(alanine,asparticacid,tyrosine)); # Identify the two levels of the single factor. # ...first 8 are Male, last eight are Female: sex <- gl(2,8,labels=c("Male","Female")); # Build a data frame with the variables and the factor label centipede <- data.frame(cbind(acids,sex)); # (a) # Plot 2-D scatterplots of all (4 choose 2)=6 pairs of variables # Create point labels "s", "v", "c" for the 3 species point.labels <- rep("X", nrow(centipede)); point.labels[centipede$sex=="Male"] <-"x" point.labels[centipede$sex=="Female"] <-"o" pdf("hw11ex4a.pdf") pairs( centipede[,1:3], pch=point.labels); dev.off() # (b) Plot the 3-d scatterplot amino acid concentrations. # # Follow the hint: install.packages("scatterplot3d") library(scatterplot3d); pdf("hw11ex4b.pdf") scatterplot3d(centipede[,1:3],pch=point.labels) dev.off() # (c) Principal components and screeplot # pca <- princomp(centipede[,1:3]); pca; summary(pca) # ...which indicates that 2 components contain 98% of the variance pdf("hw11ex4c.pdf") screeplot(pca) dev.off() # (d) Linear discriminant analysis library(MASS) # contains the lda() function. disc<-lda(sex ~ ., data=centipede); disc new=data.frame(alanine=7.5,asparticacid=18.1,tyrosine=22.1) predict(disc, new) ## OUTPUT: ##$class ## [1] Male ## Levels: Male Female ## ## $posterior ## Male Female ## 1 0.997199 0.002800927 ## ... # # ...so it's a Male with 99.7% probability. # (e) Cross-validation # # Train a classifier on part of the data, use that to predict the rest train <- sample(1:16, 8) # Half the indices, no repeats table(centipede$sex[train]) ## Output [your answer may differ] ## Male Female ## 5 3 classifier <- lda(sex ~ ., data=centipede, subset = train) class <- predict(classifier, centipede[-train,])$class truth <- centipede$sex[-train] table(truth,class) ## OUTPUT: ## truth Male Female ## Male 3 0 ## Female 0 5 #...pretty good, no mistakes # # Repeat 100 times to estimate the probability of correct classification pcc <- matrix(0,2,2); # There are 2 sexes, so 2 classes for( n in 1:100 ) { train <- c(sample(1:8,4),sample(9:16,4)); # Train with 4 males, 4 females. classifier <- lda(sex ~ ., data=centipede, subset = train) class <- predict(classifier, centipede[-train,])$class # ...predict remainder truth <- centipede$sex[-train] # ...which will be compared with the true sex pcc <- pcc + table(truth,class) } pcc ## OUTPUT: ## class ## truth Male Female ## Male 400 0 ## Female 5 395 ## attr(,"class") ## [1] "table" # pcc/rowSums(pcc) # Convert to probabilities ## OUTPUT: ## class ## truth Male Female ## Male 1.0000 0.0000 ## Female 0.0125 0.9875 ## attr(,"class") ## [1] "table" #################### PROBLEM 5 # Reload the data from Problem 2: alanine <- c(7.0, 7.3, 8.0, 8.1, 7.9, 6.4, 6.6, 8.0, 7.3, 7.7, 8.2, 8.3, 6.4, 7.1, 6.4, 8.6); asparticacid <- c(17.0, 17.2, 19.3, 19.8, 18.4, 15.1, 15.9, 18.2, 17.4, 19.8, 20.2, 22.6, 23.4, 21.3, 22.1, 18.8); tyrosine <- c(19.7, 20.3, 22.6, 23.7, 22.0, 18.1, 18.7, 21.5, 22.5, 24.9, 26.1, 27.5, 28.1, 25.8, 26.9, 25.5); acids <- data.frame(cbind(alanine,asparticacid,tyrosine)); # Compute the means and variances for the male and female clusters: male <- acids[1:8,]; female <- acids[9:16,]; mmean <- colMeans(male); fmean <- colMeans(female); mvar <- var(male); fvar <- var(female); # Predict the gender from 3 new measurements new=data.frame(alanine=7.55,asparticacid=18.1,tyrosine=23.3) mahalanobis(new, mmean, mvar) # 64.47468 farther mahalanobis(new, fmean, fvar) # 2.380527 nearer