Math 322, Biostatistics
M. V. Wickerhauser
Homework 10 R commands


########## PROBLEM 1

#    x1          x2      i  (of 40 samples)
#---------- ----------  --
data <- scan()
  2.864810  -0.087901   1
  2.404386   1.536228   2
  0.579622   2.072845   3
 -1.153178   0.435754   4
  1.384192   4.185621   5
 -2.040037  -0.076255   6
  3.015638   2.956750   7
 -0.156409   0.444964   8
  0.852800   0.633167   9
 -0.467125  -0.712590  10
  1.744421   2.453734  11
  2.108316  -2.300278  12
  4.754022   1.574119  13
  1.432215   1.049123  14
  0.469449   1.740265  15
  1.998294   3.047970  16
  1.226427   1.741965  17
 -0.963964  -1.826650  18
  1.122402  -1.337069  19
 -0.124088   0.895195  20
  2.388924  -0.112396  21
  0.980159  -1.113963  22
 -1.170284   0.211460  23
  0.739514   2.413948  24
  2.157917   3.993882  25
  1.189135  -0.800904  26
  1.360922   1.483508  27
  1.827972   0.590482  28
  0.258943   1.706435  29
  2.863697  -1.876853  30
  1.788729  -1.266549  31
  1.364329   1.972333  32
  2.610398  -0.411356  33
  1.041985   0.760463  34
  0.090927   2.289402  35
  2.124222   0.543565  36
  2.167013   1.948388  37
  1.367142   1.569296  38
  1.074869   2.284006  39
  1.873769   1.341474  40

# ...Read 120 items
x123<-matrix(data,nrow=3)
x12<-x123[1:2,]
y<-data.frame(t(x12))
colMeans(y)
#	  X1        X2
#   1.2288119 0.8988394

Sigma <- cov(y);  Sigma

#	     X1        X2
#   X1 1.7509651 0.3100323
#   X2 0.3100323 2.4573637

eigen(Sigma)

#   $values
#   [1]  2.574132 1.634197

#   $vectors
#	      [,1]       [,2]
#   [1,] -0.9358257 -0.3524632
#   [2,]  0.3524632 -0.9358257


#################### Problem 2:
## R commands for single factor MANOVA:

# Original tabulation of the data:
#  Male:
#
#  Alanine  Aspartic Acid  Tyrosine
#  -------  -------------  --------
    7.0         17.0        19.7
    7.3         17.2        20.3
    8.0         19.3        22.6
    8.1         19.8        23.7
    7.9         18.4        22.0
    6.4         15.1        18.1
    6.6         15.9        18.7
    8.0         18.2        21.5

# Female:
#
#
#  Alanine  Aspartic Acid  Tyrosine
#  -------  -------------  --------
    7.3         17.4        22.5
    7.7         19.8        24.9
    8.2         20.2        26.1
    8.3         22.6        27.5
    6.4         23.4        28.1
    7.1         21.3        25.8
    6.4         22.1        26.9
    8.6         18.8        25.5

### One way to read the data: cut, paste, and scan()
data<-scan()
    7.0         17.0        19.7
    7.3         17.2        20.3
    8.0         19.3        22.6
    8.1         19.8        23.7
    7.9         18.4        22.0
    6.4         15.1        18.1
    6.6         15.9        18.7
    8.0         18.2        21.5
    7.3         17.4        22.5
    7.7         19.8        24.9
    8.2         20.2        26.1
    8.3         22.6        27.5
    6.4         23.4        28.1
    7.1         21.3        25.8
    6.4         22.1        26.9
    8.6         18.8        25.5

aminos <- t(matrix(data,ncol=3,byrow=TRUE));
alanine <- aminos[,1];
asparticacid <- aminos[,2];
tyrosine <- aminos[,3];

### Alternatively, read the columns of the table directly into arrays
# First 8 are Male, last eight are Female:
alanine <- c(7.0, 7.3, 8.0, 8.1, 7.9, 6.4, 6.6, 8.0,
             7.3, 7.7, 8.2, 8.3, 6.4, 7.1, 6.4, 8.6);
asparticacid <- c(17.0, 17.2, 19.3, 19.8, 18.4, 15.1, 15.9, 18.2,
                  17.4, 19.8, 20.2, 22.6, 23.4, 21.3, 22.1, 18.8);
tyrosine <- c(19.7, 20.3, 22.6, 23.7, 22.0, 18.1, 18.7, 21.5,
              22.5, 24.9, 26.1, 27.5, 28.1, 25.8, 26.9, 25.5);

# Form the 3-variate measurement set:
X <- cbind(alanine,asparticacid,tyrosine);
# Identify the two levels of the single factor:
sex <- factor(gl(2,8),labels=c("Male","Female"));
# Calculate SS, DF, MS, and all variance ratios:
fit <- manova(X ~ sex);

# (a) Three individual ANOVAs:
anova(lm(alanine~sex))
anova(lm(asparticacid~sex))
anova(lm(tyrosine~sex))
# Equivalently:
summary.aov(manova(X ~ sex))   # gives univariate ANOVA tables


> summary.aov(fit)
 Response alanine :
            Df Sum Sq Mean Sq F value Pr(>F)
sex          1 0.0306  0.0306  0.0519 0.8232
Residuals   14 8.2687  0.5906               

 Response asparticacid :
            Df Sum Sq Mean Sq F value   Pr(>F)   
sex          1 38.131  38.131  11.317 0.004632 **
Residuals   14 47.169   3.369                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

 Response tyrosine :
            Df  Sum Sq Mean Sq F value    Pr(>F)    
sex          1 103.531 103.531  30.257 7.814e-05 ***
Residuals   14  47.904   3.422                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 


# (b) MANOVA on the three variables together:
# NOTE: All four methods give exactly the same p-value.
# NOTE: summary(fit) recognizes that `fit' is a MANOVA
fit <- manova(X ~ sex);
summary(fit, test="Wilks")     # Wilks' lambda test
summary(fit, test="Hotelling") # Hotelling-Lawley test
summary(fit, test="Pillai")    # Pillai's test
summary(fit, test="Roy")       # Roy's largest root test

> summary.manova(fit)
          Df Pillai approx F num Df den Df   Pr(>F)    
sex        1  0.913   41.872      3     12 1.24e-06 ***
Residuals 14                                           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> summary.manova(fit, test="W")
          Df  Wilks approx F num Df den Df   Pr(>F)    
sex        1  0.087   41.872      3     12 1.24e-06 ***
Residuals 14                                           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> summary.manova(fit, test="H")
          Df Hotelling-Lawley approx F num Df den Df   Pr(>F)    
sex        1           10.468   41.872      3     12 1.24e-06 ***
Residuals 14                                                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> summary.manova(fit, test="R")
          Df    Roy approx F num Df den Df   Pr(>F)    
sex        1 10.468   41.872      3     12 1.24e-06 ***
Residuals 14                                           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 


#################### Problem 3

## R commands for two-factor bivariate MANOVA:

# Original data tabulation:
# Hormone Treatment:
# --------------------------------------------
#        Female                   Male
# --------------------    --------------------
# Plasma Ca   H20 loss    Plasma Ca   H20 loss
# ---------   --------    ---------   --------
    16.5         76          14.5        80
    18.4         71          11.0        72
    12.7         64          10.8        77
    14.0         66          14.3        69
    12.8         69          10.0        74

# No Hormone Treatment:
# --------------------------------------------
#        Female                   Male
# --------------------    --------------------
# Plasma Ca   H20 loss    Plasma Ca   H20 loss
# ---------   --------    ---------   --------
      39.1       71          32.0       65
      26.2       70          23.8       69
      21.3       63          28.8       67
      35.8       59          25.0       56
      40.2       60          29.3       52

### One way to read the data: cut, paste, and scan()
data<-scan()
    16.5         76          14.5        80
    18.4         71          11.0        72
    12.7         64          10.8        77
    14.0         66          14.3        69
    12.8         69          10.0        74
      39.1       71          32.0       65
      26.2       70          23.8       69
      21.3       63          28.8       67
      35.8       59          25.0       56
      40.2       60          29.3       52

x<-matrix(data,ncol=2,byrow=TRUE);
plasmaca1 <- x[,1];  h20loss1 <- x[,2];
# Form the bi-variate measurement set:
Y1 <- cbind(plasmaca1,h20loss1);
# Carefully label the data by factor levels:
hormonetreatment <- factor(gl(2,10), labels=c("No", "Yes"))
gender1 <- factor(gl(2, 1, len=20), labels=c("Female", "Male"))
# Calculate SS, DF, MS, and all variance ratios:
model1 <- manova(Y1 ~ hormonetreatment * gender1);
# Display the results:
summary.aov(model1)               # univariate ANOVA tables
summary(model1, test="Wilks")     # ANOVA table of Wilks' lambda
summary(model1, test="Pillai")    # ANOVA table of Pillai's test
summary(model1, test="Roy")       # ANOVA table of Roy's largest root test
summary(model1, test="Hotelling") # ANOVA table of Hotelling-Lawley test


### Alternatively, read the data directly into arrays:
plasmaca2 <- c( 76, 71, 64, 66, 69,
                80, 72, 77, 69, 74,
                71, 70, 63, 59, 60,
                65, 69, 67, 56, 52);

h20loss2 <- c( 16.5, 18.4, 12.7, 14.0, 12.8,
               14.5, 11.0, 10.8, 14.3, 10.0,
               39.1, 26.2, 21.3, 35.8, 40.2,
               32.0, 23.8, 28.8, 25.0, 29.3);

# Form the bi-variate measurement set:
Y2 <- cbind(plasmaca2,h20loss2);
# Identify the levels and arrangement of the two factors,
#  somewhat differently:
hormonetreatment <- factor(gl(2,10), labels=c("No", "Yes"))
gender2 <- factor(gl(2, 5, len=20), labels=c("Female", "Male"))
# Calculate SS, DF, MS, and all variance ratios:
model2 <- manova(Y2 ~ hormonetreatment * gender2);

# Display the results:
summary.aov(model2)               # univariate ANOVA tables
summary(model2, test="Wilks")     # ANOVA table of Wilks' lambda
summary(model2, test="Pillai")    # ANOVA table of Pillai's test
summary(model2, test="Roy")       # ANOVA table of Roy's largest root test
summary(model2, test="Hotelling") # ANOVA table of Hotelling-Lawley test

#### Both data entry/labelling methods should yield the same test results

> summary.aov(model)
 Response plasmaca :
                        Df Sum Sq Mean Sq F value   Pr(>F)   
hormonetreatment         1  369.8   369.8 11.7397 0.003462 **
gender                   1    7.2     7.2  0.2286 0.639052   
hormonetreatment:gender  1   80.0    80.0  2.5397 0.130578   
Residuals               16  504.0    31.5                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

 Response h20loss :
                        Df  Sum Sq Mean Sq F value    Pr(>F)    
hormonetreatment         1 1386.11 1386.11 60.5336 7.943e-07 ***
gender                   1   70.31   70.31  3.0706   0.09886 .  
hormonetreatment:gender  1    4.90    4.90  0.2140   0.64987    
Residuals               16  366.37   22.90                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

> summary.manova(model, test="Wilks")
                        Df  Wilks approx F num Df den Df    Pr(>F)    
hormonetreatment         1  0.182   33.717      2     15 2.818e-06 ***
gender                   1  0.830    1.541      2     15    0.2461    
hormonetreatment:gender  1  0.853    1.295      2     15    0.3027    
Residuals               16                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> summary.manova(model, test="Pillai")
                        Df Pillai approx F num Df den Df    Pr(>F)    
hormonetreatment         1  0.818   33.717      2     15 2.818e-06 ***
gender                   1  0.170    1.541      2     15    0.2461    
hormonetreatment:gender  1  0.147    1.295      2     15    0.3027    
Residuals               16                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> summary.manova(model, test="Roy")
                        Df    Roy approx F num Df den Df    Pr(>F)    
hormonetreatment         1  4.496   33.717      2     15 2.818e-06 ***
gender                   1  0.206    1.541      2     15    0.2461    
hormonetreatment:gender  1  0.173    1.295      2     15    0.3027    
Residuals               16                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
> summary.manova(model, test="Hotelling")
                        Df Hotelling-Lawley approx F num Df den Df    Pr(>F)
hormonetreatment         1            4.496   33.717      2     15 2.818e-06
gender                   1            0.206    1.541      2     15    0.2461
hormonetreatment:gender  1            0.173    1.295      2     15    0.3027
Residuals               16                                                  
                           
hormonetreatment        ***
gender                     
hormonetreatment:gender    
Residuals                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
 

####################  PROBLEM 4

# Reload the data from Problem 2:
alanine <- c(7.0, 7.3, 8.0, 8.1, 7.9, 6.4, 6.6, 8.0,
             7.3, 7.7, 8.2, 8.3, 6.4, 7.1, 6.4, 8.6);
asparticacid <- c(17.0, 17.2, 19.3, 19.8, 18.4, 15.1, 15.9, 18.2,
                  17.4, 19.8, 20.2, 22.6, 23.4, 21.3, 22.1, 18.8);
tyrosine <- c(19.7, 20.3, 22.6, 23.7, 22.0, 18.1, 18.7, 21.5,
              22.5, 24.9, 26.1, 27.5, 28.1, 25.8, 26.9, 25.5);
acids <- data.frame(cbind(alanine,asparticacid,tyrosine));
# Identify the two levels of the single factor.
# ...first 8 are Male, last eight are Female:
sex <- gl(2,8,labels=c("Male","Female"));
# Build a data frame with the variables and the factor label
centipede <- data.frame(cbind(acids,sex));

# (a) 
# Plot 2-D scatterplots of all (4 choose 2)=6 pairs of variables
# Create point labels "s", "v", "c" for the 3 species
point.labels <- rep("X", nrow(centipede));
point.labels[centipede$sex=="Male"] <-"x"
point.labels[centipede$sex=="Female"] <-"o"
pdf("hw11ex4a.pdf")
pairs( centipede[,1:3], pch=point.labels);
dev.off()

# (b) Plot the 3-d scatterplot amino acid concentrations.  
#
# Follow the hint:
install.packages("scatterplot3d")
library(scatterplot3d);    
pdf("hw11ex4b.pdf")
scatterplot3d(centipede[,1:3],pch=point.labels)
dev.off()

# (c) Principal components and screeplot
#
pca <- princomp(centipede[,1:3]); pca; summary(pca)
# ...which indicates that 2 components contain 98% of the variance
pdf("hw11ex4c.pdf")
screeplot(pca)
dev.off()

# (d) Linear discriminant analysis
library(MASS)    # contains the lda() function.
disc<-lda(sex ~ ., data=centipede);  disc
new=data.frame(alanine=7.5,asparticacid=18.1,tyrosine=22.1)
predict(disc, new)
## OUTPUT:
##$class
## [1] Male
## Levels: Male Female
##
## $posterior
##       Male      Female
## 1 0.997199 0.002800927
## ...
#
# ...so it's a Male with 99.7% probability.

# (e)  Cross-validation 
#
# Train a classifier on part of the data, use that to predict the rest
train <- sample(1:16, 8)      # Half the indices, no repeats
table(centipede$sex[train])
## Output [your answer may differ]
##  Male Female 
##     5      3 
classifier <- lda(sex ~ ., data=centipede, subset = train)
class <- predict(classifier, centipede[-train,])$class
truth <- centipede$sex[-train]
table(truth,class)
## OUTPUT:
## truth    Male Female
##   Male      3      0
##   Female    0      5
#...pretty good, no mistakes
#
# Repeat 100 times to estimate the probability of correct classification
pcc <- matrix(0,2,2);  # There are 2 sexes, so 2 classes
for( n in 1:100 ) {
  train <- c(sample(1:8,4),sample(9:16,4));  # Train with 4 males, 4 females.
  classifier <- lda(sex ~ ., data=centipede, subset = train)
  class <- predict(classifier, centipede[-train,])$class  # ...predict remainder
  truth <- centipede$sex[-train]  # ...which will be compared with the true sex
  pcc <- pcc + table(truth,class)
}
pcc
## OUTPUT:
##         class
## truth    Male Female
##   Male    400      0
##   Female    5    395
## attr(,"class")
## [1] "table"
#
pcc/rowSums(pcc)     # Convert to probabilities
## OUTPUT:
##         class
## truth      Male Female
##   Male   1.0000 0.0000
##   Female 0.0125 0.9875
## attr(,"class")
## [1] "table"


####################  PROBLEM 5

# Reload the data from Problem 2:
alanine <- c(7.0, 7.3, 8.0, 8.1, 7.9, 6.4, 6.6, 8.0,
             7.3, 7.7, 8.2, 8.3, 6.4, 7.1, 6.4, 8.6);
asparticacid <- c(17.0, 17.2, 19.3, 19.8, 18.4, 15.1, 15.9, 18.2,
                  17.4, 19.8, 20.2, 22.6, 23.4, 21.3, 22.1, 18.8);
tyrosine <- c(19.7, 20.3, 22.6, 23.7, 22.0, 18.1, 18.7, 21.5,
              22.5, 24.9, 26.1, 27.5, 28.1, 25.8, 26.9, 25.5);
acids <- data.frame(cbind(alanine,asparticacid,tyrosine));

# Compute the means and variances for the male and female clusters:
male <- acids[1:8,];  female <- acids[9:16,];
mmean <- colMeans(male);  fmean <- colMeans(female);
mvar <- var(male);   fvar <- var(female);

# Predict the gender from 3 new measurements
new=data.frame(alanine=7.55,asparticacid=18.1,tyrosine=23.3)
mahalanobis(new, mmean, mvar)   # 64.47468   farther
mahalanobis(new, fmean, fvar)   #  2.380527  nearer