# Math 322 Biostatistics # Lecture, April 1, 2009 # Introduction to regression analysis and analysis of variance # Geir Arne Hjelle (hjelle@math.wustl.edu) # # Analysis of variance # # Read in a dataset and look at it. We want to figure out how crop yield is # affected by the kind of fertilizer used. crop <- read.table("http://www.math.wustl.edu/~hjelle/m322/r/cropyield.txt", header=TRUE) crop summary(crop) plot(crop) # Let us first look at the variance of the data without taking the fertilizer # into account attach(crop) plot(Yield) lines(c(1, 30), rep(mean(Yield), 2)) for (i in 1:30) lines(rep(i, 2), c(Yield[i], mean(Yield))) # To quantify the variance, we calculate the sum of squares of the # differences between the observations and their grand mean. Yield - mean(Yield) (Yield - mean(Yield))^2 sum( (Yield - mean(Yield))^2 ) ss.total <- sum( (Yield - mean(Yield))^2 ) # This is the same as the unscaled variance sum( (Yield - mean(Yield))^2 ) / 29 var(Yield) # Next, we take the fertilizers into account, and carry out the same # kind of calculations tapply(Yield, Fertilizer, mean) means <- tapply(Yield, Fertilizer, mean) means plot(Yield) lines(c(1, 30), rep(mean(Yield), 2), lty = 2) lines(c(1, 10), rep(means['A'], 2)) lines(c(11, 20), rep(means['B'], 2)) lines(c(21, 30), rep(means['C'], 2)) for (i in 1:30) lines(rep(i, 2), c(Yield[i], means[Fertilizer[i]])) # To simplify our notation, let us add the fertilizer means to the original # table means[Fertilizer] crop <- cbind(crop, FMean = means[Fertilizer]) crop # The attach() function works by copying variables when it is called. We # should therefore reattach the table after we changed it. #FMean # Gives an error since FMean didn't exist when crop detach() # was attached attach(crop) FMean # We can now calculate the variance that was not explained by the fertilizers. # Again we used an unscaled sum of squares, which is called the residual sum # of squares. Yield - FMean (Yield - FMean)^2 sum( (Yield - FMean)^2 ) ss.res <- sum( (Yield - FMean)^2 ) # Let us also calculate how much variance the fertilizers did explain. The # following is the fertilizer's sum of squares. FMean - mean(Yield) (FMean - mean(Yield))^2 sum( (FMean - mean(Yield))^2 ) ss.fert <- sum( (FMean - mean(Yield))^2 ) # The two latter sum of squares will always add up to the total sum of # squares. ss.total ss.fert + ss.res # We now want to answer whether the fertilizers did explain a lot of # variation, or only as much as could be expected due to chance. This is # done by scaling each sum of squares by the number of degrees of freedom # we had in calculating it. Then we compare the scaled fertilizer's sum of # squares with the scaled residual sum of squares. ss.total / 29 ss.res / 27 ss.fert / 2 # In R analysis of variance (and regression analysis) is implemented using # the so-called General Linear Model. To set up a linear model, we use # lm() and a description of the model. In its easiest form it looks like # # Response ~ Explanatory variable (read ~ as 'explained by') lm(Yield ~ Fertilizer) # Note that the coefficients are simply the (differences in) fertilizer means means # To carry out the full analysis of variance use anova() on the linear model # we constructed above. anova(lm(Yield ~ Fertilizer)) # Let us plot the Yield centered around the grand mean, and the fertilizer # means, respectively. par(mfrow = c(2, 1)) plot(Yield - mean(Yield)) lines( c(1, 30), rep(0, 2) ) for (i in 1:30) lines(rep(i, 2), c(Yield[i] - mean(Yield), 0)) plot(Yield - FMean) lines( c(1, 30), rep(0, 2) ) for (i in 1:30) lines(rep(i, 2), c(Yield[i] - FMean[i], 0)) # Clean up detach() par(mfrow = c(1, 1)) # Regression analysis # # Read in a data set and look at it math <- read.table("http://www.math.wustl.edu/~hjelle/m322/r/mathability.txt", header=TRUE) math plot(math) summary(math) attach(math) # We want to see how math ability (MA) can be explained by age par(mfrow = c(2, 1)) plot(Age, MA) # We can again quantify the variance of math ability by the sum of squares of # the differences between the observations and the grand mean. lines(range(Age), rep(mean(MA), 2)) for (i in 1:30) lines(rep(Age[i], 2), c(MA[i], mean(MA))) MA - mean(MA) sum( (MA - mean(MA))^2 ) ss.total <- sum( (MA - mean(MA))^2 ) # To do the regression we will "rotate" the grand mean around its "center # of mass", or more precisely the mean of the ages. plot(Age, MA) lines(range(Age), rep(mean(MA), 2), lty = 2) points(mean(Age), mean(MA), col="red", pch="X") # The slope of the new line is found such that it minimizes the sum of # squares between the observed values and the values fitted to the line. # In practice, this is done using the method of least squares. For now, # we let R do the job, again using a linear model. The function abline() # plots a straight line. If we pass it the linear model, it plots the # regression line. lm(MA ~ Age) abline(lm(MA ~ Age)) # The fitted values are also available from the linear model. We add the # fitted values to our math table. lm(MA ~ Age)$fitted.values math <- cbind(math, RMA = lm(MA ~ Age)$fitted.values) detach() attach(math) # We can then illustrate how much variance is left unexplained for (i in 1:30) lines(rep(Age[i], 2), c(MA[i], RMA[i])) # Again we quantify the variance using sum of squares ss.res <- sum( (MA - RMA)^2 ) ss.reg <- sum( (RMA - mean(MA))^2 )