Example R programs and commands 14. Simple linear regression # All lines preceded by the "#" character are my comments. # All other left-justified lines are my input. # All other indented lines are the R program output. # ############################################################ # Example: # generate a linear relationship with normal errors # # NOTE: repeating this calculation may give different results due to # different values of the random errors. # x<-1:10 y<-5*x+rnorm(x,0,2.5) # y = 5x + residuals plot(x,y) lm(y~x) # linear model y=a+bx Call: lm(formula = y ~ x) Coefficients: (Intercept) x 1.749 4.828 lm(y~x+0) # linear model y= bx, known intercept a=0. Call: lm(formula = y ~ x + 0) Coefficients: x 5.078 y<-5*x+7+rnorm(x, 0,2.5) # y = 5x + 7 + residuals plot(y~x) # alternative way to specify what to plot lm(y~x) Call: lm(formula = y ~ x) Coefficients: (Intercept) x 6.630 5.048 # Multiple y-values per x-value: x<-rep(x,5) # 5 repetitions of the x-values, concatenated y<-5*x+7+rnorm(x, 0,2.5) # y = 5x + 7 + residuals plot(y~x) lm(y~x) Call: lm(formula = y ~ x) Coefficients: (Intercept) x 8.916 4.720 # ############################################################ # Extract confidence intervals for the regression coefficients: # # NOTE: repeating this calculation may give different results due to # different values of the random errors. # x<-rep(1:10,5) # 5 repetitions of the x-values, concatenated y<-5*x+7+rnorm(x, 0,2.5) # y = 5x + 7 + residuals summary(lm(y~x)) # Produces a data structure Call: lm(formula = y ~ x) Residuals: Min 1Q Median 3Q Max -6.33323 -1.55692 -0.08027 1.52508 8.26962 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 8.3770 0.8216 10.20 1.34e-13 *** x 4.7751 0.1324 36.06 < 2e-16 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 
0.1 ` ' 1 Residual standard error: 2.689 on 48 degrees of freedom Multiple R-Squared: 0.9644, Adjusted R-squared: 0.9637 F-statistic: 1300 on 1 and 48 DF, p-value: < 2.2e-16 # Summary data structure member "coefficients" is a matrix: summary(lm(y~x))$coefficients Estimate Std. Error t value Pr(>|t|) (Intercept) 8.376981 0.8216303 10.19556 1.338382e-13 x 4.775073 0.1324177 36.06068 1.999666e-36 summary(lm(y~x))$coefficients[1,1] # Coefficient `a' in `y=a+bx' [1] 8.376981 summary(lm(y~x))$coefficients[2,1] # Coefficient `b' in `y=a+bx' [1] 4.775073 summary(lm(y~x))$coefficients[2,2] # Std. Error for coefficient `b' [1] 0.1324177 # Summary data structure member "df" is a 3-vector (p, n-p, p*), where `p' is # the number of regression parameters (here p=2 for a,b) and `n-p' is # therefore the error DF. summary(lm(y~x))$df [1] 2 48 2 summary(lm(y~x))$df[2] # error DF, used in confidence intervals [1] 48 # Extract members to compute the 1-alpha = 95% confidence interval for `b': alpha<-0.05; errdf=summary(lm(y~x))$df[2]; # error DF, used in confidence intervals talpha<- qt( c(alpha/2, 1-alpha/2), errdf); b<-summary(lm(y~x))$coefficients[2,1] # Coefficient `b' in `y=a+bx' sb<-summary(lm(y~x))$coefficients[2,2] # Std. Error for coefficient `b' b+talpha*sb # 1-alpha confidence interval for coefficient `b' [1] 4.508830 5.041317 # ############################################################ # Deviation from linearity # Example 17.8, p.346 of Zar, "Biostatistical Analysis", 4th ed bp<-c(108,110,106,125,120,118,119,132,137,134,148,151,146,147,144,162,156,164,158,159) age<-c(rep(30,3), rep(40,4), rep(50,3), rep(60,5), rep(70,5)) # Compute sums of squares for the linear model `bp = a + b*age + error': aov(lm(bp~age)) Call: aov(formula = lm(bp ~ age)) Terms: age Residuals Sum of Squares 6750.289 118.911 Deg. 
of Freedom 1 18 Residual standard error: 2.570243 Estimated effects may be unbalanced # Compute sums of squares, not assuming a linear relationship: aov(bp~factor(age)) Call: aov(formula = bp ~ factor(age)) Terms: factor(age) Residuals Sum of Squares 6751.933 117.267 Deg. of Freedom 4 15 Residual standard error: 2.796029 Estimated effects may be unbalanced # The deviation-from-linearity F statistic is the ratio of variance among the # age groups that does not come from the linear relationship, to the residual # variance within the groups. Namely, read the tables above to get: # AmongGroupsSS <- 6751.933; AmongGroupsDF <- 4 RegressionSS <- 6750.289; RegressionDF <- 1 DeviationFromLinearitySS <- AmongGroupsSS - RegressionSS DeviationFromLinearityDF <- AmongGroupsDF - RegressionDF DeviationFromLinearityMS <- DeviationFromLinearitySS / DeviationFromLinearityDF ResidualSS <- 117.267; ResidualDF <- 15 ResidualMS <- ResidualSS / ResidualDF F <- DeviationFromLinearityMS / ResidualMS F [1] 0.07009645 # Since this is less than 1, do not reject H0: relationship is linear. # To compute the p-value, use: pf(F, DeviationFromLinearityDF, ResidualDF, lower.tail=FALSE) [1] 0.9750333 # ############################################################ # Prediction from a linear model # Example 17.8, p.346 of Zar, "Biostatistical Analysis", 4th ed bp<-c(108,110,106,125,120,118,119,132,137,134,148,151,146,147,144,162,156,164,158,159) age<-c(rep(30,3), rep(40,4), rep(50,3), rep(60,5), rep(70,5)) # Predict y (bp) from a new x-value (age) predict(lm(bp~age), newdata=data.frame(age=49)) [1] 132.639 # ...and with 99% confidence interval: predict(lm(bp~age), newdata=data.frame(age=49), interval="confidence", level=0.99) fit lwr upr [1,] 132.639 130.9345 134.3435