################################################################################ #### Example of Multiple Regression Model #### ################################################################################ #### By Jimin Ding, 04/18/2017 #### Source: "Linear models with R", by Faraway (2005) ######################## Example 1. Saving Data ################################ #### Saving rates in 50 countries. #### Source: "Linear models with R", by Faraway (2005, 2014) library(faraway) data(savings) ?savings head(savings) #### Exploratory Data Analysis pairs(savings)## matrix of scatter plots ## enhanced matrix of scatter plots library(car) scatterplotMatrix(savings) cor(savings) #### Regression Model regfit=lm(sr~pop15+pop75+dpi+ddpi,data=savings) summary(regfit) ## some parameters are insignificant and may be excluded from the model #### Model selection regfit2=update(regfit,.~pop15+ddpi,data=savings) summary(regfit2) ## backward selection based on p-value of F test drop1(regfit,test="F") regfit3=update(regfit,.~.-dpi) drop1(regfit3,test="F") ## forward selection regfit0=lm(sr~1,data=savings) add1(regfit0,scope=~pop15+pop75+dpi+ddpi,test="F") regfit4=update(regfit0,.~.+pop15) add1(regfit4,scope=~pop15+pop75+dpi+ddpi,test="F") regfit5=update(regfit4,.~.+ddpi) add1(regfit5,scope=~pop15+pop75+dpi+ddpi,test="F") ## AIC-based stepwise selection ?step step(regfit) #### Inference confint(regfit) confint(regfit2) ## smaller CI due to simpler model (less parameters) ## confidence set library(ellipse) plot(ellipse(regfit,c(2,3)),type="l",xlim=c(-1,0)) points(0,0) points(coef(regfit)[2], coef(regfit)[3], pch=18) abline(v=confint(regfit)[2,], lty=2) abline(h=confint(regfit)[3,], lty=2) #### Diagnostic analysis based on the selected model: regfit2 par(mfrow=c(2,2)) plot(regfit2) ## homogeniety assumption summary (lm (abs (residuals (regfit2)) ? fitted (regfit2))) ## independence assumption dwtest(regfit2) ## normality shapiro.test(residuals(regfit2)) ## potential influential points summary(influence.measures(regfit2)) ## sensitivity analysis k=which(rownames(savings)%in%c("Zambia","Libya")) regfit2D=update(regfit2, .~.,data=savings[-k,]) summary(regfit2D) ## compare with summary(regfit2) ######################## Example 2. Blood Pressure ################################ #### Data Source: https://onlinecourses.science.psu.edu/stat501/node/347 ## Blood pressure of 20 high blood pressure patients with their ## age,weight,body surface area, duration of hypertension, basal pulse, stress index bp=read.table("bp.txt",header=T) head(bp) scatterplotMatrix(bp[,-1]) cor(bp[,-1]) regfit=lm(BP~ Age+ Weight + BSA + Dur + Pulse + Stress,data=bp) summary(regfit) vif(regfit) ## weight has a large VIF, which implies it is collinear with at least one of other predictors. ## further investigate it's relation with other predictors regweight=lm(Weight~ Age + BSA + Dur + Pulse + Stress,data=bp) summary(regweight) ## strongly correrlated with BSA and Pulse, remove them and refit regression regfit2=update(regfit,.~.-BSA-Pulse) summary(regfit2) vif(regfit2) #### F-test to compare the model with and without "BSA" and "Pulse" anova(regfit,regfit2) #### Ridge Regression library(MASS) lam=seq(0, 5,0.001) regrid=lm.ridge(BP~ Age+ Weight + BSA + Dur + Pulse + Stress,data=bp,lambda=lam) ## trace plot plot(lm.ridge(BP~ Age+ Weight + BSA + Dur + Pulse + Stress,data=bp, lambda = lam),col=1:6,lty=1:6) legend('topright',c("Age","Weight"," BSA", "Dur", "Pulse"," Stress"),col=1:6,lty=1:6) abline(0,0, lty=2) select(lm.ridge(BP~ Age+ Weight + BSA + Dur + Pulse + Stress,data=bp, lambda = seq(0,1,0.0001))) #### LASSO library(lars) regfit=lm(BP~ Age+ Weight + BSA + Dur + Pulse + Stress,data=bp,x=TRUE,y=TRUE) lassofit=lars(regfit\$x,regfit\$y) plot(lassofit)